## 0.环境准备

In [1]:
import re
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import RandomForestRegressor, RandomForestRegressionModel
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.mllib.evaluation import RegressionMetrics

## 1.问题背景与数据

<div><div class="markdown-converter__text--rendered"><p>When you’ve been devastated by a serious car accident, your focus is on the things that matter the most: family, friends, and other loved ones. Pushing paper with your insurance agent is the last place you want your time or mental energy spent. This is why <a href="https://www.allstate.com/" target="_blank" rel="nofollow">Allstate</a>, a personal insurer in the United States, is continually seeking fresh ideas to improve their claims service for the over 16 million households they protect.</p>
<p><img src="https://kaggle2.blob.core.windows.net/competitions/kaggle/5325/media/allstate_banner-660x120.png" alt="" width="660"></p>
<p>Allstate is currently developing automated methods of predicting the cost, and hence severity, of claims. In this recruitment challenge, Kagglers are invited to show off their creativity and flex their technical chops by creating&nbsp;an algorithm which accurately predicts claims severity. Aspiring competitors will demonstrate&nbsp;insight into better ways to predict claims severity for the chance to be part of Allstate’s efforts to ensure a worry-free customer experience.</p>
<p>New to Kaggle? This competition is a recruiting competition, your chance to get a foot in the door with the hiring team at Allstate.</p></div></div>

In [3]:
import pandas as pd
data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,id,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,loss
0,1,A,B,A,B,A,A,A,A,B,...,0.718367,0.33506,0.3026,0.67135,0.8351,0.569745,0.594646,0.822493,0.714843,2213.18
1,2,A,B,A,A,A,A,A,A,B,...,0.438917,0.436585,0.60087,0.35127,0.43919,0.338312,0.366307,0.611431,0.304496,1283.6
2,5,A,B,A,A,B,A,A,A,B,...,0.289648,0.315545,0.2732,0.26076,0.32446,0.381398,0.373424,0.195709,0.774425,3005.09
3,10,B,B,A,B,A,A,A,A,B,...,0.440945,0.391128,0.31796,0.32128,0.44467,0.327915,0.32157,0.605077,0.602642,939.85
4,11,A,B,A,B,A,A,A,A,B,...,0.178193,0.247408,0.24564,0.22089,0.2123,0.204687,0.202213,0.246011,0.432606,2763.85


In [4]:
data.columns

Index([u'id', u'cat1', u'cat2', u'cat3', u'cat4', u'cat5', u'cat6', u'cat7',
       u'cat8', u'cat9',
       ...
       u'cont6', u'cont7', u'cont8', u'cont9', u'cont10', u'cont11', u'cont12',
       u'cont13', u'cont14', u'loss'],
      dtype='object', length=132)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188318 entries, 0 to 188317
Columns: 132 entries, id to loss
dtypes: float64(15), int64(1), object(116)
memory usage: 189.7+ MB


In [6]:
data.describe()

Unnamed: 0,id,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,loss
count,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0
mean,294135.982561,0.493861,0.507188,0.498918,0.491812,0.487428,0.490945,0.48497,0.486437,0.485506,0.498066,0.493511,0.49315,0.493138,0.495717,3037.337686
std,169336.084867,0.18764,0.207202,0.202105,0.211292,0.209027,0.205273,0.17845,0.19937,0.18166,0.185877,0.209737,0.209427,0.212777,0.222488,2904.086186
min,1.0,1.6e-05,0.001149,0.002634,0.176921,0.281143,0.012683,0.069503,0.23688,8e-05,0.0,0.035321,0.036232,0.000228,0.179722,0.67
25%,147748.25,0.34609,0.358319,0.336963,0.327354,0.281143,0.336105,0.350175,0.3128,0.35897,0.36458,0.310961,0.311661,0.315758,0.29461,1204.46
50%,294539.5,0.475784,0.555782,0.527991,0.452887,0.422268,0.440945,0.438285,0.44106,0.44145,0.46119,0.457203,0.462286,0.363547,0.407403,2115.57
75%,440680.5,0.623912,0.681761,0.634224,0.652072,0.643315,0.655021,0.591045,0.62358,0.56682,0.61459,0.678924,0.675759,0.689974,0.724623,3864.045
max,587633.0,0.984975,0.862654,0.944251,0.954297,0.983674,0.997162,1.0,0.9802,0.9954,0.99498,0.998742,0.998484,0.988494,0.844848,121012.25


## 2.spark建模与训练

In [7]:
sparkSession = (SparkSession.builder
  .appName("AllstateClaimsSeverityRandomForestRegressor")
  .getOrCreate())

In [9]:
print("读取与载入数据...")
trainInput = (sparkSession.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("train.csv")
  .cache())

testInput = (sparkSession.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("test.csv")
  .cache())
print("数据载入完毕...")

读取与载入数据...
数据载入完毕...


In [10]:
data = trainInput.withColumnRenamed("loss", "label")
[trainingData, validationData] = data.randomSplit([0.7, 0.3])
trainingData.cache()
validationData.cache()

DataFrame[id: int, cat1: string, cat2: string, cat3: string, cat4: string, cat5: string, cat6: string, cat7: string, cat8: string, cat9: string, cat10: string, cat11: string, cat12: string, cat13: string, cat14: string, cat15: string, cat16: string, cat17: string, cat18: string, cat19: string, cat20: string, cat21: string, cat22: string, cat23: string, cat24: string, cat25: string, cat26: string, cat27: string, cat28: string, cat29: string, cat30: string, cat31: string, cat32: string, cat33: string, cat34: string, cat35: string, cat36: string, cat37: string, cat38: string, cat39: string, cat40: string, cat41: string, cat42: string, cat43: string, cat44: string, cat45: string, cat46: string, cat47: string, cat48: string, cat49: string, cat50: string, cat51: string, cat52: string, cat53: string, cat54: string, cat55: string, cat56: string, cat57: string, cat58: string, cat59: string, cat60: string, cat61: string, cat62: string, cat63: string, cat64: string, cat65: string, cat66: string, 

In [11]:
testData = testInput.cache()

In [13]:
print("数据与特征处理...")
print("对类别型的特征进行处理...")
# 对类别型的列用StringIndexer或者OneHotEncoder
isCateg     = lambda c: c.startswith("cat")
categNewCol = lambda c: "idx_{0}".format(c) if (isCateg(c)) else c

stringIndexerStages = map(lambda c: StringIndexer(inputCol=c, outputCol=categNewCol(c))
    .fit(trainInput.select(c).union(testInput.select(c))), filter(isCateg, trainingData.columns))

# 干掉特别特别多类别的列(类似ID列)
removeTooManyCategs = lambda c: not re.match(r"cat(109$|110$|112$|113$|116$)", c)

# 只保留特征列
onlyFeatureCols = lambda c: not re.match(r"id|label", c)

# 用上述函数进行过滤
featureCols = map(categNewCol, 
                  filter(onlyFeatureCols, 
                         filter(removeTooManyCategs, 
                                trainingData.columns)))

# 组装特征
assembler = VectorAssembler(inputCols=featureCols, outputCol="features")
print("特征生成和组装完毕...")

构建机器学习流程...
对类别型的特征进行处理...


In [14]:
print("构建随机森林进行回归预测...")
#使用随机森林进行回归
algo = RandomForestRegressor(featuresCol="features", labelCol="label")

stages = stringIndexerStages
stages.append(assembler)
stages.append(algo)

#构建pipeline
pipeline = Pipeline(stages=stages)

构建随机森林进行回归预测...


In [18]:
print("K折交叉验证...")
numTrees = [5, 20]
maxDepth = [4, 6]
maxBins = [32]
numFolds = 3

paramGrid = (ParamGridBuilder()
  .addGrid(algo.numTrees, numTrees)
  .addGrid(algo.maxDepth, maxDepth)
  .addGrid(algo.maxBins, maxBins)
  .build())

cv = CrossValidator(estimator=pipeline,
                    evaluator=RegressionEvaluator(),
                    estimatorParamMaps=paramGrid,
                    numFolds=numFolds)

cvModel = cv.fit(trainingData)

K折交叉验证...


In [32]:
trainPredictionsAndLabels = cvModel.transform(trainingData).select("label", "prediction").rdd

validPredictionsAndLabels = cvModel.transform(validationData).select("label", "prediction").rdd

trainRegressionMetrics = RegressionMetrics(trainPredictionsAndLabels)
validRegressionMetrics = RegressionMetrics(validPredictionsAndLabels)

bestModel = cvModel.bestModel
featureImportances = bestModel.stages[-1].featureImportances.toArray()

print("TrainingData count: {0}".format(trainingData.count()))
print("ValidationData count: {0}".format(validationData.count()))
print("TestData count: {0}".format(testData.count()))
print("=====================================================================")
print("Param algoNumTrees = {0}".format(",".join(map(lambda x:str(x), numTrees))))
print("Param algoMaxDepth = {0}".format(",".join(map(lambda x:str(x), maxDepth))))
print("Param algoMaxBins = {0}".format(",".join(map(lambda x:str(x), maxBins))))
print("Param numFolds = {0}".format(numFolds))
print("=====================================================================\n")
print("Training data MSE = {0}".format(trainRegressionMetrics.meanSquaredError))
print("Training data RMSE = {0}".format(trainRegressionMetrics.rootMeanSquaredError))
print("Training data R-squared = {0}".format(trainRegressionMetrics.r2))
print("Training data MAE = {0}".format(trainRegressionMetrics.meanAbsoluteError))
print("Training data Explained variance = {0}".format(trainRegressionMetrics.explainedVariance))
print("=====================================================================\n")
print("Validation data MSE = {0}".format(validRegressionMetrics.meanSquaredError))
print("Validation data RMSE = {0}".format(validRegressionMetrics.rootMeanSquaredError))
print("Validation data R-squared = {0}".format(validRegressionMetrics.r2))
print("Validation data MAE = {0}".format(validRegressionMetrics.meanAbsoluteError))
print("Validation data Explained variance = {0}".format(validRegressionMetrics.explainedVariance))
print("=====================================================================\n")
print("特征重要度:\n{0}\n".format("\n".join(map(lambda z: "{0} = {1}".format(str(z[0]),str(z[1])), zip(featureCols, featureImportances)))))

TrainingData count: 132043
ValidationData count: 56275
TestData count: 125546
Param algoNumTrees = 5,20
Param algoMaxDepth = 4,6
Param algoMaxBins = 32
Param numFolds = 3

Training data MSE = 4410411.47687
Training data RMSE = 2100.0979684
Training data R-squared = -0.447957407799
Training data MAE = 1361.58773125
Training data Explained variance = 8523087.68135

Validation data MSE = 4544385.12372
Validation data RMSE = 2131.75634718
Validation data R-squared = -0.485534564359
Validation data MAE = 1366.44720981
Validation data Explained variance = 8223816.93435

特征重要度:
idx_cat1 = 0.00401212941648
idx_cat2 = 0.000594375474857
idx_cat3 = 0.000650983167885
idx_cat4 = 9.77918316701e-05
idx_cat5 = 3.42279819768e-05
idx_cat6 = 0.000548093133661
idx_cat7 = 0.0131252707706
idx_cat8 = 6.06947043282e-06
idx_cat9 = 0.00167677609767
idx_cat10 = 0.0108027265111
idx_cat11 = 8.25810418802e-05
idx_cat12 = 0.0475965428577
idx_cat13 = 0.00201367189686
idx_cat14 = 4.75431292666e-05
idx_cat15 = 0.0
idx_

In [34]:
cvModel.transform(testData)\
.select("id", "prediction")\
.withColumnRenamed("prediction", "loss")\
.coalesce(1)\
.write.format("csv")\
.option("header", "true")\
.save("rf_sub.csv")

### 用GBDT拟合

In [None]:
from pyspark.ml.regression import GBTRegressor, GBTRegressionModel
algo2 = GBTRegressor(featuresCol="features", labelCol="label")

stages2 = stringIndexerStages
stages2.append(assembler)
stages2.append(algo2)

pipeline2 = Pipeline(stages=stages2)

print("K折交叉验证...")
numTrees = [5, 20]
maxDepth = [4, 6]
maxBins = [32]
numFolds = 3

paramGrid = (ParamGridBuilder()
  .addGrid(algo.numTrees, numTrees)
  .addGrid(algo.maxDepth, maxDepth)
  .addGrid(algo.maxBins, maxBins)
  .build())

cv = CrossValidator(estimator=pipeline2,
                    evaluator=RegressionEvaluator(),
                    estimatorParamMaps=paramGrid,
                    numFolds=numFolds)

cvModel = cv.fit(trainingData)

In [None]:
trainPredictionsAndLabels = cvModel.transform(trainingData).select("label", "prediction").rdd

validPredictionsAndLabels = cvModel.transform(validationData).select("label", "prediction").rdd

trainRegressionMetrics = RegressionMetrics(trainPredictionsAndLabels)
validRegressionMetrics = RegressionMetrics(validPredictionsAndLabels)

bestModel = cvModel.bestModel
featureImportances = bestModel.stages[-1].featureImportances.toArray()

print("TrainingData count: {0}".format(trainingData.count()))
print("ValidationData count: {0}".format(validationData.count()))
print("TestData count: {0}".format(testData.count()))
print("=====================================================================")
print("Param algoNumTrees = {0}".format(",".join(map(lambda x:str(x), numTrees))))
print("Param algoMaxDepth = {0}".format(",".join(map(lambda x:str(x), maxDepth))))
print("Param algoMaxBins = {0}".format(",".join(map(lambda x:str(x), maxBins))))
print("Param numFolds = {0}".format(numFolds))
print("=====================================================================\n")
print("Training data MSE = {0}".format(trainRegressionMetrics.meanSquaredError))
print("Training data RMSE = {0}".format(trainRegressionMetrics.rootMeanSquaredError))
print("Training data R-squared = {0}".format(trainRegressionMetrics.r2))
print("Training data MAE = {0}".format(trainRegressionMetrics.meanAbsoluteError))
print("Training data Explained variance = {0}".format(trainRegressionMetrics.explainedVariance))
print("=====================================================================\n")
print("Validation data MSE = {0}".format(validRegressionMetrics.meanSquaredError))
print("Validation data RMSE = {0}".format(validRegressionMetrics.rootMeanSquaredError))
print("Validation data R-squared = {0}".format(validRegressionMetrics.r2))
print("Validation data MAE = {0}".format(validRegressionMetrics.meanAbsoluteError))
print("Validation data Explained variance = {0}".format(validRegressionMetrics.explainedVariance))
print("=====================================================================\n")
print("特征重要度:\n{0}\n".format("\n".join(map(lambda z: "{0} = {1}".format(str(z[0]),str(z[1])), zip(featureCols, featureImportances)))))