In [12]:
from pyspark import SparkContext, SparkConf

sc.stop()
conf = SparkConf().setAppName("spark_mlib")
sc = SparkContext(conf=conf)

file = 'file:/home/hadoop/PycharmProjects/pythonProject/data/'

rawData = sc.textFile(file+'hour.csv')
header = rawData.first()
rData = rawData.filter(lambda x: x != header)

lines = rData.map(lambda x: x.split(","))
print("共有： " + str(lines.count()) + "项数据")


共有： 17379项数据


In [14]:
# 处理特征
import numpy as np

def convert_float(v):
    """处理数值，将字符串转为float"""
    return float(v)

def process_features(line):
    """处理特征，line为字段行"""
    SeasonFeature = [convert_float(value) for value in line[2]]
    ## 处理余下的特征
    Features = [convert_float(value) for value in line[4: 14]]
    
    return Features

In [17]:
# 处理预测目标值
def process_label(line):
    return float(line[-1])
process_label(lines.first())

16.0

In [19]:
# 构建LabelPoint数据
from pyspark.mllib.regression import LabeledPoint

labelpointRDD = lines.map(lambda r: LabeledPoint(process_label(r), \
                                                process_features(r)))

labelpointRDD.first()

LabeledPoint(16.0, [1.0,0.0,0.0,6.0,0.0,1.0,0.24,0.2879,0.81,0.0])

In [21]:
# 划分数据集、验证集和测试集
(trainData, validationData, testData) = labelpointRDD.randomSplit([7, 1, 2])
print("训练集样本个数："+str(trainData.count()) + " 验证集样本个数：" + str(validationData.count()) + ' 测试集样本个数：' + str(testData.count()))

# 将数据暂存在内存中，加快后续运算效率
trainData.persist()
validationData.persist()
testData.persist()

训练集样本个数：12165 验证集样本个数：1749 测试集样本个数：3465


PythonRDD[21] at RDD at PythonRDD.scala:53

In [22]:
from pyspark.mllib.tree import DecisionTree
model = DecisionTree.trainRegressor(trainData, categoricalFeaturesInfo={}, impurity="variance", maxDepth=5,
                                   minInstancesPerNode=1, minInfoGain=0.0)

In [23]:
## 使用RMSE对模型进行评估
import numpy as np
from pyspark.mllib.evaluation import RegressionMetrics

## 定义模型评估函数
def RMSE(model, validationData):
    ## 计算模型的准确率
    predict = model.predict(validationData.map(lambda p:p.features))
    ## 拼接预测值和实际值
    predict_real = predict.zip(validationData.map(lambda p: p.label))
    ## 计算均方误差
    rmse = np.sqrt(predict_real.map(lambda p: (p[0]-p[1])**2).sum() / predict_real.count())
    return rmse

## 调用函数求模型在验证集上的准确率
rmse =  RMSE(model, validationData)
print("均方误差RMSE="+str(rmse))

均方误差RMSE=117.07152560810287


In [24]:
## 创建trainEvaluateModel函数包含训练与评估功能，并计算训练评估的时间。
import time

def trainEvaluateModel(trainData, validationData, maxDepthParm, maxBinsParm, minInstancesPerNodeParm, minInfoGainParm):
    startTime = time.time()
    ## 创建并训练模型
    model = DecisionTree.trainRegressor(trainData, categoricalFeaturesInfo={}, impurity="variance", maxDepth=maxDepthParm, 
                                        maxBins=maxBinsParm, minInstancesPerNode=minInstancesPerNodeParm, minInfoGain=minInfoGainParm)
    ## 计算RMSE
    rmse = RMSE(model, validationData)
    duration = time.time() - startTime   # 持续时间
    print("训练评估：参数"+ ",  maxDepth="+str(maxDepthParm)+",  maxBins="+str(maxBinsParm)+ 
          ", minInstancesPerNode="+str(minInstancesPerNodeParm) +", minInfoGainParm="+str(minInfoGainParm)+"\n"
         "===>消耗时间="+str(duration)+",  均方误差RMSE="+str(rmse))
    return rmse, duration, maxDepthParm, maxBinsParm, minInstancesPerNodeParm, minInfoGainParm, model


In [25]:
## 评估参数 maxDepth
maxDepthList = [3,5, 10, 15, 20,25]
maxBinsList = [10]
minInstancesPerNodeList=[1]
minInfoGainList=[0.0]

## 返回结果存放至metries中
metrics = [trainEvaluateModel(trainData, validationData, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
          for maxDepth in maxDepthList
          for maxBins in maxBinsList
          for minInstancesPerNode in minInstancesPerNodeList
          for minInfoGain in minInfoGainList]

训练评估：参数,  maxDepth=3,  maxBins=10, minInstancesPerNode=1, minInfoGainParm=0.0
===>消耗时间=1.0656483173370361,  均方误差RMSE=136.68573954526283
训练评估：参数,  maxDepth=5,  maxBins=10, minInstancesPerNode=1, minInfoGainParm=0.0
===>消耗时间=0.8226358890533447,  均方误差RMSE=117.89539783083941
训练评估：参数,  maxDepth=10,  maxBins=10, minInstancesPerNode=1, minInfoGainParm=0.0
===>消耗时间=1.2596476078033447,  均方误差RMSE=92.78990914881749
训练评估：参数,  maxDepth=15,  maxBins=10, minInstancesPerNode=1, minInfoGainParm=0.0
===>消耗时间=1.8064186573028564,  均方误差RMSE=101.65582131413692
训练评估：参数,  maxDepth=20,  maxBins=10, minInstancesPerNode=1, minInfoGainParm=0.0
===>消耗时间=2.6871588230133057,  均方误差RMSE=109.8908807760606
训练评估：参数,  maxDepth=25,  maxBins=10, minInstancesPerNode=1, minInfoGainParm=0.0
===>消耗时间=3.9405529499053955,  均方误差RMSE=110.5965520504858


In [26]:
## 评估参数 maxBins
maxDepthList = [10]
maxBinsList = [5,10,15,100,200,500]
minInstancesPerNodeList=[1]
minInfoGainList=[0.0]

## 返回结果存放至metries中
metrics = [trainEvaluateModel(trainData, validationData, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
          for maxDepth in maxDepthList
          for maxBins in maxBinsList
          for minInstancesPerNode in minInstancesPerNodeList
          for minInfoGain in minInfoGainList]

训练评估：参数,  maxDepth=10,  maxBins=5, minInstancesPerNode=1, minInfoGainParm=0.0
===>消耗时间=0.9183893203735352,  均方误差RMSE=122.74985589402104
训练评估：参数,  maxDepth=10,  maxBins=10, minInstancesPerNode=1, minInfoGainParm=0.0
===>消耗时间=0.7371165752410889,  均方误差RMSE=92.78990914881749
训练评估：参数,  maxDepth=10,  maxBins=15, minInstancesPerNode=1, minInfoGainParm=0.0
===>消耗时间=0.9751396179199219,  均方误差RMSE=96.02176637860421
训练评估：参数,  maxDepth=10,  maxBins=100, minInstancesPerNode=1, minInfoGainParm=0.0
===>消耗时间=0.9091079235076904,  均方误差RMSE=82.12373108157404
训练评估：参数,  maxDepth=10,  maxBins=200, minInstancesPerNode=1, minInfoGainParm=0.0
===>消耗时间=0.9613251686096191,  均方误差RMSE=82.12373108157404
训练评估：参数,  maxDepth=10,  maxBins=500, minInstancesPerNode=1, minInfoGainParm=0.0
===>消耗时间=1.0845305919647217,  均方误差RMSE=82.12373108157404


In [27]:
## 评估参数minInstancesPerNode
maxDepthList = [10]
maxBinsList = [100]
minInstancesPerNodeList=[1,3,5,10,20,50]
minInfoGainList=[0.0]

## 返回结果存放至metries中
metrics = [trainEvaluateModel(trainData, validationData, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
          for maxDepth in maxDepthList
          for maxBins in maxBinsList
          for minInstancesPerNode in minInstancesPerNodeList
          for minInfoGain in minInfoGainList]

训练评估：参数,  maxDepth=10,  maxBins=100, minInstancesPerNode=1, minInfoGainParm=0.0
===>消耗时间=0.8883147239685059,  均方误差RMSE=82.12373108157404
训练评估：参数,  maxDepth=10,  maxBins=100, minInstancesPerNode=3, minInfoGainParm=0.0
===>消耗时间=0.6935656070709229,  均方误差RMSE=81.73722335547002
训练评估：参数,  maxDepth=10,  maxBins=100, minInstancesPerNode=5, minInfoGainParm=0.0
===>消耗时间=0.971778392791748,  均方误差RMSE=82.07496111674335
训练评估：参数,  maxDepth=10,  maxBins=100, minInstancesPerNode=10, minInfoGainParm=0.0
===>消耗时间=0.6591720581054688,  均方误差RMSE=80.851289017033
训练评估：参数,  maxDepth=10,  maxBins=100, minInstancesPerNode=20, minInfoGainParm=0.0
===>消耗时间=0.6866352558135986,  均方误差RMSE=83.04188316474618
训练评估：参数,  maxDepth=10,  maxBins=100, minInstancesPerNode=50, minInfoGainParm=0.0
===>消耗时间=0.9950821399688721,  均方误差RMSE=87.04197538973632


In [28]:
## 评估参数minInfoGain
maxDepthList = [10]
maxBinsList = [100]
minInstancesPerNodeList=[5]
minInfoGainList=[0.0,0.1,0.3,0.5,0.8]

## 返回结果存放至metries中
metrics = [trainEvaluateModel(trainData, validationData, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
          for maxDepth in maxDepthList
          for maxBins in maxBinsList
          for minInstancesPerNode in minInstancesPerNodeList
          for minInfoGain in minInfoGainList]

训练评估：参数,  maxDepth=10,  maxBins=100, minInstancesPerNode=5, minInfoGainParm=0.0
===>消耗时间=0.8632724285125732,  均方误差RMSE=82.07496111674335
训练评估：参数,  maxDepth=10,  maxBins=100, minInstancesPerNode=5, minInfoGainParm=0.1
===>消耗时间=0.9569089412689209,  均方误差RMSE=82.07496111674335
训练评估：参数,  maxDepth=10,  maxBins=100, minInstancesPerNode=5, minInfoGainParm=0.3
===>消耗时间=0.8745958805084229,  均方误差RMSE=82.07499118878282
训练评估：参数,  maxDepth=10,  maxBins=100, minInstancesPerNode=5, minInfoGainParm=0.5
===>消耗时间=0.815180778503418,  均方误差RMSE=82.07503756132985
训练评估：参数,  maxDepth=10,  maxBins=100, minInstancesPerNode=5, minInfoGainParm=0.8
===>消耗时间=0.8837261199951172,  均方误差RMSE=82.07599663019437


In [32]:
## 定义函数gridSearch网格搜索最佳参数组合

def gridSearch(trainData, validationData, maxDepthList, maxBinsList, minInstancesPerNodeList, minInfoGainList):
    metrics = [trainEvaluateModel(trainData, validationData, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
          for maxDepth in maxDepthList
          for maxBins in maxBinsList
          for minInstancesPerNode in minInstancesPerNodeList
          for minInfoGain in minInfoGainList]
    # 按照RMSE从小到大排序，返回最小RMSE的参数组合
    sorted_metics = sorted(metrics, key=lambda k:k[0], reverse=False)
    best_parameters = sorted_metics[0]
    print("最佳参数组合："+"maxDepth="+str( best_parameters[2]) + 
         ",  maxBins="+str( best_parameters[3])+",  minInstancesPerNode="+str( best_parameters[4])+
          ", minInfoGain="+str(best_parameters[5])+"\n"+
         ",  均方误差RMSE="+str( best_parameters[0]))
    return  best_parameters
## 参数组合
maxDepthList = [3, 5, 10,20,25]
maxBinsList = [30, 50,100,200]
minInstancesPerNodeList=[1,3,5,10,20]
minInfoGainList=[0.0,0.3,0.5]

## 调用函数返回最佳参数组合
best_parameters = gridSearch(trainData, validationData, maxDepthList, maxBinsList, minInstancesPerNodeList, minInfoGainList)

训练评估：参数,  maxDepth=3,  maxBins=30, minInstancesPerNode=1, minInfoGainParm=0.0
===>消耗时间=0.507742166519165,  均方误差RMSE=134.96986696651686
训练评估：参数,  maxDepth=3,  maxBins=30, minInstancesPerNode=1, minInfoGainParm=0.3
===>消耗时间=0.4815943241119385,  均方误差RMSE=134.96986696651686
训练评估：参数,  maxDepth=3,  maxBins=30, minInstancesPerNode=1, minInfoGainParm=0.5
===>消耗时间=0.4956505298614502,  均方误差RMSE=134.96986696651686
训练评估：参数,  maxDepth=3,  maxBins=30, minInstancesPerNode=3, minInfoGainParm=0.0
===>消耗时间=0.6517014503479004,  均方误差RMSE=134.96986696651686
训练评估：参数,  maxDepth=3,  maxBins=30, minInstancesPerNode=3, minInfoGainParm=0.3
===>消耗时间=0.33863282203674316,  均方误差RMSE=134.96986696651686
训练评估：参数,  maxDepth=3,  maxBins=30, minInstancesPerNode=3, minInfoGainParm=0.5
===>消耗时间=0.5241448879241943,  均方误差RMSE=134.96986696651686
训练评估：参数,  maxDepth=3,  maxBins=30, minInstancesPerNode=5, minInfoGainParm=0.0
===>消耗时间=0.4717533588409424,  均方误差RMSE=134.96986696651686
训练评估：参数,  maxDepth=3,  maxBins=30, minInstancesP

训练评估：参数,  maxDepth=5,  maxBins=30, minInstancesPerNode=1, minInfoGainParm=0.0
===>消耗时间=0.37120485305786133,  均方误差RMSE=117.3506989731227
训练评估：参数,  maxDepth=5,  maxBins=30, minInstancesPerNode=1, minInfoGainParm=0.3
===>消耗时间=0.5525491237640381,  均方误差RMSE=117.3506989731227
训练评估：参数,  maxDepth=5,  maxBins=30, minInstancesPerNode=1, minInfoGainParm=0.5
===>消耗时间=0.5638318061828613,  均方误差RMSE=117.3506989731227
训练评估：参数,  maxDepth=5,  maxBins=30, minInstancesPerNode=3, minInfoGainParm=0.0
===>消耗时间=0.6856913566589355,  均方误差RMSE=117.3506989731227
训练评估：参数,  maxDepth=5,  maxBins=30, minInstancesPerNode=3, minInfoGainParm=0.3
===>消耗时间=0.35347604751586914,  均方误差RMSE=117.3506989731227
训练评估：参数,  maxDepth=5,  maxBins=30, minInstancesPerNode=3, minInfoGainParm=0.5
===>消耗时间=0.529595136642456,  均方误差RMSE=117.3506989731227
训练评估：参数,  maxDepth=5,  maxBins=30, minInstancesPerNode=5, minInfoGainParm=0.0
===>消耗时间=0.5260641574859619,  均方误差RMSE=117.3506989731227
训练评估：参数,  maxDepth=5,  maxBins=30, minInstancesPerNode

训练评估：参数,  maxDepth=10,  maxBins=30, minInstancesPerNode=1, minInfoGainParm=0.3
===>消耗时间=0.7216148376464844,  均方误差RMSE=82.45898968787418
训练评估：参数,  maxDepth=10,  maxBins=30, minInstancesPerNode=1, minInfoGainParm=0.5
===>消耗时间=0.5562825202941895,  均方误差RMSE=82.45903584447171
训练评估：参数,  maxDepth=10,  maxBins=30, minInstancesPerNode=3, minInfoGainParm=0.0
===>消耗时间=0.8900055885314941,  均方误差RMSE=81.87599246233162
训练评估：参数,  maxDepth=10,  maxBins=30, minInstancesPerNode=3, minInfoGainParm=0.3
===>消耗时间=0.7234909534454346,  均方误差RMSE=81.87601345998226
训练评估：参数,  maxDepth=10,  maxBins=30, minInstancesPerNode=3, minInfoGainParm=0.5
===>消耗时间=0.8777132034301758,  均方误差RMSE=81.87602881385324
训练评估：参数,  maxDepth=10,  maxBins=30, minInstancesPerNode=5, minInfoGainParm=0.0
===>消耗时间=0.7053430080413818,  均方误差RMSE=82.5111085073634
训练评估：参数,  maxDepth=10,  maxBins=30, minInstancesPerNode=5, minInfoGainParm=0.3
===>消耗时间=0.5162217617034912,  均方误差RMSE=82.51113049829341
训练评估：参数,  maxDepth=10,  maxBins=30, minInstancesP

训练评估：参数,  maxDepth=20,  maxBins=30, minInstancesPerNode=1, minInfoGainParm=0.3
===>消耗时间=2.4887514114379883,  均方误差RMSE=91.27937585749561
训练评估：参数,  maxDepth=20,  maxBins=30, minInstancesPerNode=1, minInfoGainParm=0.5
===>消耗时间=2.5495829582214355,  均方误差RMSE=91.2786697705749
训练评估：参数,  maxDepth=20,  maxBins=30, minInstancesPerNode=3, minInfoGainParm=0.0
===>消耗时间=1.6604368686676025,  均方误差RMSE=86.63066341673702
训练评估：参数,  maxDepth=20,  maxBins=30, minInstancesPerNode=3, minInfoGainParm=0.3
===>消耗时间=1.5845768451690674,  均方误差RMSE=86.6302102792436
训练评估：参数,  maxDepth=20,  maxBins=30, minInstancesPerNode=3, minInfoGainParm=0.5
===>消耗时间=1.6140329837799072,  均方误差RMSE=86.62960748403854
训练评估：参数,  maxDepth=20,  maxBins=30, minInstancesPerNode=5, minInfoGainParm=0.0
===>消耗时间=1.3153154850006104,  均方误差RMSE=84.04292539498913
训练评估：参数,  maxDepth=20,  maxBins=30, minInstancesPerNode=5, minInfoGainParm=0.3
===>消耗时间=1.2961692810058594,  均方误差RMSE=84.04261635517669
训练评估：参数,  maxDepth=20,  maxBins=30, minInstancesPe

训练评估：参数,  maxDepth=25,  maxBins=30, minInstancesPerNode=1, minInfoGainParm=0.5
===>消耗时间=3.083878993988037,  均方误差RMSE=91.93829213737237
训练评估：参数,  maxDepth=25,  maxBins=30, minInstancesPerNode=3, minInfoGainParm=0.0
===>消耗时间=2.0520153045654297,  均方误差RMSE=86.99918913412849
训练评估：参数,  maxDepth=25,  maxBins=30, minInstancesPerNode=3, minInfoGainParm=0.3
===>消耗时间=1.951479196548462,  均方误差RMSE=86.99873791612063
训练评估：参数,  maxDepth=25,  maxBins=30, minInstancesPerNode=3, minInfoGainParm=0.5
===>消耗时间=2.0724706649780273,  均方误差RMSE=86.99813767438053
训练评估：参数,  maxDepth=25,  maxBins=30, minInstancesPerNode=5, minInfoGainParm=0.0
===>消耗时间=1.5335450172424316,  均方误差RMSE=84.10020101592548
训练评估：参数,  maxDepth=25,  maxBins=30, minInstancesPerNode=5, minInfoGainParm=0.3
===>消耗时间=1.5027413368225098,  均方误差RMSE=84.09989218658237
训练评估：参数,  maxDepth=25,  maxBins=30, minInstancesPerNode=5, minInfoGainParm=0.5
===>消耗时间=1.5348410606384277,  均方误差RMSE=84.10023715546937
训练评估：参数,  maxDepth=25,  maxBins=30, minInstancesPe