In [1]:
schour.master

u'local[*]'

In [6]:
hour_df = spark.read.format("csv").option("header",True).option("delimiter",",").load("/input/hour.csv")

In [7]:
hour_df.columns

['instant',
 'dteday',
 'season',
 'yr',
 'mnth',
 'hr',
 'holiday',
 'weekday',
 'workingday',
 'weathersit',
 'temp',
 'atemp',
 'hum',
 'windspeed',
 'casual',
 'registered',
 'cnt']

In [8]:
hour_df = hour_df.drop("instant").drop("dteday").drop("yr").drop("casual").drop("registered")

In [10]:
hour_df.printSchema()

root
 |-- season: string (nullable = true)
 |-- mnth: string (nullable = true)
 |-- hr: string (nullable = true)
 |-- holiday: string (nullable = true)
 |-- weekday: string (nullable = true)
 |-- workingday: string (nullable = true)
 |-- weathersit: string (nullable = true)
 |-- temp: string (nullable = true)
 |-- atemp: string (nullable = true)
 |-- hum: string (nullable = true)
 |-- windspeed: string (nullable = true)
 |-- cnt: string (nullable = true)



In [11]:
from pyspark.sql.functions import col

In [13]:
hour_df = hour_df.select([col(column).cast("double").alias(column) for column in hour_df.columns] )

In [15]:
hour_df.dtypes

[('season', 'double'),
 ('mnth', 'double'),
 ('hr', 'double'),
 ('holiday', 'double'),
 ('weekday', 'double'),
 ('workingday', 'double'),
 ('weathersit', 'double'),
 ('temp', 'double'),
 ('atemp', 'double'),
 ('hum', 'double'),
 ('windspeed', 'double'),
 ('cnt', 'double')]

In [16]:
hour_df.show(5)

+------+----+---+-------+-------+----------+----------+----+------+----+---------+----+
|season|mnth| hr|holiday|weekday|workingday|weathersit|temp| atemp| hum|windspeed| cnt|
+------+----+---+-------+-------+----------+----------+----+------+----+---------+----+
|   1.0| 1.0|0.0|    0.0|    6.0|       0.0|       1.0|0.24|0.2879|0.81|      0.0|16.0|
|   1.0| 1.0|1.0|    0.0|    6.0|       0.0|       1.0|0.22|0.2727| 0.8|      0.0|40.0|
|   1.0| 1.0|2.0|    0.0|    6.0|       0.0|       1.0|0.22|0.2727| 0.8|      0.0|32.0|
|   1.0| 1.0|3.0|    0.0|    6.0|       0.0|       1.0|0.24|0.2879|0.75|      0.0|13.0|
|   1.0| 1.0|4.0|    0.0|    6.0|       0.0|       1.0|0.24|0.2879|0.75|      0.0| 1.0|
+------+----+---+-------+-------+----------+----------+----+------+----+---------+----+
only showing top 5 rows



In [17]:
train_df,test_df = hour_df.randomSplit([0.7,0.3])

In [18]:
train_df.cache()
test_df.cache()

DataFrame[season: double, mnth: double, hr: double, holiday: double, weekday: double, workingday: double, weathersit: double, temp: double, atemp: double, hum: double, windspeed: double, cnt: double]

In [20]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer,VectorAssembler,VectorIndexer
from pyspark.ml.regression import DecisionTreeRegressor

In [21]:
featureCols = hour_df.columns[:-1]

In [52]:
vectoAssember = VectorAssembler(inputCols=featureCols,outputCol="aFeatures")

In [53]:
vectorindexer = VectorIndexer(inputCol="aFeatures",outputCol="features",maxCategories=24)

In [28]:
dt = DecisionTreeRegressor(labelCol="cnt",featuresCol="features")

In [29]:
from pyspark.ml import Pipeline
dt_pipeline = Pipeline(stages=[vectoAssember,vectorindexer,dt])

In [54]:
dt_pipelineModel = dt_pipeline.fit(train_df)

In [55]:
dt_pipelineModel.stages[2]

DecisionTreeRegressionModel (uid=DecisionTreeRegressor_4cbc889394e538c8eeea) of depth 5 with 63 nodes

In [56]:
predicted_df = dt_pipelineModel.transform(test_df)

In [57]:
predicted_df.columns

['season',
 'mnth',
 'hr',
 'holiday',
 'weekday',
 'workingday',
 'weathersit',
 'temp',
 'atemp',
 'hum',
 'windspeed',
 'cnt',
 'aFeatures',
 'features',
 'prediction']

In [60]:
predicted_df.select("cnt","prediction").show(5)

+----+------------------+
| cnt|        prediction|
+----+------------------+
|59.0| 59.30487804878049|
|33.0| 59.30487804878049|
|91.0| 59.30487804878049|
| 7.0|37.438395415472776|
|12.0|37.438395415472776|
+----+------------------+
only showing top 5 rows



In [61]:
from pyspark.ml.evaluation import RegressionEvaluator

In [62]:
evalutor = RegressionEvaluator(labelCol="cnt",predictionCol="prediction",metricName="rmse")

In [63]:
rmse = evalutor.evaluate(predicted_df)

In [64]:
rmse

99.28882070604166

In [65]:
from pyspark.ml.tuning import ParamGridBuilder,TrainValidationSplit

In [66]:
paramGrid = ParamGridBuilder()\
.addGrid(dt.maxDepth,[5,10,15,25])\
.addGrid(dt.maxBins,[25,35,45,50])\
.build()

In [68]:
tvs = TrainValidationSplit(estimator=dt,evaluator=evalutor,estimatorParamMaps=paramGrid,trainRatio=0.8)

In [73]:
tvs_pipeline = Pipeline(stages=[vectoAssember,vectorindexer,tvs])

In [74]:
tvs_pipelineModel = tvs_pipeline.fit(train_df)

In [75]:
predictions = tvs_pipelineModel.transform(test_df)
rmse = evalutor.evaluate(predictions)
rmse

82.14427234919033

In [76]:
bestModel = tvs_pipelineModel.stages[2].bestModel
bestModel

DecisionTreeRegressionModel (uid=DecisionTreeRegressor_4cbc889394e538c8eeea) of depth 10 with 1743 nodes

In [77]:
type(bestModel)

pyspark.ml.regression.DecisionTreeRegressionModel

In [81]:
from pyspark.ml.tuning import CrossValidator


In [82]:
cv = CrossValidator(estimator=dt,evaluator=evalutor,estimatorParamMaps=paramGrid,numFolds=3)

In [83]:
cv_pipeline = Pipeline(stages=[vectoAssember,vectorindexer,cv])

In [84]:
cv_pipelineModel = cv_pipeline.fit(train_df)

In [85]:
predictions = cv_pipelineModel.transform(test_df)  

In [86]:
rmse = evalutor.evaluate(predictions)

In [87]:
rmse

82.72006422153424

In [88]:
from pyspark.ml.regression import GBTRegressor

In [89]:
gbt = GBTRegressor(labelCol="cnt",featuresCol="features")

In [90]:
gbt_pipeline = Pipeline(stages=[vectoAssember,vectorindexer,gbt])

In [91]:
parmGrid = ParamGridBuilder().addGrid(gbt.maxDepth,[5,10]).addGrid(gbt.maxBins,[25,40]).addGrid(gbt.maxIter,[10,50]).build()

In [92]:
cv = CrossValidator(estimator=gbt,evaluator=evalutor,estimatorParamMaps=paramGrid,numFolds=3)

In [93]:
cv_pipeline = Pipeline(stages=[vectoAssember,vectorindexer,cv])

In [94]:
cv_pipelineModel = cv_pipeline.fit(train_df)

In [95]:
predictions = cv_pipelineModel.transform(test_df)

In [96]:
rmse = evalutor.evaluate(predictions)

In [97]:
rmse

76.25497291952546