In [1]:
from pyspark.sql import SQLContext
from pyspark.sql import DataFrameNaFunctions
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import Binarizer
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer

In [2]:
# Build a SQLContext on top of `sc`. `sc` is not defined in this notebook;
# presumably it is the SparkContext injected by the PySpark kernel -- confirm.
sqlContext = SQLContext(sc)

# Read the daily weather CSV through the spark-csv data source, taking column
# names from the header row and letting Spark infer the column types.
df = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferSchema='true')
    .load('file:///home/cloudera/Desktop/bigdatastudy/big-data-4/daily_weather.csv')
)

# Show the column names that were loaded.
df.columns

['number',
 'air_pressure_9am',
 'air_temp_9am',
 'avg_wind_direction_9am',
 'avg_wind_speed_9am',
 'max_wind_direction_9am',
 'max_wind_speed_9am',
 'rain_accumulation_9am',
 'rain_duration_9am',
 'relative_humidity_9am',
 'relative_humidity_3pm']

In [3]:
# Columns used as model inputs: the 9am measurements, excluding both
# relative-humidity columns.
featureColumns = [
    'air_pressure_9am',
    'air_temp_9am',
    'avg_wind_direction_9am',
    'avg_wind_speed_9am',
    'max_wind_direction_9am',
    'max_wind_speed_9am',
    'rain_accumulation_9am',
    'rain_duration_9am',
]

In [4]:
# Remove the 'number' column (presumably a row identifier -- confirm); it is
# not one of the feature columns and is not used as the label.
df = df.drop("number")

In [7]:
# Discard every row that has a null in any column.
df = df.dropna()

In [8]:
# Shape of `df` as (number of rows, number of columns).
(df.count(), len(df.columns))

(1064, 10)

In [9]:
# Derive a binary 'label' column from the 3pm relative humidity: values above
# the 24.9999 threshold become 1.0, all others 0.0.
binarizer = Binarizer(
    threshold=24.9999,
    inputCol='relative_humidity_3pm',
    outputCol='label',
)

# Apply the binarizer; the result keeps the original columns and adds 'label'.
binarizerDF = binarizer.transform(df)

In [10]:
# Spot-check the binarization: print the source humidity next to the derived
# label for the first 4 rows.
binarizerDF.select(
    'relative_humidity_3pm',
    'label',
).show(n=4)

+---------------------+-----+
|relative_humidity_3pm|label|
+---------------------+-----+
|   36.160000000000494|  1.0|
|     19.4265967985621|  0.0|
|   14.460000000000045|  0.0|
|   12.742547353761848|  0.0|
+---------------------+-----+
only showing top 4 rows



In [12]:
# Combine the individual feature columns into a single vector column named
# "features", which is the column the classifier reads its inputs from.
assembler = VectorAssembler(
    inputCols=featureColumns,
    outputCol="features",
)

# Add the "features" column to the labelled data.
assembled = assembler.transform(binarizerDF)

In [29]:
# Random 80/20 train/test split; the seed is fixed so the split is repeatable.
trainingData, testData = assembled.randomSplit([0.8, 0.2], seed=13234)

In [30]:
# Row counts of the two splits as (training rows, test rows).
(trainingData.count(), testData.count())

(854, 210)

In [31]:
# Decision tree classifier configuration: reads "features", predicts "label",
# depth capped at 5, at least 20 instances per node, Gini impurity criterion.
dt = DecisionTreeClassifier(
    labelCol="label",
    featuresCol="features",
    maxDepth=5,
    minInstancesPerNode=20,
    impurity="gini",
)

In [32]:
# Wrap the classifier in a one-stage Pipeline.
pipeline = Pipeline(stages=[dt])

# Fit the pipeline on the training split to obtain the trained model.
model = pipeline.fit(trainingData)

In [33]:
# Run the fitted model over the held-out test split.
predictions = model.transform(testData)

In [36]:
# Compare predicted and actual labels, alongside the feature vectors, for the
# first 20 test rows.
predictions.select(
    "prediction",
    "label",
    "features",
).show(n=20)

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       1.0|  1.0|[908.970000000004...|
|       1.0|  1.0|[912.890000000011...|
|       1.0|  1.0|[912.990000000012...|
|       1.0|  1.0|[913.060000000003...|
|       1.0|  1.0|[913.070000000008...|
|       1.0|  1.0|[913.490000000008...|
|       0.0|  0.0|[913.970000000012...|
|       1.0|  1.0|[914.490000000003...|
|       1.0|  1.0|[915.030000000007...|
|       1.0|  1.0|[915.100000000002...|
|       0.0|  0.0|[915.101901435935...|
|       1.0|  1.0|[915.600000000005...|
|       1.0|  1.0|[915.600000000006...|
|       1.0|  1.0|[915.700000000002...|
|       0.0|  0.0|[915.710000000005...|
|       1.0|  1.0|[915.840000000005...|
|       1.0|  1.0|[915.870000000005...|
|       0.0|  0.0|[915.914435739919...|
|       0.0|  1.0|[916.040000000011...|
|       1.0|  1.0|[916.100000000008...|
+----------+-----+--------------------+
only showing top 20 rows



In [26]:
# Persist the predicted and actual labels as CSV (with a header row) through
# the spark-csv data source.
#
# mode='overwrite' makes this cell re-runnable: without an explicit mode the
# writer raises an error when the output path already exists, so a second run
# of the notebook would fail here. Any previous output at this path is replaced.
#
# NOTE(review): Spark data sources normally write a directory of part files at
# the given path rather than a single file named predictions.csv -- confirm
# this is what downstream consumers expect.
predictions.select("prediction", "label").write.save(
    'file:///home/cloudera/Desktop/bigdatastudy/big-data-4/predictions.csv',
    format='com.databricks.spark.csv',
    mode='overwrite',
    header='true',
)