In [19]:
from pyspark.sql import SQLContext
from pyspark.sql import DataFrameNaFunctions
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import Binarizer
from pyspark.ml.feature import VectorAssembler,StringIndexer,VectorIndexer

In [2]:
from pyspark import SparkConf, SparkContext

# Create the SparkContext for a local 2-thread master, or reuse the one that is
# already running. Calling the SparkContext constructor a second time in the
# same kernel (e.g. when this cell is re-run) raises
# "Cannot run multiple SparkContexts at once"; getOrCreate avoids that.
# NOTE: if a context already exists, it is returned as-is and this conf is not applied.
sc = SparkContext.getOrCreate(
    conf=SparkConf().setAppName("MyApp").setMaster("local[2]")
)

In [3]:
# SQL entry point bound to the SparkContext created above; used to read DataFrames.
sqlContext = SQLContext(sc)

In [8]:
# Load the daily weather CSV: the first row is the header and column types are
# inferred from the data.
df = (
    sqlContext.read
    .format("com.databricks.spark.csv")
    .options(header='true', inferSchema="true")
    .load("daily_weather.csv")
)

In [9]:
# Names of the 9am measurement columns that are assembled into the model's
# feature vector (one name per line, split on whitespace into a list).
featureColumns = """
    air_pressure_9am
    air_temp_9am
    avg_wind_direction_9am
    avg_wind_speed_9am
    max_wind_direction_9am
    max_wind_speed_9am
    rain_accumulation_9am
    rain_duration_9am
""".split()

In [10]:
# Discard the 'number' column: it is neither one of the feature columns nor the
# column the label is derived from.
df = df.drop('number')


In [11]:
# Remove every row that has a missing value in any column
# (how='any' is the default, spelled out here for clarity).
df = df.na.drop(how='any')

In [12]:
# (number of rows, number of columns) remaining after cleaning.
(df.count(), len(df.columns))

(1064, 10)

In [16]:
# Build a new DataFrame with a binary 'label' column derived from the 3pm
# relative humidity: values above the threshold become 1.0 (not low),
# everything else becomes 0.0 (low).
binarizer = Binarizer(
    threshold=24.99999,
    inputCol='relative_humidity_3pm',
    outputCol='label',
)
binarizeDf = binarizer.transform(df)

In [24]:
# Spot-check the binarization: raw humidity next to the derived label.
binarizeDf.select('relative_humidity_3pm', 'label').show(5)

+---------------------+-----+
|relative_humidity_3pm|label|
+---------------------+-----+
|   36.160000000000494|  1.0|
|     19.4265967985621|  0.0|
|   14.460000000000045|  0.0|
|   12.742547353761848|  0.0|
|    76.74000000000046|  1.0|
+---------------------+-----+
only showing top 5 rows



In [27]:
# Aggregate the features used for the prediction into a single vector column
# named "features".
assembler = VectorAssembler(
    inputCols=featureColumns,
    outputCol="features",
)
assembled = assembler.transform(binarizeDf)

In [33]:
# Preview the assembled feature vectors alongside their labels.
assembled.select('features', 'label').show(4)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[918.060000000008...|  1.0|
|[917.347688117709...|  0.0|
|[923.040000000008...|  0.0|
|[920.502751175919...|  0.0|
+--------------------+-----+
only showing top 4 rows



In [56]:
# Randomly split the rows into training and testing sets with weights 0.8 / 0.2.
# The fixed seed is passed so the same split can be requested again.
trainingData, testingData = assembled.randomSplit([0.8, 0.2], seed=13234)

In [57]:
# (training rows, testing rows) produced by the split.
(trainingData.count(), testingData.count())

(742, 322)

In [42]:
# Decision tree classifier configuration: reads the 'features' vector, predicts
# 'label', grows at most 5 levels deep, requires at least 20 training rows in
# every node, and scores candidate splits with Gini impurity.
dt = DecisionTreeClassifier(
    labelCol='label',
    featuresCol='features',
    maxDepth=5,
    minInstancesPerNode=20,
    impurity="gini",
)

In [44]:
# Wrap the classifier in a single-stage pipeline and fit it on the training set.
pipeline = Pipeline(stages=[dt])
model = pipeline.fit(trainingData)

In [46]:
# Apply the fitted pipeline to the held-out rows.
predictions = model.transform(testingData)

In [55]:
# Compare the true label with the predicted class for the first 20 test rows.
predictions.select('features', 'label', 'prediction').show(20)

+--------------------+-----+----------+
|            features|label|prediction|
+--------------------+-----+----------+
|[908.970000000004...|  1.0|       1.0|
|[911.450000000007...|  1.0|       0.0|
|[911.600000000008...|  1.0|       1.0|
|[911.680000000008...|  1.0|       1.0|
|[911.700000000011...|  1.0|       1.0|
|[912.640000000008...|  1.0|       1.0|
|[912.990000000012...|  1.0|       1.0|
|[913.200000000008...|  1.0|       1.0|
|[913.660000000007...|  1.0|       0.0|
|[913.700000000008...|  1.0|       1.0|
|[913.700000000010...|  1.0|       1.0|
|[914.300000000007...|  1.0|       1.0|
|[914.360000000011...|  1.0|       1.0|
|[914.600000000007...|  1.0|       1.0|
|[914.621061536155...|  0.0|       1.0|
|[914.670000000008...|  1.0|       1.0|
|[914.700000000008...|  1.0|       1.0|
|[914.800000000004...|  1.0|       1.0|
|[914.830000000006...|  0.0|       0.0|
|[914.860000000008...|  1.0|       1.0|
+--------------------+-----+----------+
only showing top 20 rows



In [66]:
# Save the (prediction, label) pairs for later evaluation.
prediction_pairs = predictions.select("prediction", "label")

# Option 1: collect to the driver and write one CSV file with pandas.
# Fine here because the test split is only a few hundred rows.
prediction_pairs.toPandas().to_csv('predictions.csv', index=False)

# Option 2: write with Spark's CSV data source.
# Spark writes a *directory* of part files at `path`, and its default save mode
# fails if the path already exists. The previous version targeted
# ".../predictions.csv", which presumably collided with the file pandas wrote
# just above (assuming the notebook runs from that folder -- TODO confirm).
# Use a distinct output directory and mode="overwrite" so this cell can be
# re-run without error.
# NOTE(review): the absolute path is machine-specific; adjust for other environments.
prediction_pairs.write.save(
    path="file:///home/cloudera/Downloads/big-data-4/predictions_spark_csv",
    format="com.databricks.spark.csv",
    header='true',
    mode="overwrite",
)