In [2]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler,StandardScaler

%matplotlib inline

In [3]:
from pyspark.sql import SQLContext

In [4]:
from pyspark import SparkConf, SparkContext

# Start a local Spark context for the app "MyApp" with master "local[2]".
# getOrCreate reuses an already-running context, so re-executing this cell (or
# running it in a kernel that already has a context) does not fail with
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(
    conf=SparkConf().setAppName("MyApp").setMaster("local[2]")
)

In [5]:
# SQLContext built on the SparkContext; used below to read the CSV into a DataFrame.
sqlContext=SQLContext(sc)

In [6]:
# Read the weather CSV through the spark-csv data source; the header option makes
# the first line supply the column names and inferSchema asks Spark to infer types.
df = sqlContext.read.load(
    "minute_weather.csv",
    format="com.databricks.spark.csv",
    header='true',
    inferSchema="true"
)

In [7]:
# Number of rows in the loaded DataFrame.
df.count()

1587257

In [8]:
# Column names of the loaded DataFrame.
df.columns

['rowID',
 'hpwren_timestamp',
 'air_pressure',
 'air_temp',
 'avg_wind_direction',
 'avg_wind_speed',
 'max_wind_direction',
 'max_wind_speed',
 'min_wind_direction',
 'min_wind_speed',
 'rain_accumulation',
 'rain_duration',
 'relative_humidity']

In [9]:
# Per-column summary statistics; a column whose count is below the total row
# count contains missing values.
df.describe().show()

+-------+-----------------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+--------------------+------------------+-----------------+
|summary|            rowID|      air_pressure|          air_temp|avg_wind_direction|   avg_wind_speed|max_wind_direction|    max_wind_speed|min_wind_direction|    min_wind_speed|   rain_accumulation|     rain_duration|relative_humidity|
+-------+-----------------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+--------------------+------------------+-----------------+
|  count|          1587257|           1587257|           1587257|           1586824|          1586824|           1586824|           1586824|           1586824|           1586824|             1587256|           1587256|          1587257|
|   mean|         793628.0| 916.8301266904355| 61.85

In [10]:
# Downsample: keep only the rows whose rowID is a multiple of 5 (about one row in five).
# NOTE(review): the describe() output shown further down reports roughly half as
# many rows and a max rowID divisible by 10, so it appears to come from a run with
# a different modulus -- re-run the notebook top to bottom to confirm.

filterDf=df.filter((df.rowID%5)==0)
filterDf.count()

317452

In [16]:
# Summary statistics of the sample, converted to pandas and transposed so that
# each data column becomes one row of the displayed table.
filterDf.describe().toPandas().transpose()

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
rowID,158726,793625.0,458203.9375103623,0,1587250
air_pressure,158726,916.830161410252,3.051716552831359,905.0,929.5
air_temp,158726,61.85158915363659,11.833569210641672,31.64,99.5
avg_wind_direction,158680,162.15610032770354,95.27820101905958,0.0,359.0
avg_wind_speed,158680,2.7752148979077376,2.0576239697426337,0.0,31.9
max_wind_direction,158680,163.46214393748426,92.452138538387,0.0,359.0
max_wind_speed,158680,3.4005577262415194,2.4188016208098855,0.1,36.0
min_wind_direction,158680,166.77401688933702,97.44110914784576,0.0,359.0
min_wind_speed,158680,2.1346641038569323,1.7421125052424373,0.0,31.6


In [17]:
# How many sampled rows report zero rain accumulation.
filterDf.where(filterDf["rain_accumulation"] == 0).count()

157812

In [18]:
# How many sampled rows report zero rain duration.
filterDf.where(filterDf["rain_duration"] == 0).count()

157237

In [19]:
# rain_accumulation and rain_duration are 0 in most sampled rows (see the two
# counts above), so both columns are dropped; hpwren_timestamp is dropped as well.
workingDf = (
    filterDf
    .drop('rain_accumulation')
    .drop('rain_duration')
    .drop('hpwren_timestamp')
)

In [20]:
# Columns that remain after the drops.
workingDf.columns

['rowID',
 'air_pressure',
 'air_temp',
 'avg_wind_direction',
 'avg_wind_speed',
 'max_wind_direction',
 'max_wind_speed',
 'min_wind_direction',
 'min_wind_speed',
 'relative_humidity']

In [21]:
# Drop every row that contains a missing value and show how many rows were removed.
# Note: workingDf is overwritten, so running this cell a second time reports 0.

before = workingDf.count()
workingDf = workingDf.dropna()
after = workingDf.count()
before - after

46

In [22]:
# Columns used as clustering inputs. rowID, min_wind_direction and min_wind_speed
# remain in workingDf but are not part of the feature vector.
featuresUsed = [
    'air_pressure',
    'air_temp',
    'avg_wind_direction',
    'avg_wind_speed',
    'max_wind_direction',
    'max_wind_speed',
    'relative_humidity',
]

In [23]:
# Combine the selected feature columns into one vector column, "features_unscaled".
assembler = VectorAssembler(
    inputCols=featuresUsed,
    outputCol="features_unscaled"
)

In [24]:
# Add the assembled "features_unscaled" vector column to the cleaned rows.
assembled=assembler.transform(workingDf)

In [25]:
# Standardize the assembled features: withMean=True centers each feature and
# withStd=True scales it to unit standard deviation. The result is written to
# the "features" column.
scaler = StandardScaler(
    inputCol="features_unscaled",
    outputCol="features",
    withStd=True,
    withMean=True
)
scalerModel = scaler.fit(assembled)
scaledData = scalerModel.transform(assembled)

In [26]:
# Keep only the scaled feature vector and rowID; rowID is needed for the
# subsampling on the next statement.
scaledData = scaledData.select("features", "rowID")

# Subset for cheaper experiments: rows whose rowID is divisible by 3, features only.
elbowset = (
    scaledData
    .filter(scaledData["rowID"] % 3 == 0)
    .select("features")
)

In [27]:
# Mark elbowset for caching so repeated passes over it do not recompute the
# filtering and scaling steps.
# NOTE(review): presumably this feeds repeated KMeans fits for an elbow plot; the
# cells that would use elbowset are not visible here -- confirm it is still needed.
elbowset.persist()

DataFrame[features: vector]

In [43]:
# Feature vectors only (rowID removed), cached because both the KMeans fit and
# the transform below read this DataFrame.
scaledDataFeat=scaledData.select('features')
scaledDataFeat.persist()

DataFrame[features: vector]

In [44]:
# Fit k-means with 12 clusters on the standardized feature vectors, using a fixed seed.
# NOTE(review): k=12 is presumably the value picked from an elbow analysis that is
# not shown here -- confirm, or record the justification next to this cell.
kMeans=KMeans(k=12,seed=1)
model=kMeans.fit(scaledDataFeat)

In [45]:
# Apply the fitted model to the same feature vectors it was trained on.
# Presumably this adds the cluster-assignment column (KMeans default name
# "prediction") -- confirm before relying on the column name.
transformed=model.transform(scaledDataFeat)

In [46]:
# Cluster centers in standardized units (the features were centered and scaled
# before fitting); the values in each array follow the order of featuresUsed.
model.clusterCenters()

[array([-0.16566547,  0.85448149, -1.31151111, -0.58895341, -1.16838482,
        -0.60439674, -0.6364679 ]),
 array([ 0.41691433, -0.98753852,  0.70550415, -0.54187353,  0.90244238,
        -0.52156142,  1.0843019 ]),
 array([-1.17160552, -0.89138025,  0.44220135,  1.93226779,  0.53544953,
         1.89401088,  0.93029236]),
 array([-0.80206962, -1.22440039,  0.3841726 ,  0.26485497,  0.48086319,
         0.25434775,  1.4069788 ]),
 array([-0.8539949 ,  0.24450819,  0.15438223, -0.50784111,  0.31999836,
        -0.52039833,  0.14122307]),
 array([ 0.23528561,  0.32423272,  1.88809848, -0.65298505, -1.55105546,
        -0.57787145, -0.287234  ]),
 array([ 1.38392879, -0.08927944, -1.18167841, -0.06170904, -1.0474974 ,
        -0.04146049, -0.98135819]),
 array([-0.16984403,  0.63162877,  0.41078545,  0.74631684,  0.51842117,
         0.68293388, -0.17496862]),
 array([ 1.18816163, -0.25383448, -1.15547963,  2.12139724, -1.0537412 ,
         2.23836726, -1.13402593]),
 array([ 0.35549386