In [6]:
# Read the compressed covertype file as an RDD of text lines; each line is
# split on ',' and parsed into floats when the DataFrame is built.
rawRDD = sc.textFile('data/covtype.data.gz')

In [7]:
# Column layout of one record, in file order: ten quantitative measurements,
# four 'WE<i>' indicator columns, forty 'ST<i>' indicator columns, and the
# 'Cover' label in the last position.
column_names = [
    "Elevation",
    "Aspect",
    "Slope",
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points",
]
column_names.extend('WE{}'.format(idx) for idx in range(4))
column_names.extend('ST{}'.format(idx) for idx in range(40))
column_names.append('Cover')

In [8]:
from pyspark.sql import Row

In [9]:
# Build the `cover` DataFrame: parse every comma-separated field of a line as
# a float, then pair the values with column_names to form a Row per record.
# NOTE(review): the recorded schema lists columns alphabetically rather than
# in column_names order -- presumably because Row(**kwargs) sorts its fields
# in the Spark version used here; confirm before relying on column position.
cover = spark.createDataFrame(
    rawRDD
    .map(lambda line: [float(field) for field in line.split(',')])
    .map(lambda values: Row(**dict(zip(column_names, values)))))

In [50]:
# Show the inferred schema; every field was parsed with float(), so each
# column is reported as a double.
cover.printSchema()

root
 |-- Aspect: double (nullable = true)
 |-- Cover: double (nullable = true)
 |-- Elevation: double (nullable = true)
 |-- Hillshade_3pm: double (nullable = true)
 |-- Hillshade_9am: double (nullable = true)
 |-- Hillshade_Noon: double (nullable = true)
 |-- Horizontal_Distance_To_Fire_Points: double (nullable = true)
 |-- Horizontal_Distance_To_Hydrology: double (nullable = true)
 |-- Horizontal_Distance_To_Roadways: double (nullable = true)
 |-- ST0: double (nullable = true)
 |-- ST1: double (nullable = true)
 |-- ST10: double (nullable = true)
 |-- ST11: double (nullable = true)
 |-- ST12: double (nullable = true)
 |-- ST13: double (nullable = true)
 |-- ST14: double (nullable = true)
 |-- ST15: double (nullable = true)
 |-- ST16: double (nullable = true)
 |-- ST17: double (nullable = true)
 |-- ST18: double (nullable = true)
 |-- ST19: double (nullable = true)
 |-- ST2: double (nullable = true)
 |-- ST20: double (nullable = true)
 |-- ST21: double (nullable = true)
 |-- ST22: do

In [10]:
# Project three columns by name and preview the first five rows.
cover.select('elevation', 'aspect', 'slope').show(5)

+---------+------+-----+
|elevation|aspect|slope|
+---------+------+-----+
|   2596.0|  51.0|  3.0|
|   2590.0|  56.0|  2.0|
|   2804.0| 139.0|  9.0|
|   2785.0| 155.0| 18.0|
|   2595.0|  45.0|  2.0|
+---------+------+-----+
only showing top 5 rows



In [11]:
# select() also accepts Column objects obtained by indexing the DataFrame.
cover.select(cover['elevation'], cover['slope']).show(5)

+---------+-----+
|elevation|slope|
+---------+-----+
|   2596.0|  3.0|
|   2590.0|  2.0|
|   2804.0|  9.0|
|   2785.0| 18.0|
|   2595.0|  2.0|
+---------+-----+
only showing top 5 rows



In [12]:
# Column objects support arithmetic; the expression becomes a derived column
# named after the expression itself.
cover.select(cover['elevation'] / 10).show(5)

+----------------+
|(elevation / 10)|
+----------------+
|           259.6|
|           259.0|
|           280.4|
|           278.5|
|           259.5|
+----------------+
only showing top 5 rows



In [15]:
# Keep only rows whose slope exceeds 10, then preview elevation and slope
# for the first five of them.
(cover
 .where(cover['slope'] > 10)
 .select(cover['elevation'], cover['slope'])
 .show(5))

+---------+-----+
|elevation|slope|
+---------+-----+
|   2785.0| 18.0|
|   2886.0| 11.0|
|   2742.0| 22.0|
|   2880.0| 17.0|
|   2768.0| 23.0|
+---------+-----+
only showing top 5 rows



In [21]:
# Draw a roughly 5% sample (with replacement) of `cover` for quick exploration.
# The fixed seed keeps the sample, and every statistic computed from `d`,
# reproducible across kernel restarts.
d = cover.sample(withReplacement=True, fraction=0.05, seed=42)

In [23]:
# Mark the sampled DataFrame as cached. cache() returns the DataFrame itself,
# which is why its repr is displayed as the cell output.
d.cache()

DataFrame[Aspect: double, Cover: double, Elevation: double, Hillshade_3pm: double, Hillshade_9am: double, Hillshade_Noon: double, Horizontal_Distance_To_Fire_Points: double, Horizontal_Distance_To_Hydrology: double, Horizontal_Distance_To_Roadways: double, ST0: double, ST1: double, ST10: double, ST11: double, ST12: double, ST13: double, ST14: double, ST15: double, ST16: double, ST17: double, ST18: double, ST19: double, ST2: double, ST20: double, ST21: double, ST22: double, ST23: double, ST24: double, ST25: double, ST26: double, ST27: double, ST28: double, ST29: double, ST3: double, ST30: double, ST31: double, ST32: double, ST33: double, ST34: double, ST35: double, ST36: double, ST37: double, ST38: double, ST39: double, ST4: double, ST5: double, ST6: double, ST7: double, ST8: double, ST9: double, Slope: double, Vertical_Distance_To_Hydrology: double, WE0: double, WE1: double, WE2: double, WE3: double]

In [22]:
# Mean slope per cover type. The dict form of agg() maps a column name to the
# name of the aggregate function to apply to it.
d.groupBy('Cover').agg({'slope': 'avg'}).show()

+-----+------------------+
|Cover|        avg(slope)|
+-----+------------------+
|  7.0|14.536800785083415|
|  1.0|13.070262664165103|
|  4.0| 19.56081081081081|
|  3.0|20.445945945945947|
|  2.0|13.581538244226287|
|  6.0| 18.97245508982036|
|  5.0|17.031390134529147|
+-----+------------------+



In [41]:
# Minimum slope and maximum aspect per cover type.
# Note: a dict can carry only one aggregate per column -- a repeated key (for
# example a second 'slope' entry) silently overwrites the earlier one. To
# aggregate the same column several ways, pass Column expressions to agg()
# instead of a dict.
d.groupBy('Cover').agg({'slope': 'min', 'aspect': 'max'}).show()

+-----+-----------+----------+
|Cover|max(aspect)|min(slope)|
+-----+-----------+----------+
|  7.0|      359.0|       1.0|
|  1.0|      359.0|       0.0|
|  4.0|      354.0|       2.0|
|  3.0|      360.0|       0.0|
|  2.0|      359.0|       0.0|
|  6.0|      359.0|       1.0|
|  5.0|      357.0|       2.0|
+-----+-----------+----------+



In [51]:
# Same per-cover aggregation (min slope, max aspect), collected to the driver
# as a pandas DataFrame.
# Note: a dict can carry only one aggregate per column -- a repeated 'slope'
# key would silently overwrite the earlier entry, so only one is listed.
d.groupBy('Cover').agg({'slope': 'min', 'aspect': 'max'}).toPandas()

Unnamed: 0,Cover,max(aspect),min(slope)
0,7.0,359.0,1.0
1,1.0,359.0,0.0
2,4.0,354.0,2.0
3,3.0,360.0,0.0
4,2.0,359.0,0.0
5,6.0,359.0,1.0
6,5.0,357.0,2.0


In [53]:
# Same per-cover aggregation (min slope, max aspect), serialised to one JSON
# string per row and collected to the driver.
# Note: a dict can carry only one aggregate per column -- a repeated 'slope'
# key would silently overwrite the earlier entry, so only one is listed.
d.groupBy('Cover').agg({'slope': 'min', 'aspect': 'max'}).toJSON().collect()

[u'{"Cover":7.0,"max(aspect)":359.0,"min(slope)":1.0}',
 u'{"Cover":1.0,"max(aspect)":359.0,"min(slope)":0.0}',
 u'{"Cover":4.0,"max(aspect)":354.0,"min(slope)":2.0}',
 u'{"Cover":3.0,"max(aspect)":360.0,"min(slope)":0.0}',
 u'{"Cover":2.0,"max(aspect)":359.0,"min(slope)":0.0}',
 u'{"Cover":6.0,"max(aspect)":359.0,"min(slope)":1.0}',
 u'{"Cover":5.0,"max(aspect)":357.0,"min(slope)":2.0}']

In [25]:
# Covariance between the Slope and Elevation columns of the sample.
d.cov('Slope', 'Elevation')

-498.92929709434827

In [26]:
# Approximate quartiles of Elevation; the last argument (0.1) is the relative
# error tolerated in exchange for a cheaper computation.
d.approxQuantile('Elevation', [0.25, 0.5, 0.75], 0.1)

[2845.0, 2940.0, 3096.0]

In [27]:
# Correlation coefficient between Slope and Elevation in the sample.
d.corr('Slope', 'Elevation')

-0.238505570137636

In [29]:
# Summary statistics (count, mean, stddev, min, max) for the Slope column.
d.describe('Slope').show()

+-------+-----------------+
|summary|            Slope|
+-------+-----------------+
|  count|            29043|
|   mean|14.08559721791826|
| stddev|7.496139550768641|
|    min|              0.0|
|    max|             53.0|
+-------+-----------------+



In [32]:
# Tiny frame with a missing value (None) in each column, used to demonstrate
# null handling.
dna = spark.createDataFrame(sc.parallelize([
    Row(name='John', age=51),
    Row(name='Jim', age=None),
    Row(name=None, age=56),
]))

In [34]:
# Display the small frame, including its null entries.
dna.show()

+----+----+
| age|name|
+----+----+
|  51|John|
|null| Jim|
|  56|null|
+----+----+



In [35]:
# Drop every row that contains at least one null value.
dna.na.drop().show()

+---+----+
|age|name|
+---+----+
| 51|John|
+---+----+



In [37]:
# Replace nulls with a per-column default value instead of dropping rows.
dna.fillna({'name': 'unkown', 'age': 99}).show()

+---+------+
|age|  name|
+---+------+
| 51|  John|
| 99|   Jim|
| 56|unkown|
+---+------+



In [46]:
# Frequent items for each listed column; the result is a single row holding
# one array of values per column.
d.freqItems(['Slope', 'Elevation']).show()

+--------------------+--------------------+
|     Slope_freqItems| Elevation_freqItems|
+--------------------+--------------------+
|[23.0, 41.0, 32.0...|[2407.0, 2443.0, ...|
+--------------------+--------------------+



In [54]:
from pyspark.sql import functions as fn

In [57]:
# Approximate number of distinct Slope values in the sample.
# approx_count_distinct is the current name of this function; the camelCase
# approxCountDistinct spelling is deprecated.
d.agg(fn.approx_count_distinct(d.Slope).alias('dSlope')).show()

+------+
|dSlope|
+------+
|    48|
+------+



In [58]:
from pyspark.ml.classification import LogisticRegression

In [59]:
from pyspark.ml.feature import VectorAssembler

In [60]:
# Assemble every column except the trailing 'Cover' label into a single
# vector column named 'features'.
feature_maker = VectorAssembler(
    inputCols=column_names[:-1],
    outputCol='features',
)

In [61]:
# Run the assembler over `cover`; the result carries the assembled 'features'
# vector column configured on feature_maker.
data = feature_maker.transform(cover)

In [62]:
# Split roughly 70% / 30% into training and test sets. The fixed seed keeps
# the split -- and therefore the fitted model and its score -- reproducible.
train, test = data.randomSplit([0.7, 0.3], seed=42)

In [63]:
# Fit a logistic regression that predicts 'Cover' from the assembled
# 'features' vector.
model = LogisticRegression(
    featuresCol='features',
    labelCol='Cover',
).fit(train)

In [70]:
# Convert the fitted coefficient matrix to an array so that individual rows
# can be sliced (e.g. coefs[0, :]).
coefs = model.coefficientMatrix.toArray()

In [86]:
# Score the model on a ~5% subsample (without replacement) of the test set.
# The fixed seed keeps the evaluated rows, and the reported accuracy,
# reproducible.
result = model.evaluate(test.sample(withReplacement=False, fraction=0.05, seed=42))

In [97]:
# Accuracy: flag each row as 1.0 when the predicted class equals the true
# 'Cover' label (0.0 otherwise) and average the flags over all rows.
(result.predictions
 .select((result.predictions['Cover'] == result.predictions['prediction'])
         .cast('float')
         .alias('hit'))
 .groupBy()
 .avg('hit')
 .collect())

[Row(avg(hit)=0.7179040316021843)]

In [76]:
# Print each feature name next to its coefficient from row 0 of the matrix.
# column_names[:-1] drops the trailing 'Cover' label explicitly so the names
# line up with the assembled feature vector (previously this relied on zip()
# truncating the longer list).
# NOTE(review): the Cover labels seen in this data run from 1.0 to 7.0, so
# row 0 presumably belongs to an unused class 0 -- confirm which row is
# actually of interest.
for name, c in zip(column_names[:-1], coefs[0, :]):
    # A single pre-formatted string prints identically on Python 2 and 3.
    print('{} {}'.format(name, c))

Elevation -6.45279652042e-06
Aspect -2.13344391291e-06
Slope -4.9457617387e-05
Horizontal_Distance_To_Hydrology -9.38589203577e-07
Vertical_Distance_To_Hydrology -2.47589152659e-06
Horizontal_Distance_To_Roadways -1.31143009512e-07
Hillshade_9am -5.26193978729e-05
Hillshade_Noon -0.000100249318218
Hillshade_3pm -1.66596531436e-05
Horizontal_Distance_To_Fire_Points -1.63066153936e-07
WE0 -0.0001585296118
WE1 -0.000101902090242
WE2 -0.000469264163037
WE3 -0.00029945437528
ST0 -0.000166525313459
ST1 -0.000418309253342
ST2 -0.000378139508961
ST3 -0.00049152436918
ST4 -0.000166351018859
ST5 -0.000416545564359
ST6 -2.37201171316e-05
ST7 -4.97768046354e-05
ST8 -9.31760978478e-05
ST9 -0.000444051115756
ST10 -0.000490941239731
ST11 -5.40127631024e-05
ST12 -0.000312155905246
ST13 -0.00028726149732
ST14 -3.9215551332e-05
ST15 -0.000298699188156
ST16 -0.00105283354103
ST17 -9.9827603025e-05
ST18 -0.000102570077866
ST19 -0.000157779420483
ST20 -7.30806351459e-05
ST21 -8.96589437207e-05
ST22 -0.0001