In [None]:
# Read the gzip-named covtype data file as an RDD of raw text lines.
# The path is relative to the notebook's working directory.
rawRDD = sc.textFile('data/covtype.data.gz')

In [None]:
# Column layout of one parsed record: ten named measurements, then the
# four 'WE' columns, the forty 'ST' columns and finally 'Cover'.
column_names = [
    "Elevation",
    "Aspect",
    "Slope",
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points",
]
column_names += ['WE{}'.format(i) for i in range(4)]
column_names += ['ST{}'.format(i) for i in range(40)]
column_names += ['Cover']

In [None]:
from pyspark.sql import Row
from pyspark.sql import functions as fn

In [None]:
# Split each text line on commas, convert every field to float, and wrap the
# values in a Row keyed by the matching entry of column_names; Spark then
# builds the DataFrame from those Rows.
cover = spark.createDataFrame(
    rawRDD
    .map(lambda line: [float(field) for field in line.split(',')])
    .map(lambda values: Row(**dict(zip(column_names, values))))
)

In [None]:
# Print the column names and types of the DataFrame built from the parsed rows.
cover.printSchema()

In [None]:
# Preview the first five rows of three columns.
cover.select('elevation', 'aspect', 'slope').show(5)

In [None]:
# Project two columns, referenced as Column objects, and preview five rows.
(cover
 .select(cover['elevation'], cover['slope'])
 .show(5))

In [None]:
# Column objects support arithmetic: preview elevation divided by 10.
(cover
 .select(cover['elevation'] / 10)
 .show(5))

In [None]:
# Keep only the rows whose slope is greater than 10, then preview the
# elevation and slope of the first five of them.
(cover
 .where(cover['slope'] > 10)
 .select(cover['elevation'], cover['slope'])
 .show(5))

In [None]:
# Draw a roughly 5% sample of the rows (with replacement) for the exploratory
# cells that follow. The fixed seed keeps the sample, and every statistic
# computed from it, the same after a kernel restart.
d = cover.sample(True, 0.05, seed=42)

In [None]:
# Mark the sample for caching so the repeated queries against `d` can reuse it.
d.cache()

In [None]:
# Average slope for each distinct Cover value in the sample.
(d.groupBy('Cover')
  .agg({'slope': 'avg'})
  .show())

In [None]:
# Per-Cover average slope, minimum slope and maximum aspect.
# Explicit Column aggregations are required here: a dict literal cannot carry
# two entries for 'slope' (the later key replaces the earlier one), so the
# dict form would compute only the minimum and silently drop the average.
(d.groupBy('Cover')
  .agg(fn.avg('slope'), fn.min('slope'), fn.max('aspect'))
  .show())

In [None]:
# Same per-Cover aggregation, collected to the driver as a pandas DataFrame.
# Explicit Column aggregations are required here: a dict literal cannot carry
# two entries for 'slope' (the later key replaces the earlier one), so the
# dict form would compute only the minimum and silently drop the average.
(d.groupBy('Cover')
  .agg(fn.avg('slope'), fn.min('slope'), fn.max('aspect'))
  .toPandas())

In [None]:
# Same per-Cover aggregation, returned as a list of JSON strings (one per group).
# Explicit Column aggregations are required here: a dict literal cannot carry
# two entries for 'slope' (the later key replaces the earlier one), so the
# dict form would compute only the minimum and silently drop the average.
(d.groupBy('Cover')
  .agg(fn.avg('slope'), fn.min('slope'), fn.max('aspect'))
  .toJSON()
  .collect())

In [None]:
# Covariance between the Slope and Elevation columns of the sample.
d.stat.cov('Slope', 'Elevation')

In [None]:
# Approximate quartiles of Elevation; the final argument (0.1) is the
# relative error the approximation is allowed.
d.stat.approxQuantile('Elevation', [0.25, 0.5, 0.75], 0.1)

In [None]:
# Correlation coefficient between the Slope and Elevation columns of the sample.
d.stat.corr('Slope', 'Elevation')

In [None]:
# Summary statistics for the Slope column of the sample.
d.describe(['Slope']).show()

In [None]:
# Tiny hand-made DataFrame with one missing age and one missing name,
# used to demonstrate the null-handling helpers.
dna = spark.createDataFrame(
    sc.parallelize([
        Row(name='John', age=51),
        Row(name='Jim', age=None),
        Row(name=None, age=56),
    ])
)

In [None]:
# Display the toy DataFrame, including its rows with missing values.
dna.show()

In [None]:
# Drop every row that contains at least one null and show what remains.
dna.na.drop().show()

In [None]:
# Replace nulls column by column: missing names become 'unknown' and missing
# ages become 99. (The placeholder was previously misspelled 'unkown', which
# showed up verbatim in the displayed table.)
dna.na.fill({'name': 'unknown', 'age': 99}).show()

In [None]:
# Frequent values of the Slope and Elevation columns in the sample.
d.stat.freqItems(['Slope', 'Elevation']).show()

In [None]:
from pyspark.sql import functions as fn

In [None]:
# Approximate number of distinct Slope values in the sample.
# approx_count_distinct is the current name of this aggregate; the camelCase
# approxCountDistinct spelling is deprecated.
d.agg(fn.approx_count_distinct(d.Slope).alias('dSlope')).show()

In [None]:
from pyspark.ml.classification import LogisticRegression

In [None]:
from pyspark.ml.feature import VectorAssembler

In [None]:
# Assemble every column except the last entry of column_names ('Cover')
# into a single vector column named 'features'.
feature_maker = VectorAssembler(
    inputCols=column_names[:-1],
    outputCol='features',
)

In [None]:
# Apply the assembler to the full DataFrame, adding the 'features' vector column.
data = feature_maker.transform(cover)

In [None]:
# 70/30 train/test split. The fixed seed makes the split, and therefore the
# fitted model and its evaluation, reproducible across kernel restarts.
train, test = data.randomSplit([0.7, 0.3], seed=42)

In [None]:
# Fit a logistic regression on the training split, using 'Cover' as the label
# and the default 'features' column as input.
# NOTE(review): Cover is parsed as a float; confirm its values form the
# class-index range Spark expects for labels.
model = LogisticRegression(labelCol='Cover').fit(train)

In [None]:
# Convert the fitted coefficient matrix to a dense array for inspection.
# Presumably one row per class and one column per feature — confirm.
coefs = model.coefficientMatrix.toArray()

In [None]:
# Evaluate the model on a roughly 5% subsample of the test split (without
# replacement). The fixed seed keeps the evaluated rows, and so the reported
# accuracy, the same from run to run.
result = model.evaluate(test.sample(False, 0.05, seed=42))

In [None]:
# Accuracy on the evaluated rows: flag each row 1.0 when the prediction equals
# the Cover label and 0.0 otherwise, then average that 'hit' column.
(result.predictions
 .select(
     (result.predictions.Cover == result.predictions.prediction)
     .cast('float')
     .alias('hit'))
 .groupBy()
 .avg('hit')
 .collect())

In [None]:
# Print each feature name next to its coefficient from the first row of the
# coefficient matrix. The trailing 'Cover' entry of column_names is the label,
# not a feature, so it is sliced off to make the pairing explicit.
# print(...) with a pre-formatted string produces the same "name value" text
# on Python 2 and Python 3; the bare `print name, c` statement is a
# SyntaxError on Python 3.
for name, c in zip(column_names[:-1], coefs[0, :]):
    print('{} {}'.format(name, c))