# PySpark with HDFS
Import libraries

In [1]:
from pyspark.sql.types import DoubleType, IntegerType
from pyspark.sql.functions import lit, udf
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier as RF, LogisticRegression as LR
import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")

Waiting for a Spark session to start...

Waiting for a Spark session to start...

In [2]:
def ith_(v, i):
    """Return element ``i`` of ``v`` as a float, or ``None`` if it cannot be read.

    Args:
        v: An indexable container (e.g. a probability vector). May be ``None``.
        i: Position of the element to extract.

    Returns:
        ``float(v[i])``, or ``None`` when ``v`` is ``None``, ``i`` is out of
        range, or the element cannot be converted to a float.
    """
    try:
        return float(v[i])
    # TypeError: v (or the element) is None / not indexable;
    # IndexError: i is out of range; ValueError: element is not parseable.
    except (ValueError, IndexError, TypeError):
        return None

# Wrap ith_ as a Spark UDF whose result column is typed as double.
ith = udf(f=ith_, returnType=DoubleType())

## Load data from HDFS
Load the data and cast the string columns to numeric types (`x1` and `x2` to double, `y` to integer).

In [3]:
# Read the CSV from HDFS, then cast each column to its numeric type.
df_load = spark.read.csv('hdfs:///circle.csv', header="true")
for column_name, spark_type in [("x1", DoubleType()), ("x2", DoubleType()), ("y", IntegerType())]:
    df_load = df_load.withColumn(column_name, df_load[column_name].cast(spark_type))

# Print the row count; the first ten rows are the cell's displayed value.
print(df_load.count())
df_load.head(10)

10000000


[Row(x1=0.8171528684921276, x2=0.24987197734186745, y=1), Row(x1=-0.5184817239482779, x2=0.7912845668866769, y=0), Row(x1=1.0296384467915982, x2=-0.1947061822599957, y=0), Row(x1=-0.20666705157261883, x2=-0.8435154054082349, y=1), Row(x1=0.39311970409126296, x2=-0.6849043070449546, y=1), Row(x1=0.1671154341000823, x2=-0.7596441718756154, y=1), Row(x1=0.124653679176714, x2=-0.7204281190897744, y=1), Row(x1=-0.7800525661517679, x2=-0.6838101624023487, y=0), Row(x1=-1.0101224339480892, x2=0.12096830580580976, y=0), Row(x1=-0.04453222401484804, x2=-1.0130259056466904, y=0)]

## Create pipeline
Assign features and dependent variable.<br>
Build random forest model

In [4]:
featureCols = ['x1', 'x2']
# Pack the feature columns into the single vector column the classifier reads.
assembler_features = VectorAssembler(inputCols=featureCols, outputCol='features')
# StringIndexer's default ordering ("frequencyDesc") gives index 0.0 to the most
# frequent value, so label 1.0 would not be guaranteed to mean y == 1 and the
# truth-table metrics could describe the wrong class. "alphabetAsc" pins the
# mapping for the 0/1 values of y: y=0 -> 0.0, y=1 -> 1.0.
labelIndexer = StringIndexer(inputCol='y', outputCol="label", stringOrderType="alphabetAsc")

dfX = [assembler_features, labelIndexer]
pipeline = Pipeline(stages=dfX)

# Fit the two preprocessing stages and add 'features' and 'label' to every row.
allData = pipeline.fit(df_load).transform(df_load)

# 80/20 train/test split with a fixed seed.
trainingData, testData = allData.randomSplit([0.8, 0.2], seed=0)
# Seed the forest too: without it the bootstrap/feature sampling differs between
# runs, so the reported metrics could not be reproduced even with a fixed split.
rf = RF(labelCol='label', featuresCol='features', numTrees=50, seed=0)
fit = rf.fit(trainingData)
transformed = fit.transform(testData)
# Keep only what the truth table needs: class probabilities and the true label.
results = transformed.select(['probability', 'label'])

print(results.count())
results.head(10)

1998347


[Row(probability=DenseVector([0.9999, 0.0001]), label=0.0), Row(probability=DenseVector([0.9999, 0.0001]), label=0.0), Row(probability=DenseVector([0.9999, 0.0001]), label=0.0), Row(probability=DenseVector([0.9998, 0.0002]), label=0.0), Row(probability=DenseVector([0.9998, 0.0002]), label=0.0), Row(probability=DenseVector([0.9999, 0.0001]), label=0.0), Row(probability=DenseVector([0.9998, 0.0002]), label=0.0), Row(probability=DenseVector([0.9999, 0.0001]), label=0.0), Row(probability=DenseVector([1.0, 0.0]), label=0.0), Row(probability=DenseVector([0.9999, 0.0001]), label=0.0)]

## Create truth table
Extract probability from Dense Vector<br>
Build the truth table with a Spark `groupBy`. This scales with the data, whereas collecting every row into a Python (pandas) DataFrame may not.

In [5]:
# Threshold the probability of label 1 at 0.5 to get a hard 0/1 prediction.
validation = results.select(['label', (ith("probability", lit(1)) > 0.5).cast('integer').alias('prediction') ])
# Count rows per (label, prediction) pair in Spark; only the summary is collected.
truth_table = validation.groupBy(['label', 'prediction']).count().orderBy(['label', 'prediction'])

tt = truth_table.toPandas()
# groupBy only emits combinations that actually occur, so a missing cell (e.g. no
# false positives) must count as 0. Summing the filtered 'count' column does that,
# where indexing with .values[0] would raise IndexError.
tp = int(tt.loc[(tt.label == 1) & (tt.prediction == 1), 'count'].sum())
fp = int(tt.loc[(tt.label == 0) & (tt.prediction == 1), 'count'].sum())
fn = int(tt.loc[(tt.label == 1) & (tt.prediction == 0), 'count'].sum())
tn = int(tt.loc[(tt.label == 0) & (tt.prediction == 0), 'count'].sum())

# A metric with a zero denominator is undefined; report NaN instead of raising.
accuracy = (tp + tn)/(tp + tn + fp + fn) if (tp + tn + fp + fn) else float('nan')
precision = tp/(tp + fp) if (tp + fp) else float('nan')
recall = tp/(tp + fn) if (tp + fn) else float('nan')
f1 = (2.0 * precision*recall)/(precision+recall) if (precision + recall) > 0 else float('nan')

print("Out of sample accuracy =", accuracy)
print("Out of sample precision =", precision)
print("Out of sample recall =", recall)
print("Out of sample F1 =", f1)
tt

Out of sample accuracy = 0.8260552346514394
Out of sample precision = 0.7529563089959204
Out of sample recall = 0.970014586956696
Out of sample F1 = 0.8478130475480792


   label  prediction   count
0    0.0           0  682523
1    0.0           1  317672
2    1.0           0   29930
3    1.0           1  968222

## Aside
To make the model a bit of a challenge, the dependent variable was a circle inside a circle.

![Sample](data_sample.png)

Nothing to do with Spark or HDFS, but here is a fun way to demonstrate how domain knowledge and improved data/transformation can be more valuable than model parameters.

In [8]:
# Engineered feature: x3 = x1^2 + x2^2, the squared distance from the origin.
df_load = df_load.withColumn("x3", (df_load["x1"]**2 + df_load["x2"]**2).cast(DoubleType()))

featureCols = ['x1', 'x2', 'x3']
assembler_features = VectorAssembler(inputCols=featureCols, outputCol='features')
# StringIndexer's default ordering ("frequencyDesc") gives index 0.0 to the most
# frequent value, so label 1.0 would not be guaranteed to mean y == 1.
# "alphabetAsc" pins the mapping for the 0/1 values of y: y=0 -> 0.0, y=1 -> 1.0.
labelIndexer = StringIndexer(inputCol='y', outputCol="label", stringOrderType="alphabetAsc")

dfX = [assembler_features, labelIndexer]
pipeline = Pipeline(stages=dfX)

# Fit the two preprocessing stages and add 'features' and 'label' to every row.
allData = pipeline.fit(df_load).transform(df_load)

# 80/20 train/test split with a fixed seed.
trainingData, testData = allData.randomSplit([0.8, 0.2], seed=0)
lr = LR(labelCol='label', featuresCol='features')
fit = lr.fit(trainingData)
transformed = fit.transform(testData)
# Keep only what the truth table needs: class probabilities and the true label.
results = transformed.select(['probability', 'label'])

# Threshold the probability of label 1 at 0.5 to get a hard 0/1 prediction.
validation = results.select(['label', (ith("probability", lit(1)) > 0.5).cast('integer').alias('prediction') ])
# Count rows per (label, prediction) pair in Spark; only the summary is collected.
truth_table = validation.groupBy(['label', 'prediction']).count().orderBy(['label', 'prediction'])

tt = truth_table.toPandas()
# groupBy only emits combinations that actually occur, so a missing cell (e.g. no
# false positives) must count as 0. Summing the filtered 'count' column does that,
# where indexing with .values[0] would raise IndexError.
tp = int(tt.loc[(tt.label == 1) & (tt.prediction == 1), 'count'].sum())
fp = int(tt.loc[(tt.label == 0) & (tt.prediction == 1), 'count'].sum())
fn = int(tt.loc[(tt.label == 1) & (tt.prediction == 0), 'count'].sum())
tn = int(tt.loc[(tt.label == 0) & (tt.prediction == 0), 'count'].sum())

# A metric with a zero denominator is undefined; report NaN instead of raising.
accuracy = (tp + tn)/(tp + tn + fp + fn) if (tp + tn + fp + fn) else float('nan')
precision = tp/(tp + fp) if (tp + fp) else float('nan')
recall = tp/(tp + fn) if (tp + fn) else float('nan')
f1 = (2.0 * precision*recall)/(precision+recall) if (precision + recall) > 0 else float('nan')

print("Out of sample accuracy =", accuracy)
print("Out of sample precision =", precision)
print("Out of sample recall =", recall)
print("Out of sample F1 =", f1)
tt

Out of sample accuracy = 0.9770945686609983
Out of sample precision = 0.9765407105036412
Out of sample recall = 0.9776276559081182
Out of sample F1 = 0.9770838809160841


   label  prediction   count
0    0.0           0  976753
1    0.0           1   23442
2    1.0           0   22331
3    1.0           1  975821

Granted, this data is clearly rigged, and the knowledge of how it's rigged is exploited for the transform. <br>
It's worth noting, however, because many real business problems can also be solved better with greater intuition than with more data or model tuning.