# PySpark with HDFS
Import libraries

In [1]:
from pyspark.sql.types import DoubleType, IntegerType
from pyspark.sql.functions import lit, udf
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier as RF, LogisticRegression as LR
import warnings
warnings.filterwarnings('ignore')

Waiting for a Spark session to start...

Waiting for a Spark session to start...

In [2]:
def ith_(v, i):
    """Return element ``i`` of ``v`` as a float, or ``None`` if it is unavailable.

    Parameters
    ----------
    v : indexable collection of numbers (may be ``None``)
        The value to read from, e.g. a probability vector.
    i : int
        Position of the element to extract.

    Returns
    -------
    float or None
        ``float(v[i])``, or ``None`` when ``v`` is missing, ``i`` is out of
        range, or the element cannot be converted to a float.
    """
    try:
        return float(v[i])
    except (ValueError, TypeError, IndexError):
        # ValueError: element is not numeric; TypeError: v (or the element) is
        # None / not indexable; IndexError: i is past the end of v.
        # Returning None keeps one bad row from failing the whole computation.
        return None

# Expose ith_ as a Spark SQL UDF whose declared return type is double.
ith = udf(f=ith_, returnType=DoubleType())

## Load data from HDFS
Load the data and cast the string columns to numeric types (doubles for the features `x1`–`x5`, integer for the label `y`).

In [3]:
# Read the CSV from HDFS using its first line as the header.
df_load = spark.read.csv('hdfs:///mydata.csv', header='true')

# Target type for each column: the five features become doubles, the label an integer.
column_types = [(feature, DoubleType()) for feature in ('x1', 'x2', 'x3', 'x4', 'x5')]
column_types.append(('y', IntegerType()))

# Replace each column with its cast version, in the same order as listed above.
for col_name, spark_type in column_types:
    df_load = df_load.withColumn(col_name, df_load[col_name].cast(spark_type))

# Row count as a sanity check, then a peek at the first rows.
print(df_load.count())
df_load.head(10)

1000000


[Row(x1=0.573615760542304, x2=-2.171103312705866, x3=0.33234832858709734, x4=0.5760633009829951, x5=0.3729207146549865, y=1), Row(x1=1.8713740240113563, x2=-1.9397738416892814, x3=1.2198104328861257, x4=-0.40002603975409445, x5=-0.607469321569246, y=0), Row(x1=0.014450262868191439, x2=1.0224354654164514, x3=0.03880004750212669, x4=-0.5093390663383718, x5=-0.47011830877007615, y=1), Row(x1=1.5028009781945337, x2=-0.228723004531617, x3=-0.7155399060064932, x4=-1.355270244500279, x5=-0.5376843716520238, y=0), Row(x1=-0.10719587543140312, x2=-0.39417886738578733, x3=-0.8481786345419119, x4=1.0933419513740712, x5=0.7800847487554594, y=1), Row(x1=-0.4695187927230554, x2=0.7052725236230953, x3=0.6574679910558906, x4=-0.8009112522259348, x5=-1.0236797375976776, y=0), Row(x1=1.096753842921396, x2=0.546347657969521, x3=-0.03218979568966823, x4=-0.7534462749672464, x5=0.5170169803217873, y=0), Row(x1=1.444313845827984, x2=-0.5534152717021814, x3=-0.02582451137750769, x4=-0.6179044638322294, x5=0.

## Create pipeline
Assign features and dependent variable.<br>
Build random forest model

In [11]:
# Assemble the raw feature columns into the single vector column Spark ML expects.
featureCols = ['x1', 'x2', 'x3', 'x4', 'x5']
assembler_features = VectorAssembler(inputCols=featureCols, outputCol='features')

# StringIndexer orders labels by descending frequency by default, which would make
# the meaning of label 1.0 depend on which class happens to be more common.
# Ordering by value pins y=0 -> 0.0 and y=1 -> 1.0, so the "positive" class in the
# truth table is always y=1.
labelIndexer = StringIndexer(inputCol='y', outputCol='label', stringOrderType='alphabetAsc')

dfX = [assembler_features, labelIndexer]
pipeline = Pipeline(stages=dfX)

# Fit the preprocessing stages and add the 'features' and 'label' columns.
allData = pipeline.fit(df_load).transform(df_load)

# 80/20 train/test split with a fixed seed so the split is repeatable.
trainingData, testData = allData.randomSplit([0.8, 0.2], seed=0)

# Random forest of 20 trees; every feature is considered at each split.
rf = RF(labelCol='label', featuresCol='features', numTrees=20, featureSubsetStrategy='all')
fit = rf.fit(trainingData)

# Score the held-out rows and keep only what the truth table needs.
transformed = fit.transform(testData)
results = transformed.select(['probability', 'label'])

print(results.count())
results.head(10)

199994


[Row(probability=DenseVector([0.4334, 0.5666]), label=0.0), Row(probability=DenseVector([0.9851, 0.0149]), label=0.0), Row(probability=DenseVector([0.3276, 0.6724]), label=1.0), Row(probability=DenseVector([0.8264, 0.1736]), label=0.0), Row(probability=DenseVector([0.4334, 0.5666]), label=0.0), Row(probability=DenseVector([0.4274, 0.5726]), label=0.0), Row(probability=DenseVector([0.4274, 0.5726]), label=1.0), Row(probability=DenseVector([0.3276, 0.6724]), label=1.0), Row(probability=DenseVector([0.332, 0.668]), label=0.0), Row(probability=DenseVector([0.366, 0.634]), label=1.0)]

## Create truth table
Extract probability from Dense Vector<br>
Compute the group-by in Spark. This scales with the data, whereas collecting the rows into an in-memory Python DataFrame may not.

In [12]:
# Threshold element 1 of the probability vector at 0.5 to get a hard 0/1 prediction.
validation = results.select(['label', (ith("probability", lit(1)) > 0.5).cast('integer').alias('prediction')])
# Count each (label, prediction) pair in Spark so only the small summary is collected.
truth_table = validation.groupBy(['label', 'prediction']).count().orderBy(['label', 'prediction'])

tt = truth_table.toPandas()

# groupBy emits no row for a (label, prediction) pair that never occurs, so sum the
# matching counts instead of indexing [0]: an absent cell counts as 0 rather than
# raising IndexError.
tp = tt.loc[(tt.label == 1) & (tt.prediction == 1), 'count'].sum()
fp = tt.loc[(tt.label == 0) & (tt.prediction == 1), 'count'].sum()
fn = tt.loc[(tt.label == 1) & (tt.prediction == 0), 'count'].sum()
tn = tt.loc[(tt.label == 0) & (tt.prediction == 0), 'count'].sum()

accuracy = (tp + tn)/(tp + tn + fp + fn)
precision = tp/(tp + fp)
recall = tp/(tp + fn)
f1 = (2.0 * precision*recall)/(precision+recall)

print("Out of sample accuracy =", accuracy)
print("Out of sample precision =", precision)
print("Out of sample recall =", recall)
print("Out of sample F1 =", f1)
tt

Out of sample accuracy = 0.6627648829464884
Out of sample precision = 0.6000147791441643
Out of sample recall = 0.9750720518773517
Out of sample F1 = 0.7428893827744082


   label  prediction  count
0    0.0           0  35112
1    0.0           1  64954
2    1.0           0   2491
3    1.0           1  97437

## Aside
To make the model a bit of a challenge, the dependent variable was a circle inside a circle.

![Sample](data_sample.png)

Nothing to do with Spark or HDFS, but here is a fun way to demonstrate how domain knowledge and improved data/transformation can be more valuable than model parameters.

In [7]:
# Engineered feature x6 = (x1 + x2 + x3 + x4)^2 + x5^2.
df_load = df_load.withColumn("x6", ((df_load["x1"] + df_load["x2"] + df_load["x3"] + df_load["x4"])**2 + df_load["x5"]**2).cast(DoubleType()))

# Same assembly as before, now including the engineered column.
featureCols = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6']
assembler_features = VectorAssembler(inputCols=featureCols, outputCol='features')

# StringIndexer orders labels by descending frequency by default, which would make
# the meaning of label 1.0 depend on which class happens to be more common.
# Ordering by value pins y=0 -> 0.0 and y=1 -> 1.0, so the "positive" class in the
# truth table is always y=1.
labelIndexer = StringIndexer(inputCol='y', outputCol="label", stringOrderType='alphabetAsc')

dfX = [assembler_features, labelIndexer]
pipeline = Pipeline(stages=dfX)

# Fit the preprocessing stages and add the 'features' and 'label' columns.
allData = pipeline.fit(df_load).transform(df_load)

# 80/20 train/test split with a fixed seed so the split is repeatable.
trainingData, testData = allData.randomSplit([0.8, 0.2], seed=0)

# Logistic regression with default hyper-parameters.
lr = LR(labelCol='label', featuresCol='features')
fit = lr.fit(trainingData)

# Score the held-out rows and keep only what the truth table needs.
transformed = fit.transform(testData)
results = transformed.select(['probability', 'label'])

# Threshold element 1 of the probability vector at 0.5 to get a hard 0/1 prediction.
validation = results.select(['label', (ith("probability", lit(1)) > 0.5).cast('integer').alias('prediction')])
# Count each (label, prediction) pair in Spark so only the small summary is collected.
truth_table = validation.groupBy(['label', 'prediction']).count().orderBy(['label', 'prediction'])

tt = truth_table.toPandas()

# groupBy emits no row for a (label, prediction) pair that never occurs, so sum the
# matching counts instead of indexing [0]: an absent cell counts as 0 rather than
# raising IndexError.
tp = tt.loc[(tt.label == 1) & (tt.prediction == 1), 'count'].sum()
fp = tt.loc[(tt.label == 0) & (tt.prediction == 1), 'count'].sum()
fn = tt.loc[(tt.label == 1) & (tt.prediction == 0), 'count'].sum()
tn = tt.loc[(tt.label == 0) & (tt.prediction == 0), 'count'].sum()

accuracy = (tp + tn)/(tp + tn + fp + fn)
precision = tp/(tp + fp)
recall = tp/(tp + fn)
f1 = (2.0 * precision*recall)/(precision+recall)

print("Out of sample accuracy =", accuracy)
print("Out of sample precision =", precision)
print("Out of sample recall =", recall)
print("Out of sample F1 =", f1)
tt

Out of sample accuracy = 0.9776193285798574
Out of sample precision = 0.9763644348624586
Out of sample recall = 0.9789048114642542
Out of sample F1 = 0.9776329728757321


   label  prediction  count
0    0.0           0  97698
1    0.0           1   2368
2    1.0           0   2108
3    1.0           1  97820

Granted, this data is clearly rigged, and the knowledge of how it is rigged is exploited for the transform. <br>
It is worth noting, however, because many real business problems can also be solved more effectively with better intuition about the data than with more data or more model tuning.