## Decision Tree and Random Forest Classifiers with PySpark

### Importing Requirements

In [50]:
from pyspark.sql import SQLContext
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import DecisionTreeClassifier

from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.types import *
import pandas as pd

# Inject a CSS rule that sets the notebook's `.container` width to 250%.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:250% !important; }</style>"))

In [51]:
# `sc` and `spark` are not created anywhere in this notebook; they are expected
# to be provided by the PySpark kernel/shell that runs it.
sqlContext = SQLContext(sc)

# Typed, all-nullable schema with one field per column. Its field names
# (ID, Gender, Tenure, ...) replace the CSV's own column names when the schema
# is applied, and the same schema is reused to build the Spark DataFrame from
# the cleaned pandas frame.
schema = StructType([
    StructField("ID", StringType(), True),
    StructField("Gender", StringType(), True),
    StructField("SeniorCitizen", BooleanType(), True),
    StructField("Partner", StringType(), True),
    StructField("Dependents", StringType(), True),
    StructField("Tenure", StringType(), True),
    StructField("PhoneService", StringType(), True),
    StructField("MultipleLines", BooleanType(), True),
    StructField("InternetService", StringType(), True),
    StructField("OnlineSecurity", BooleanType(), True),
    StructField("OnlineBackup", BooleanType(), True),
    StructField("DeviceProtection", BooleanType(), True),
    StructField("TechSupport", BooleanType(), True),
    StructField("StreamingTV", BooleanType(), True),
    StructField("StreamingMovies", BooleanType(), True),
    StructField("Contract", StringType(), True),
    StructField("PaperlessBilling", StringType(), True),
    StructField("PaymentMethod", StringType(), True),
    StructField("MonthlyCharges", DoubleType(), True),
    StructField("TotalCharges", DoubleType(), True),
    StructField("Churn", StringType(), True),
])

# The CSV begins with a header row (pandas takes its column names from it), so
# Spark must be told to skip it rather than parse it as a data record.
# NOTE(review): the raw file stores the flag columns as text such as "Yes"/"No"
# (see the pandas preview), so the BooleanType fields will probably not parse
# when the file is read directly -- confirm whether `churn_data` is still needed.
churn_data = spark.read.csv("Telco_Churn.csv", schema=schema, header=True)

In [52]:
# Read the raw churn CSV into a pandas DataFrame and preview its first rows.
df = pd.read_csv("Telco_Churn.csv")
df.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [53]:
# Value -> boolean mappings shared by the flag columns.
_YES_NO = {'No': False, 'Yes': True}
_PHONE_FLAG = {'No': False, 'Yes': True, 'No phone service': False}
_INTERNET_FLAG = {'No': False, 'Yes': True, 'No internet service': False}

# Which mapping applies to which column. "No phone service" and
# "No internet service" are folded into False together with "No".
_FLAG_MAPPINGS = {
    'SeniorCitizen': {0: False, 1: True},
    'Partner': _YES_NO,
    'Dependents': _YES_NO,
    'PhoneService': _YES_NO,
    'MultipleLines': _PHONE_FLAG,
    'OnlineSecurity': _INTERNET_FLAG,
    'OnlineBackup': _INTERNET_FLAG,
    'DeviceProtection': _INTERNET_FLAG,
    'TechSupport': _INTERNET_FLAG,
    'StreamingTV': _INTERNET_FLAG,
    'StreamingMovies': _INTERNET_FLAG,
    'PaperlessBilling': _YES_NO,
}

# Each column is rewritten independently, so the iteration order is irrelevant.
# Churn is not converted here: it stays as text and is indexed into `label` by
# a StringIndexer later in the notebook.
for _column, _mapping in _FLAG_MAPPINGS.items():
    df[_column] = df[_column].replace(_mapping)

In [54]:
# TotalCharges is handled as text on the first pass: surrounding whitespace is
# stripped and the result is converted to numbers (entries that are empty after
# stripping are expected to come out as NaN and be removed by the later dropna
# -- presumably the reason for this cell).
# The dtype guard makes the cell safe to re-run: once the column is numeric the
# `.str` accessor would raise, so the conversion is skipped.
if not pd.api.types.is_numeric_dtype(df['TotalCharges']):
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'].str.strip())

In [55]:
# Discard every row that has at least one missing value, then report the
# remaining (rows, columns).
df = df.dropna(axis=0, how="any")
df.shape

(7032, 21)

In [56]:
# Convert the cleaned pandas frame into a Spark DataFrame using the explicit
# schema defined earlier.
pysparkdf = spark.createDataFrame(data=df, schema=schema)


In [57]:
# Print the first 20 rows of the Spark DataFrame, truncating long cell values.
pysparkdf.show(n=20, truncate=True)

+----------+------+-------------+-------+----------+------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------------+----------------+--------------------+--------------+------------+-----+
|        ID|Gender|SeniorCitizen|Partner|Dependents|Tenure|PhoneService|MultipleLines|InternetService|OnlineSecurity|OnlineBackup|DeviceProtection|TechSupport|StreamingTV|StreamingMovies|      Contract|PaperlessBilling|       PaymentMethod|MonthlyCharges|TotalCharges|Churn|
+----------+------+-------------+-------+----------+------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------------+----------------+--------------------+--------------+------------+-----+
|7590-VHVEG|Female|        false|   true|     false|     1|       false|        false|            DSL|         false|        true|           false|      false|      false|    

In [58]:
# Numeric columns that are appended to the assembler's input column list.
reduced_numeric_cols = ["MonthlyCharges"]

In [59]:
# Index the text target column `Churn` into the numeric `label` column.
label_indexer = StringIndexer(inputCol="Churn", outputCol="label")

# Indexers for two further columns.
# NOTE(review): the `*_indexed` columns they produce are not listed in the
# assembler's inputCols below, so they do not reach the `features` vector --
# confirm whether that is intended.
InternetService_indexer = StringIndexer(
    inputCol="InternetService",
    outputCol="InternetService_indexed",
)
PhoneService_indexer = StringIndexer(
    inputCol="PhoneService",
    outputCol="PhoneService_indexed",
)

# Pack TotalCharges plus the reduced numeric columns into one `features` vector.
assembler = VectorAssembler(
    outputCol="features",
    inputCols=["TotalCharges", *reduced_numeric_cols],
)

In [60]:
# Decision tree reading the assembled `features` vector and the indexed `label`.
classifier = DecisionTreeClassifier(labelCol='label', featuresCol='features')

# Stages run in order: index the label and the two categorical columns,
# assemble the feature vector, then fit the classifier.
pipeline = Pipeline(stages=[
    label_indexer,
    InternetService_indexer,
    PhoneService_indexer,
    assembler,
    classifier,
])

# 90/10 train/test split. A fixed seed keeps the split -- and therefore the
# reported metrics -- reproducible across runs instead of changing every time.
(train, test) = pysparkdf.randomSplit([0.9, 0.1], seed=42)
model = pipeline.fit(train)

### Model Evaluation

In [61]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Score the held-out split with the fitted pipeline.
predictions = model.transform(test)
evaluator = BinaryClassificationEvaluator()

# Evaluate the same predictions twice, overriding only the metric name on each
# call: first area under the ROC curve, then area under the PR curve.
auroc, aupr = (
    evaluator.evaluate(predictions, {evaluator.metricName: metric})
    for metric in ("areaUnderROC", "areaUnderPR")
)
"The AUROC is %s and the AUPR is %s." % (auroc, aupr)

'The AUROC is 0.4333862955657828 and the AUPR is 0.26958234240681367.'

## Random Forest Classifier 


In [62]:
from pyspark.ml.classification import RandomForestClassifier
# Number of trees passed to the random forest classifier as `numTrees`.
RF_NUM_TREES: int = 3

In [63]:
# Random forest over the same `features` / `label` columns. A fixed seed makes
# the forest's internal randomness repeatable.
classifier = RandomForestClassifier(
    labelCol='label',
    featuresCol='features',
    impurity="gini",
    featureSubsetStrategy="auto",
    numTrees=RF_NUM_TREES,
    maxDepth=30,
    seed=42,
)

# Same preprocessing stages as before, ending in the random forest.
pipeline = Pipeline(stages=[
    label_indexer,
    InternetService_indexer,
    PhoneService_indexer,
    assembler,
    classifier,
])

# 90/10 train/test split with a fixed seed so the reported metrics are
# reproducible across runs.
(train, test) = pysparkdf.randomSplit([0.9, 0.1], seed=42)
model = pipeline.fit(train)

In [64]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Apply the fitted pipeline to the held-out split.
predictions = model.transform(test)
evaluator = BinaryClassificationEvaluator()

# Per-call parameter overrides selecting which metric the evaluator reports.
roc_params = {evaluator.metricName: "areaUnderROC"}
pr_params = {evaluator.metricName: "areaUnderPR"}

auroc = evaluator.evaluate(predictions, roc_params)
aupr = evaluator.evaluate(predictions, pr_params)
"The AUROC is %s and the AUPR is %s." % (auroc, aupr)

'The AUROC is 0.7760461023107934 and the AUPR is 0.5800377753171316.'

In [65]:
# Show the true label next to the predicted class for the first rows.
label_vs_prediction = predictions.select("label", "prediction")
label_vs_prediction.show()

+-----+----------+
|label|prediction|
+-----+----------+
|  0.0|       1.0|
|  1.0|       1.0|
|  1.0|       0.0|
|  0.0|       0.0|
|  1.0|       1.0|
|  1.0|       0.0|
|  1.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  1.0|       1.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  1.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  1.0|       1.0|
|  1.0|       0.0|
|  0.0|       0.0|
+-----+----------+
only showing top 20 rows

