In [51]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when, isnan, split
from pyspark.sql import Row
import pandas as pd
import numpy as np
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import GBTClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [2]:
spSession = SparkSession.builder.master("local").appName("Local-Session").getOrCreate()

### Loading train and test data

In [3]:
trainDF = spSession.read.csv("../dataset/train_1.csv", header = True, inferSchema = True).drop("_c0") 
testDF = spSession.read.csv("../dataset/test_1.csv", header = True, inferSchema = True).drop("_c0")  
trainDF.toPandas().describe()

Unnamed: 0,account_length,number_vmail_messages,total_day_calls,total_day_charge,total_eve_calls,total_eve_charge,total_night_charge,total_intl_calls,total_intl_charge,number_customer_service_calls,state_indexed,international_plan_indexed,churn_indexed,total_day_charge_level,number_customer_service_calls_level
count,976.0,976.0,976.0,976.0,976.0,976.0,976.0,976.0,976.0,976.0,976.0,976.0,976.0,976.0,976.0
mean,102.107582,6.877049,100.615779,32.427602,99.790984,17.452869,9.082254,4.398566,2.784047,1.851434,20.726434,0.165984,0.494877,0.313525,0.281762
std,39.452298,13.185744,20.465679,10.480309,19.927751,4.413413,2.25701,2.534674,0.764389,1.56548,14.21505,0.372257,0.50023,0.464163,0.450089
min,1.0,0.0,0.0,0.0,42.0,3.59,1.97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,76.0,0.0,88.0,24.88,86.0,14.4475,7.55,3.0,2.315,1.0,8.0,0.0,0.0,0.0,0.0
50%,101.0,0.0,101.0,31.775,100.0,17.585,8.985,4.0,2.78,1.0,19.0,0.0,0.0,0.0,0.0
75%,127.0,0.0,114.0,40.43,113.0,20.4325,10.7225,6.0,3.29,3.0,32.0,0.0,1.0,1.0,1.0
max,232.0,50.0,165.0,59.64,168.0,30.91,15.97,20.0,5.4,9.0,50.0,1.0,1.0,1.0,1.0


### Vectorizing data

In [43]:
vecAssembler = VectorAssembler(inputCols=[col for col in trainDF.columns if col != "churn_indexed"], 
                               outputCol="features")
trainDF2 = vecAssembler.transform(trainDF)
testDF2 = vecAssembler.transform(testDF)

trainDF2 = trainDF2["churn_indexed", "features"]
testDF2 = testDF2["churn_indexed", "features"]

### Training model - Gradient Boosting Classifier

In [39]:
gbt = GBTClassifier(labelCol="churn_indexed", featuresCol="features", maxIter=10)
model = gbt.fit(trainDF2)

### Predictions

In [46]:
predictions = model.transform(testDF2)
predictions.select("prediction", "churn_indexed", "features").show(5)

+----------+-------------+--------------------+
|prediction|churn_indexed|            features|
+----------+-------------+--------------------+
|       0.0|          0.0|[101.0,0.0,123.0,...|
|       1.0|          0.0|[137.0,0.0,86.0,3...|
|       1.0|          0.0|[103.0,29.0,95.0,...|
|       0.0|          0.0|[99.0,0.0,123.0,3...|
|       0.0|          0.0|[108.0,0.0,78.0,3...|
+----------+-------------+--------------------+
only showing top 5 rows



### Confusion Matrix

In [50]:
pd.DataFrame(confusion_matrix(predictions.toPandas()["churn_indexed"], predictions.toPandas()["prediction"]), 
             columns = ["yes_true","no_true"], index = ["yes_pred","no_pred"])

Unnamed: 0,yes_true,no_true
yes_pred,1337,106
no_pred,46,178


### Accuracy

In [53]:
print("Model's accuracy: " + 
      str(100*round(accuracy_score(predictions.toPandas()["churn_indexed"], 
                                   predictions.toPandas()["prediction"]),2)) + "%")

Model's accuracy: 91.0%
