In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [2]:
sc = SparkContext(appName='teleco-customer-churn')
spark = SparkSession.builder.getOrCreate()

This notebook will work more as a guide to help us develop the script that will run on GCP, and since we will run it localy, I'll just grab a sample of around 1000 records, so we can do it faster.

In [8]:
customers_table = spark.read.csv('../data/WA_Fn-UseC_-Telco-Customer-Churn.csv', header='true', inferSchema='true')
customers_table_sample = customers_table.sample(withReplacement=False, fraction=0.15, seed=42)

In [9]:
customers_table_sample.count()

1102

We can start by applying the same steps we did on the analysis notebook to treat missing values and standardize column names.

In [10]:
customers_table_sample = customers_table_sample.withColumnRenamed('gender', 'Gender').withColumnRenamed('tenure', 'Tenure').withColumnRenamed('customerId', 'CustomerId')
customers_table_sample = customers_table_sample.replace(subset='TotalCharges', to_replace=' ', value='0.00')
customers_table_sample = customers_table_sample.withColumn('TotalCharges', customers_table_sample.TotalCharges.cast('double'))

In [12]:
customers_table_sample.show()

+----------+------+-------------+-------+----------+------+------------+----------------+---------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------+----------------+--------------------+--------------+------------+-----+
|CustomerId|Gender|SeniorCitizen|Partner|Dependents|Tenure|PhoneService|   MultipleLines|InternetService|     OnlineSecurity|       OnlineBackup|   DeviceProtection|        TechSupport|        StreamingTV|    StreamingMovies|      Contract|PaperlessBilling|       PaymentMethod|MonthlyCharges|TotalCharges|Churn|
+----------+------+-------------+-------+----------+------+------------+----------------+---------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------+----------------+--------------------+--------------+------------+-----+
|6713-OKOMC|Female|            0|     No|        No|    10|  

In [15]:
customers_table_sample.distinct()

1102

### Pre-processing
---
First, we'll drop de Id column, since it doesn't present any predictive value. Then we'll convert the categorical string variables into numeric variables.

In [37]:
customers_table_sample = customers_table_sample.drop('CustomerId')

In [62]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import LogisticRegression

In [59]:
string_variables = [variable[0] for variable in customers_table_sample.dtypes if variable[1] == 'string']
output_string_variables = [variable+'_numeric' for variable in string_variables]
rename_columns_dic = {output_string_variables[index]:string_variables[index] for index in range(len(string_variables))}

indexer_model = StringIndexer(inputCols=string_variables, outputCols=output_string_variables)
indexer_fitted = indexer_model.fit(customers_table_sample)
numeric_customers_table_sample = indexer_fitted.transform(customers_table_sample)

numeric_customers_table_sample = numeric_customers_table_sample.drop(*string_variables)
numeric_customers_table_sample = numeric_customers_table_sample.withColumnsRenamed(rename_columns_dic)

numeric_customers_table_sample.show()

+-------------+------+--------------+------------+------+-------+----------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+-----+
|SeniorCitizen|Tenure|MonthlyCharges|TotalCharges|Gender|Partner|Dependents|PhoneService|MultipleLines|InternetService|OnlineSecurity|OnlineBackup|DeviceProtection|TechSupport|StreamingTV|StreamingMovies|Contract|PaperlessBilling|PaymentMethod|Churn|
+-------------+------+--------------+------------+------+-------+----------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+-----+
|            0|    10|         29.75|       301.9|   0.0|    0.0|       0.0|         1.0|          2.0|            1.0|           1.0|         0.0|             0.0|        0.0|        1.0|            0.0|     0.0|             1.0|          3.0|  0

In [61]:
numeric_customers_table_sample.dtypes

[('SeniorCitizen', 'int'),
 ('Tenure', 'int'),
 ('MonthlyCharges', 'double'),
 ('TotalCharges', 'double'),
 ('Gender', 'double'),
 ('Partner', 'double'),
 ('Dependents', 'double'),
 ('PhoneService', 'double'),
 ('MultipleLines', 'double'),
 ('InternetService', 'double'),
 ('OnlineSecurity', 'double'),
 ('OnlineBackup', 'double'),
 ('DeviceProtection', 'double'),
 ('TechSupport', 'double'),
 ('StreamingTV', 'double'),
 ('StreamingMovies', 'double'),
 ('Contract', 'double'),
 ('PaperlessBilling', 'double'),
 ('PaymentMethod', 'double'),
 ('Churn', 'double')]

Cool, we got all the variables set as numeric values. We will now create our first model so we can use it as a baseline. I don't expect it to be the most accurate, but after that we can dig more into other pre-processing techniques.

Before we can create our model, we need to do a train-test split. Spark doesn't have a easy function to do so (there is RandomSplit() but I don't like the way that it work, [click here to find more](https://sergei-ivanov.medium.com/why-you-should-not-use-randomsplit-in-pyspark-to-split-data-into-train-and-test-58576d539a36)), so we'll have to do it by hand. 

In [81]:
from random import seed, randint
seed(42)
train_test_index = randint(0,99)

In [83]:
numeric_customers_table_sample.withColumn('train_text_index', randint(0,1)).show()

PySparkTypeError: [NOT_COLUMN] Argument `col` should be a Column, got int.

In [68]:
train_table, test_table = numeric_customers_table_sample.randomSplit([0.7,0.3], seed=42)
train_table.show()

+-------------+------+--------------+------------+------+-------+----------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+-----+
|SeniorCitizen|Tenure|MonthlyCharges|TotalCharges|Gender|Partner|Dependents|PhoneService|MultipleLines|InternetService|OnlineSecurity|OnlineBackup|DeviceProtection|TechSupport|StreamingTV|StreamingMovies|Contract|PaperlessBilling|PaymentMethod|Churn|
+-------------+------+--------------+------------+------+-------+----------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+-----+
|            0|     0|         73.35|         0.0|   0.0|    1.0|       1.0|         0.0|          1.0|            1.0|           0.0|         1.0|             1.0|        1.0|        0.0|            0.0|     1.0|             1.0|          3.0|  0

In [69]:
test_table.tail(10)

[Row(SeniorCitizen=1, Tenure=60, MonthlyCharges=101.4, TotalCharges=6176.6, Gender=0.0, Partner=0.0, Dependents=0.0, PhoneService=0.0, MultipleLines=1.0, InternetService=0.0, OnlineSecurity=1.0, OnlineBackup=0.0, DeviceProtection=0.0, TechSupport=0.0, StreamingTV=0.0, StreamingMovies=1.0, Contract=2.0, PaperlessBilling=0.0, PaymentMethod=2.0, Churn=0.0),
 Row(SeniorCitizen=1, Tenure=62, MonthlyCharges=97.95, TotalCharges=5936.55, Gender=0.0, Partner=1.0, Dependents=0.0, PhoneService=0.0, MultipleLines=1.0, InternetService=0.0, OnlineSecurity=0.0, OnlineBackup=1.0, DeviceProtection=1.0, TechSupport=0.0, StreamingTV=0.0, StreamingMovies=0.0, Contract=2.0, PaperlessBilling=0.0, PaymentMethod=1.0, Churn=0.0),
 Row(SeniorCitizen=1, Tenure=64, MonthlyCharges=111.6, TotalCharges=7099.0, Gender=1.0, Partner=1.0, Dependents=1.0, PhoneService=0.0, MultipleLines=1.0, InternetService=0.0, OnlineSecurity=1.0, OnlineBackup=0.0, DeviceProtection=1.0, TechSupport=1.0, StreamingTV=0.0, StreamingMovies=

In [None]:
lr_base_model = LogisticRegression()