# Introduction

This project uses a random forest to predict customer churn in a telecom data set from Kaggle.

In [1]:
import numpy as np 
import pandas as pd

from fastai.imports import *
from fastai.structured import *

from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display

from sklearn import metrics

# Load the three raw training tables, in this order:
#   df  <- services opted for
#   df2 <- demographics
#   df3 <- 'Train (3).csv'
df, df2, df3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/Train_ServicesOptedFor.csv',
        'data/Train_Demographics.csv',
        'data/Train (3).csv',
    )
)

Preprocess the data for machine learning: check for null values, merge the CSV files, and convert categorical columns.

In [2]:

# Reshape the long services table to one row per customer: every TypeOfService
# value becomes its own column, and the two column levels are joined with "_".
customer_data = df.set_index(['CustomerID', 'TypeOfService']).unstack()
customer_data.columns = customer_data.columns.map('_'.join)
customer_data = customer_data.reset_index()

# Attach the demographics table (keyed by HouseholdID), then the third table.
# Both joins are outer joins, so unmatched rows from either side are kept.
merge_one = customer_data.merge(
    df2, how='outer', left_on='CustomerID', right_on='HouseholdID'
).drop('HouseholdID', axis=1)
final = merge_one.merge(df3, how='outer', on='CustomerID')

In [3]:
# Search for NaN values
def display_all(df):
    """Display `df` with pandas' row and column display limits raised to 1000.

    The raised limits apply only for the duration of the call; the previous
    option values are restored when the context manager exits.
    """
    widened = pd.option_context("display.max_rows", 1000, "display.max_columns", 1000)
    with widened:
        display(df)
        
# Share of missing entries per column (null count divided by row count),
# ordered by column name.
null_share = final.isnull().sum().sort_index() / len(final)
display_all(null_share)

Churn                                    0.000000
Country                                  0.000000
CustomerID                               0.000000
Education                                0.001888
Gender                                   0.000755
HasDependents                            0.000000
HasPartner                               0.000000
Retired                                  0.000000
SeviceDetails_DeviceProtection           0.000000
SeviceDetails_HasPhoneService            0.000000
SeviceDetails_InternetServiceCategory    0.000000
SeviceDetails_MultipleLines              0.000000
SeviceDetails_OnlineBackup               0.000000
SeviceDetails_OnlineSecurity             0.000000
SeviceDetails_StreamingMovies            0.000000
SeviceDetails_StreamingTelevision        0.000000
SeviceDetails_TechnicalSupport           0.000000
State                                    0.000000
dtype: float64

In [4]:
# fastai helper called for its effect on `final`; its return value is not used.
# Presumably converts the string columns of `final` to pandas categoricals
# in place — TODO confirm against fastai.structured.train_cats.
train_cats(final)

In [5]:
# Split `final` into the model inputs (`df`) and the target (`y`, taken from the
# 'Churn' column); both are passed to `m.fit` in the following cells.
# NOTE: this rebinds `df`, which until now held the raw services table read in
# the first cell. Re-running the reshape/merge cell after this one will therefore
# operate on the processed features instead of the raw CSV data.
# NOTE(review): `CustomerID` appears to stay among the features (17 columns in the
# split below) — confirm that an identifier is wanted as a predictor.
df, y, nas = proc_df(final, 'Churn')

In [6]:
# Show the column names of the merged frame.
final.columns

Index(['CustomerID', 'SeviceDetails_DeviceProtection',
       'SeviceDetails_HasPhoneService',
       'SeviceDetails_InternetServiceCategory', 'SeviceDetails_MultipleLines',
       'SeviceDetails_OnlineBackup', 'SeviceDetails_OnlineSecurity',
       'SeviceDetails_StreamingMovies', 'SeviceDetails_StreamingTelevision',
       'SeviceDetails_TechnicalSupport', 'Country', 'State', 'Retired',
       'HasPartner', 'HasDependents', 'Education', 'Gender', 'Churn'],
      dtype='object')

In [7]:
# Baseline forest fitted on every row. `random_state` is fixed so that the
# randomised parts of training give the same model (and the same score) after
# a kernel restart.
m = RandomForestClassifier(n_jobs=-1, random_state=42)
m.fit(df, y)
# This score is measured on the same rows the model was fitted on, so it is an
# optimistic figure rather than an estimate of performance on unseen customers.
m.score(df, y)

0.97791619479048697

In [8]:
def split_vals(a, n):
    """Split `a` at position `n` and return copies of both parts.

    Returns a tuple `(a[:n].copy(), a[n:].copy())`, so modifying either part
    does not modify `a`.
    """
    head = a[:n].copy()
    tail = a[n:].copy()
    return head, tail

# Hold out the last `n_valid` rows as the validation set. The split is positional
# (no shuffling): everything before index `n_trn` is training data.
n_valid = 1000  # same as Kaggle's test set size
n_trn = len(df) - n_valid

# Apply the same cut to the merged frame, the feature matrix and the target.
(raw_train, raw_valid), (X_train, X_valid), (y_train, y_valid) = (
    split_vals(data, n_trn) for data in (final, df, y)
)

X_train.shape, y_train.shape, X_valid.shape

((4298, 17), (4298,), (1000, 17))

In [9]:
def rmse(x, y):
    """Return the root-mean-square difference between `x` and `y`.

    Both arguments must support element-wise subtraction and the result must
    provide `.mean()` (e.g. NumPy arrays or pandas Series).
    """
    squared_error = (x - y) ** 2
    return math.sqrt(squared_error.mean())

def print_score(m):
    """Print a list of train/validation metrics for the fitted model `m`.

    The printed list contains, in order: RMSE of the predictions on `X_train`,
    RMSE on `X_valid`, `m.score` on the training split and `m.score` on the
    validation split. If `m` has an `oob_score_` attribute it is appended as a
    fifth entry. Reads the notebook-level `X_train`, `y_train`, `X_valid` and
    `y_valid`.
    """
    train_rmse = rmse(m.predict(X_train), y_train)
    valid_rmse = rmse(m.predict(X_valid), y_valid)
    train_score = m.score(X_train, y_train)
    valid_score = m.score(X_valid, y_valid)

    res = [train_rmse, valid_rmse, train_score, valid_score]
    if hasattr(m, 'oob_score_'):
        res.append(m.oob_score_)
    print(res)

In [10]:
# Forest fitted on the training split only and scored on both splits.
# `random_state` is fixed so the reported metrics are reproducible on re-run;
# without it every execution trains a different forest.
m = RandomForestClassifier(
    n_estimators=80,
    min_samples_leaf=3,
    max_features=0.5,
    n_jobs=-1,
    random_state=42,
)
m.fit(X_train, y_train)
print_score(m)

[0.32285372490850467, 0.4571651780264984, 0.89576547231270354, 0.79100000000000004]


# Conclusion

Using basic settings, the random forest correctly predicts whether or not a customer will leave for about 79% of the customers in the held-out validation set (79% accuracy).
