In [221]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, precision_score, recall_score, accuracy_score, f1_score
from sklearn.metrics import plot_confusion_matrix, confusion_matrix 
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion


import xgboost as xgb

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline

In [222]:
df = pd.read_csv('data/telco_churn_data.csv')

df.head()

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [223]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   state                   3333 non-null   object 
 1   account length          3333 non-null   int64  
 2   area code               3333 non-null   int64  
 3   phone number            3333 non-null   object 
 4   international plan      3333 non-null   object 
 5   voice mail plan         3333 non-null   object 
 6   number vmail messages   3333 non-null   int64  
 7   total day minutes       3333 non-null   float64
 8   total day calls         3333 non-null   int64  
 9   total day charge        3333 non-null   float64
 10  total eve minutes       3333 non-null   float64
 11  total eve calls         3333 non-null   int64  
 12  total eve charge        3333 non-null   float64
 13  total night minutes     3333 non-null   float64
 14  total night calls       3333 non-null   

In [224]:
df['churn'] = df['churn'].astype('int')

In [225]:
df.describe()

Unnamed: 0,account length,area code,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
count,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0
mean,101.064806,437.182418,8.09901,179.775098,100.435644,30.562307,200.980348,100.114311,17.08354,200.872037,100.107711,9.039325,10.237294,4.479448,2.764581,1.562856,0.144914
std,39.822106,42.37129,13.688365,54.467389,20.069084,9.259435,50.713844,19.922625,4.310668,50.573847,19.568609,2.275873,2.79184,2.461214,0.753773,1.315491,0.352067
min,1.0,408.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.2,33.0,1.04,0.0,0.0,0.0,0.0,0.0
25%,74.0,408.0,0.0,143.7,87.0,24.43,166.6,87.0,14.16,167.0,87.0,7.52,8.5,3.0,2.3,1.0,0.0
50%,101.0,415.0,0.0,179.4,101.0,30.5,201.4,100.0,17.12,201.2,100.0,9.05,10.3,4.0,2.78,1.0,0.0
75%,127.0,510.0,20.0,216.4,114.0,36.79,235.3,114.0,20.0,235.3,113.0,10.59,12.1,6.0,3.27,2.0,0.0
max,243.0,510.0,51.0,350.8,165.0,59.64,363.7,170.0,30.91,395.0,175.0,17.77,20.0,20.0,5.4,9.0,1.0


In [226]:
df.isna().sum()

state                     0
account length            0
area code                 0
phone number              0
international plan        0
voice mail plan           0
number vmail messages     0
total day minutes         0
total day calls           0
total day charge          0
total eve minutes         0
total eve calls           0
total eve charge          0
total night minutes       0
total night calls         0
total night charge        0
total intl minutes        0
total intl calls          0
total intl charge         0
customer service calls    0
churn                     0
dtype: int64

In [227]:
df['area code'] = df['area code'].astype('str')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   state                   3333 non-null   object 
 1   account length          3333 non-null   int64  
 2   area code               3333 non-null   object 
 3   phone number            3333 non-null   object 
 4   international plan      3333 non-null   object 
 5   voice mail plan         3333 non-null   object 
 6   number vmail messages   3333 non-null   int64  
 7   total day minutes       3333 non-null   float64
 8   total day calls         3333 non-null   int64  
 9   total day charge        3333 non-null   float64
 10  total eve minutes       3333 non-null   float64
 11  total eve calls         3333 non-null   int64  
 12  total eve charge        3333 non-null   float64
 13  total night minutes     3333 non-null   float64
 14  total night calls       3333 non-null   

In [228]:
df.churn.value_counts()

0    2850
1     483
Name: churn, dtype: int64

In [229]:
df_obj_cols = df.select_dtypes(include='object')

for col in df_obj_cols:
    print(f'{col.title()} has the following different values: {df[col].unique()}')
    print(f'There are {df[col].nunique()} options for the {col.title()} feature')
    

State has the following different values: ['KS' 'OH' 'NJ' 'OK' 'AL' 'MA' 'MO' 'LA' 'WV' 'IN' 'RI' 'IA' 'MT' 'NY'
 'ID' 'VT' 'VA' 'TX' 'FL' 'CO' 'AZ' 'SC' 'NE' 'WY' 'HI' 'IL' 'NH' 'GA'
 'AK' 'MD' 'AR' 'WI' 'OR' 'MI' 'DE' 'UT' 'CA' 'MN' 'SD' 'NC' 'WA' 'NM'
 'NV' 'DC' 'KY' 'ME' 'MS' 'TN' 'PA' 'CT' 'ND']
There are 51 options for the State feature
Area Code has the following different values: ['415' '408' '510']
There are 3 options for the Area Code feature
Phone Number has the following different values: ['382-4657' '371-7191' '358-1921' ... '328-8230' '364-6381' '400-4344']
There are 3333 options for the Phone Number feature
International Plan has the following different values: ['no' 'yes']
There are 2 options for the International Plan feature
Voice Mail Plan has the following different values: ['yes' 'no']
There are 2 options for the Voice Mail Plan feature


Looking at the above columns, it is clear that while we have been give a phone number, since these are all unique, this isn't going to have any influence over whether a customer has churned or not, therefore the next steps will be to drop this column.

After this, looking at our brief exploratory data above there are a few different steps to complete before the data is ready to be passed into a model. These are:
- One hot encode all of the categorical columns
- Normalize all of the numerical columns
- Account for the class imbalance within the target data by using SMOTE

Once these steps have been completed the data should be ready to pass into our proposed models

In [230]:
X = df.drop(columns=['churn', 'phone number'], axis=1)

y = df.churn

In [231]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, stratify=y, random_state=42)

In [232]:
y_train.value_counts()

0    1995
1     338
Name: churn, dtype: int64

## Pre-Processing of the data
In this section we will work on:
- One hot encoding the relevant columns (only for the training dataset)
- Standardise the numerical columns

In [233]:
def additional_churn_features(X):
    
    X['total_calls'] = X['total day calls'] + X['total eve calls'] + X['total night calls'] + X['total intl calls']
    X['total_charges'] = X['total day charge'] + X['total eve charge'] + X['total night charge'] + X['total intl charge']
    X['total_minutes'] = X['total day minutes'] + X['total eve minutes'] + X['total night minutes'] + X['total intl minutes']
    X['pct_intl_calls'] = X['total intl calls'] / X['total_calls']
    X['pct_domestic_calls'] = 1 - X['pct_intl_calls']
    X['pct_intl_minutes'] = X['total intl minutes'] / X['total_minutes']
    X['pct_intl_charges'] = X['total intl charge'] / X['total_charges']
    X['avg_mins_per_call_day'] = X['total day minutes'] / X['total day calls']
    X['avg_mins_per_call_eve'] = X['total eve minutes'] / X['total eve calls']
    X['avg_mins_per_call_night'] = X['total night minutes'] / X['total night calls']
    
    return X

In [234]:
num_col_names = ['account length',
                 'number vmail messages',
                 'total day minutes',
                 'total day calls',
                 'total day charge',
                 'total eve minutes',
                 'total eve calls',
                 'total eve charge',
                 'total night minutes',
                 'total night calls',
                 'total night charge',
                 'total intl minutes',
                 'total intl calls',
                 'total intl charge',
                 'customer service calls',
                 'total_calls',
                 'total_charges',
                 'total_minutes',
                 'pct_intl_calls',
                 'pct_domestic_calls',
                 'pct_intl_minutes',
                 'pct_intl_charges',
                 'avg_mins_per_call_day',
                 'avg_mins_per_call_eve',
                 'avg_mins_per_call_night']

In [241]:
def preprocessing_pipeline(X, model=DecisionTreeClassifier):
      
      # feature engineering
    feat_eng_transformer = FunctionTransformer(additional_churn_features)
    
    # OHE
    cat_col_transformer = ColumnTransformer(transformers=[
    ("ohe", OneHotEncoder(handle_unknown='ignore', sparse=True), ['area code', 'international plan',
                                                                    'voice mail plan'])], remainder="passthrough")
    
    feature_union = FeatureUnion(transformer_list=[
      ("encoded_features", cat_col_transformer),
       ("engineered_features", feat_eng_transformer)])
    
     # StandardScaler
    #init_pipe = Pipeline(steps=[
    ##("feature_union", feature_union),
    #("scale", StandardScaler())])
    
     # SMOTE & StandardScaler
    imb_pipe = ImPipeline(steps=[('feat_union', feature_union),
                                ('sscaler', StandardScaler()),
                             ('smote', SMOTE(random_state=42)),
                                ('model', model(random_state=42))])
    
    

  #  transformed_data = imb_pipe.fit_transform(X)
    
  #  encoder = cat_col_transformer.named_transformers_["ohe"]
  #  category_labels = encoder.categories_[0]
  #  all_cols = list(category_labels) + num_col_names
   
    return imb_pipe #imb_pipe

In [242]:
result = preprocessing_pipeline(X_train)

result

Pipeline(steps=[('feat_union',
                 FeatureUnion(transformer_list=[('encoded_features',
                                                 ColumnTransformer(remainder='passthrough',
                                                                   transformers=[('ohe',
                                                                                  OneHotEncoder(handle_unknown='ignore'),
                                                                                  ['area '
                                                                                   'code',
                                                                                   'international '
                                                                                   'plan',
                                                                                   'voice '
                                                                                   'mail '
                                         

In [243]:
result.fit_transform(X_train).shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['total_calls'] = X['total day calls'] + X['total eve calls'] + X['total night calls'] + X['total intl calls']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['total_charges'] = X['total day charge'] + X['total eve charge'] + X['total night charge'] + X['total intl charge']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returnin

ValueError: could not convert string to float: 'RI'

In [207]:
result

ColumnTransformer(remainder='passthrough',
                  transformers=[('ohe',
                                 OneHotEncoder(handle_unknown='ignore',
                                               sparse=False),
                                 ['area code', 'international plan',
                                  'voice mail plan'])])

In [218]:
X_train_cat = X_train.select_dtypes(include='object')

ohe = OneHotEncoder(categories="auto",handle_unknown='ignore', sparse=False)

X_train_cat_encoded = ohe.fit_transform(X_train_cat)
X_train_cat_encoded.shape

(2333, 0)

In [219]:
X_train_cat_encoded = pd.DataFrame(
    X_train_cat_encoded,
    columns=ohe.get_feature_names(),
    index=X_train.index
)

In [220]:
X_train.drop(columns=X_train_cat.columns.values, axis=1, inplace=True)
X_train = pd.concat([X_train_cat_encoded, X_train], axis=1)

X_train

Unnamed: 0,x0_AK,x0_AL,x0_AR,x0_AZ,x0_CA,x0_CO,x0_CT,x0_DC,x0_DE,x0_FL,...,total_calls,total_charges,total_minutes,pct_intl_calls,pct_domestic_calls,pct_intl_minutes,pct_intl_charges,avg_mins_per_call_day,avg_mins_per_call_eve,avg_mins_per_call_night
606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,329,57.67,571.3,0.024316,0.975684,0.022930,0.061384,1.172269,2.755238,1.334021
2468,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,306,60.01,650.3,0.006536,0.993464,0.006612,0.019330,1.146875,2.975000,2.670000
1844,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,296,56.33,599.4,0.010135,0.989865,0.019186,0.055210,1.161538,2.397059,2.804054
3187,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,259,70.27,702.2,0.015444,0.984556,0.007690,0.020777,5.297500,3.610526,1.514388
3083,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,387,69.77,707.0,0.012920,0.987080,0.017680,0.048445,2.180556,0.995105,2.417557
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2670,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,312,60.90,560.0,0.019231,0.980769,0.016071,0.039901,2.046296,1.279661,2.237500
2165,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,319,61.33,605.9,0.009404,0.990596,0.021456,0.057231,1.662136,2.012397,1.936957
2988,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,234,72.22,668.9,0.021368,0.978632,0.017940,0.044863,2.701042,1.806186,6.177778
179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,346,73.36,725.6,0.000000,1.000000,0.000000,0.000000,1.902459,2.609821,1.796429


In [157]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2333 entries, 2016 to 3174
Data columns (total 83 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   x0_AK                    2333 non-null   float64
 1   x0_AL                    2333 non-null   float64
 2   x0_AR                    2333 non-null   float64
 3   x0_AZ                    2333 non-null   float64
 4   x0_CA                    2333 non-null   float64
 5   x0_CO                    2333 non-null   float64
 6   x0_CT                    2333 non-null   float64
 7   x0_DC                    2333 non-null   float64
 8   x0_DE                    2333 non-null   float64
 9   x0_FL                    2333 non-null   float64
 10  x0_GA                    2333 non-null   float64
 11  x0_HI                    2333 non-null   float64
 12  x0_IA                    2333 non-null   float64
 13  x0_ID                    2333 non-null   float64
 14  x0_IL                

## Scaling numerical features

In [None]:
scaler = StandardScaler()

scaled_data = scaler.fit_transform(X_train)
scaler.transform(X_test)

## Pipeline Building

In [None]:
pipeline = impipeline(steps = [])