## Swan Consulting Feature engineering and Model Creation 

In this workbook, we will explore and engineer the features from our Swan Consulting Customer Dataset, then move onto create a DecisionTreeClassifier and evaluate it's performance

In [1]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE

In [2]:
def apr(y_pred, y_real):
    accuracy = metrics.accuracy_score(y_real, y_pred)
    precision = metrics.precision_score(y_real, y_pred)
    recall = metrics.recall_score(y_real, y_pred)
    f1 = metrics.f1_score(y_real, y_pred)
    
    print(f"Accuracy:{accuracy}")
    print(f"Precision:{precision}")
    print(f"Recall:{recall}")
    print(f"F1:{f1}")

## Data Import and Exploration

In [3]:
swan = pd.read_excel("/Users/zachgolant/Desktop/DF/1 - Project Data.xlsx")
swan = swan.copy()
swan.head()

Unnamed: 0,CustomerID,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude,Gender,...,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Value,Churn Reason
0,3668-QPYBK,1,United States,California,Los Angeles,90003,"33.964131, -118.272783",33.964131,-118.272783,Male,...,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1,Competitor made better offer
1,9237-HQITU,1,United States,California,Los Angeles,90005,"34.059281, -118.30742",34.059281,-118.30742,Female,...,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1,Moved
2,9305-CDSKC,1,United States,California,Los Angeles,90006,"34.048013, -118.293953",34.048013,-118.293953,Female,...,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,Yes,1,Moved
3,7892-POOKP,1,United States,California,Los Angeles,90010,"34.062125, -118.315709",34.062125,-118.315709,Female,...,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes,1,Moved
4,0280-XJGEX,1,United States,California,Los Angeles,90015,"34.039224, -118.266293",34.039224,-118.266293,Male,...,Yes,Yes,Month-to-month,Yes,Bank transfer (automatic),103.7,5036.3,Yes,1,Competitor had better devices


In [4]:
swan.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 31 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   CustomerID         7043 non-null   object 
 1   Count              7043 non-null   int64  
 2   Country            7043 non-null   object 
 3   State              7043 non-null   object 
 4   City               7043 non-null   object 
 5   Zip Code           7043 non-null   int64  
 6   Lat Long           7043 non-null   object 
 7   Latitude           7043 non-null   float64
 8   Longitude          7043 non-null   float64
 9   Gender             7043 non-null   object 
 10  Senior Citizen     7043 non-null   object 
 11  Partner            7043 non-null   object 
 12  Dependents         7043 non-null   object 
 13  Tenure Months      7043 non-null   int64  
 14  Phone Service      7043 non-null   object 
 15  Multiple Lines     7043 non-null   object 
 16  Internet Service   7043 

In [5]:
swan.loc[:,'Tenure Months':'Total Charges']

Unnamed: 0,Tenure Months,Phone Service,Multiple Lines,Internet Service,Online Security,Online Backup,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges
0,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15
1,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65
2,8,Yes,Yes,Fiber optic,No,No,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5
3,28,Yes,Yes,Fiber optic,No,No,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,104.80,3046.05
4,49,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,Month-to-month,Yes,Bank transfer (automatic),103.70,5036.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,72,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,Yes,Bank transfer (automatic),21.15,1419.4
7039,24,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5
7040,72,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9
7041,11,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45


## Feature Engineering

In [6]:
def yes_no(column):
    x=column
    if x=='Yes':
        return 1
    else:
        return 0
        
products = ['Product dissatisfaction', 'Network reliability', 'Lack of self-service on Website', 'Limited range of services', 'Long distance charges', 'Lack of affordable download/upload speed' ]
swan_services = ['Attitude of support person', 'Attitude of service provider',   'Price too high', 'Service dissatisfaction',  'Extra data charges', 'Poor expertise of phone support', 'Poor expertise of online support']
competitor_services = ['Competitor offered higher download speeds', 'Competitor offered more data', 'Competitor made better offer', 'Competitor had better devices']

def churn_reason(reason):
    if reason in products:
        return 'Issue with products' 
    elif reason in swan_services:
        return 'Issue with Swan Services' 
    elif reason in competitor_services:
        return 'Competitor offered better services'
    else:
        return 'Other'

swan=swan[swan['Total Charges']!=' '] #Some customers are in the database but never actually used the product, also have Tenure=0

In [7]:
def feature_eng(x):
    df=x.copy()
    yes_no_columns=['Senior Citizen', 'Partner', 'Dependents', 'Phone Service', 'Multiple Lines',
                    'Online Security', 'Online Backup', 'Device Protection', 
                    'Tech Support', 'Streaming TV', 'Streaming Movies', 'Paperless Billing']
    for column in yes_no_columns:
        df[column]=df[column].apply(yes_no) 
    encoding_columns=['Contract','Payment Method', 'Internet Service']
    encoding_prefix=['Contract','Payment', 'Internet']
    df= pd.get_dummies(data=df, 
                       columns=encoding_columns, 
                       prefix=encoding_prefix, 
                       dtype=int)
    df.Gender = df.Gender.map({'Male':0, 'Female':1})

    return df

In [8]:
feature_eng(swan).info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 38 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   CustomerID                         7032 non-null   object 
 1   Count                              7032 non-null   int64  
 2   Country                            7032 non-null   object 
 3   State                              7032 non-null   object 
 4   City                               7032 non-null   object 
 5   Zip Code                           7032 non-null   int64  
 6   Lat Long                           7032 non-null   object 
 7   Latitude                           7032 non-null   float64
 8   Longitude                          7032 non-null   float64
 9   Gender                             7032 non-null   int64  
 10  Senior Citizen                     7032 non-null   int64  
 11  Partner                            7032 non-null   int64

Before we can start fitting models, it is worth noting that there is a massive class imbalance in the data. There are approximately 3 times as many non-chruners as chruners, and so we need to employ some resampling methods in order to fit a robust model.

In [45]:


feature_cols=[
        'Gender', 'Senior Citizen',
       'Partner', 'Dependents', 'Tenure Months', 'Phone Service',
       'Multiple Lines', 'Online Security', 'Online Backup',
       'Device Protection', 'Tech Support', 'Streaming TV', 'Streaming Movies',
       'Paperless Billing', 'Monthly Charges', 'Total Charges','Contract_Month-to-month',
       'Contract_One year', 'Contract_Two year',
       'Payment_Bank transfer (automatic)', 'Payment_Credit card (automatic)',
       'Payment_Electronic check', 'Payment_Mailed check', 'Internet_DSL',
       'Internet_Fiber optic', 'Internet_No']
target='Churn Value'

X_train, X_test, y_train, y_test = train_test_split(feature_eng(swan)[feature_cols], swan[target], test_size = 0.2, random_state = 42)


smote=SMOTE()
X_train, y_train = smote.fit_resample(X_train,y_train)

In [34]:
len(X_train)==len(y_train)

True

## Model Fit and Evaluation

We are going to consider three different classification models for predicting churn risk: Logistic Regression, Random Forest Classifier and Extra Trees Classifier. First of all, we are going to create baseline models for each of the three, using all of the available variables and limited tuning of the hyper parameters. Based on the performance of these, we can decide which model we want to use.

In [198]:
threshold=0.75

In [200]:
lr = LogisticRegression(max_iter=1000, random_state=10, class_weight='balanced')
lr.fit(X_train, y_train)

lr_train_prob=lr.predict_proba(X_train)[:,1]
lr_train_pred=np.where(lr_train_prob>threshold,1,0)
print(metrics.classification_report(lr_train_pred, y_train))
print("")
lr_test_prob=lr.predict_proba(X_test)[:,1]
lr_test_pred=np.where(lr_test_prob>threshold,1,0)
print(metrics.classification_report(lr_test_pred, y_test))

              precision    recall  f1-score   support

           0       0.96      0.76      0.85      5254
           1       0.69      0.94      0.80      3048

    accuracy                           0.83      8302
   macro avg       0.83      0.85      0.82      8302
weighted avg       0.86      0.83      0.83      8302


              precision    recall  f1-score   support

           0       0.97      0.78      0.87      1248
           1       0.32      0.79      0.45       159

    accuracy                           0.78      1407
   macro avg       0.64      0.78      0.66      1407
weighted avg       0.89      0.78      0.82      1407



In [201]:
rf = RandomForestClassifier(n_estimators=150, 
                            max_depth=5, 
                            min_samples_split=5, 
                            criterion='gini',
                           min_samples_leaf=5)
rf.fit(X_train,y_train)
rf_train_prob=rf.predict_proba(X_train)[:,1]
rf_train_pred= np.where(rf_train_prob>threshold, 1, 0)
print(metrics.classification_report(rf_train_pred, y_train))

rf_test_prob=rf.predict_proba(X_test)[:,1]
rf_test_pred=np.where(rf_test_prob>threshold,1,0)
print(metrics.classification_report(rf_test_pred, y_test))

              precision    recall  f1-score   support

           0       0.95      0.64      0.77      6169
           1       0.47      0.91      0.62      2133

    accuracy                           0.71      8302
   macro avg       0.71      0.78      0.69      8302
weighted avg       0.83      0.71      0.73      8302

              precision    recall  f1-score   support

           0       0.95      0.80      0.87      1205
           1       0.38      0.74      0.50       202

    accuracy                           0.79      1407
   macro avg       0.66      0.77      0.68      1407
weighted avg       0.87      0.79      0.81      1407



In [203]:
et = ExtraTreesClassifier(n_estimators=150, 
                          max_depth=4, 
                          min_samples_split=5)
et.fit(X_train,y_train)
et_train_prob=et.predict_proba(X_train)[:,1]
et_train_pred= np.where(et_train_prob>threshold, 1, 0)
print(metrics.classification_report(et_train_pred, y_train))
print("")
et_test_prob=et.predict_proba(X_test)[:,1]
et_test_pred= np.where(et_test_prob>threshold, 1, 0)
print(metrics.classification_report(et_test_pred, y_test))

              precision    recall  f1-score   support

           0       0.96      0.61      0.75      6482
           1       0.40      0.91      0.55      1820

    accuracy                           0.68      8302
   macro avg       0.68      0.76      0.65      8302
weighted avg       0.84      0.68      0.71      8302


              precision    recall  f1-score   support

           0       0.96      0.78      0.86      1252
           1       0.29      0.74      0.42       155

    accuracy                           0.77      1407
   macro avg       0.63      0.76      0.64      1407
weighted avg       0.89      0.77      0.81      1407



This is a comparison of three different models, with all of the variables 

## Optimisation of the Model

We want to select parameters and variables that will give us the most robust fit. To do this, we list the feature importance of the variables and grid search using a grid search.

In [52]:
feature_eng(swan).columns

Index(['CustomerID', 'Count', 'Country', 'State', 'City', 'Zip Code',
       'Lat Long', 'Latitude', 'Longitude', 'Gender', 'Senior Citizen',
       'Partner', 'Dependents', 'Tenure Months', 'Phone Service',
       'Multiple Lines', 'Online Security', 'Online Backup',
       'Device Protection', 'Tech Support', 'Streaming TV', 'Streaming Movies',
       'Paperless Billing', 'Monthly Charges', 'Total Charges', 'Churn Label',
       'Churn Value', 'Churn Reason', 'Contract_Month-to-month',
       'Contract_One year', 'Contract_Two year',
       'Payment_Bank transfer (automatic)', 'Payment_Credit card (automatic)',
       'Payment_Electronic check', 'Payment_Mailed check', 'Internet_DSL',
       'Internet_Fiber optic', 'Internet_No'],
      dtype='object')

In [204]:
importance=pd.DataFrame(list(zip(feature_cols, list(rf.feature_importances_))))\
    .sort_values(by=[1], ascending= False)\
    .rename({0:'Variable',1:'Importance'},axis=1)
importance

Unnamed: 0,Variable,Importance
16,Contract_Month-to-month,0.139912
3,Dependents,0.129711
18,Contract_Two year,0.129507
4,Tenure Months,0.092592
24,Internet_Fiber optic,0.068953
7,Online Security,0.068279
17,Contract_One year,0.054891
25,Internet_No,0.054161
10,Tech Support,0.053435
14,Monthly Charges,0.047787


In [210]:
selected_cols=list(importance.head(15).Variable)

In [211]:
rf=RandomForestClassifier(n_estimators=50, class_weight= 'balanced')
rf_params = {
    'max_depth': [2,4,6,8,10],
    'min_samples_split': [2, 4,6,8,10]
    
}
gs = GridSearchCV(rf, param_grid=rf_params, cv=5, verbose=2)
gs.fit(X_train[selected_cols], y_train)
print(gs.best_score_)
gs.best_params_

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV] END ...................max_depth=2, min_samples_split=2; total time=   0.1s
[CV] END ...................max_depth=2, min_samples_split=2; total time=   0.1s
[CV] END ...................max_depth=2, min_samples_split=2; total time=   0.1s
[CV] END ...................max_depth=2, min_samples_split=2; total time=   0.1s
[CV] END ...................max_depth=2, min_samples_split=2; total time=   0.1s
[CV] END ...................max_depth=2, min_samples_split=4; total time=   0.1s
[CV] END ...................max_depth=2, min_samples_split=4; total time=   0.1s
[CV] END ...................max_depth=2, min_samples_split=4; total time=   0.1s
[CV] END ...................max_depth=2, min_samples_split=4; total time=   0.1s
[CV] END ...................max_depth=2, min_samples_split=4; total time=   0.1s
[CV] END ...................max_depth=2, min_samples_split=6; total time=   0.1s
[CV] END ...................max_depth=2, min_sa

{'max_depth': 10, 'min_samples_split': 6}

In [212]:
threshold=0.75
rf = RandomForestClassifier(n_estimators=250, 
                            max_depth=10, 
                            min_samples_split=6, 
                            criterion='gini',
                            min_samples_leaf=10,
                            class_weight='balanced'
                           )
rf.fit(X_train[selected_cols],y_train)
rf_train_prob=rf.predict_proba(X_train[selected_cols])[:,1]
rf_train_pred= np.where(rf_train_prob>threshold, 1, 0)
print(metrics.classification_report(rf_train_pred, y_train))
print("")
rf_test_prob=rf.predict_proba(X_test[selected_cols])[:,1]
rf_test_pred=np.where(rf_test_prob>threshold,1,0)
print(metrics.classification_report(rf_test_pred, y_test))

              precision    recall  f1-score   support

           0       0.93      0.74      0.83      5158
           1       0.68      0.90      0.78      3144

    accuracy                           0.80      8302
   macro avg       0.80      0.82      0.80      8302
weighted avg       0.83      0.80      0.81      8302


              precision    recall  f1-score   support

           0       0.93      0.82      0.87      1147
           1       0.47      0.72      0.57       260

    accuracy                           0.80      1407
   macro avg       0.70      0.77      0.72      1407
weighted avg       0.84      0.80      0.81      1407



## Evaluating Churn Risk

In [151]:
swan1=swan.copy()

swan1['probability_churn']=rf.predict_proba(feature_eng(swan)[feature_cols])[:,1]
top_500_risk=swan1[swan1['Churn Label']=='No']\
        .sort_values(by=['probability_churn'],ascending=False)\
        [['CustomerID','probability_churn']]\
        .head(500)\
        .reset_index(drop=True)
top_500_risk


Unnamed: 0,CustomerID,probability_churn
0,7439-DKZTW,0.847534
1,1452-VOQCH,0.847534
2,7577-SWIFR,0.846510
3,7465-ZZRVX,0.845215
4,5542-TBBWB,0.845215
...,...,...
495,7693-QPEFS,0.678300
496,6968-GMKPR,0.677971
497,4238-JSSWH,0.677507
498,3737-XBQDD,0.676931


In [152]:
churn_risk=swan1[swan1['Churn Label']=='No']\
        [['CustomerID','probability_churn']]\
        .reset_index(drop=True)
churn_risk

Unnamed: 0,CustomerID,probability_churn
0,7590-VHVEG,0.671286
1,5575-GNVDE,0.205171
2,7795-CFOCW,0.158427
3,1452-KIOVK,0.356614
4,6713-OKOMC,0.508626
...,...,...
5158,2569-WGERO,0.057123
5159,6840-RESVB,0.079232
5160,2234-XADUH,0.173582
5161,4801-JZAZL,0.293452
