## Swan Consulting Feature engineering and Model Creation 

In this workbook, we will explore and engineer the features from our Swan Consulting Customer Dataset, then move on to create a DecisionTreeClassifier and evaluate its performance.

In [5]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier 
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score

In [6]:
def apr(y_pred, y_real):
    """Print accuracy, precision, recall and F1 for a set of predictions.

    Note the argument order: predictions first, ground truth second. The
    scikit-learn scorers are called as ``scorer(y_real, y_pred)`` with their
    default settings. Nothing is returned; the four scores are printed.
    """
    scorers = (
        ("Accuracy", metrics.accuracy_score),
        ("Precision", metrics.precision_score),
        ("Recall", metrics.recall_score),
        ("F1", metrics.f1_score),
    )
    # Compute every score before printing anything, so a failing scorer
    # leaves no partial report behind.
    results = [(label, scorer(y_real, y_pred)) for label, scorer in scorers]
    for label, value in results:
        print(f"{label}:{value}")

## Data Import and Exploration

In [7]:
# Location of the raw customer workbook. This is an absolute, machine-specific
# path: edit it to point at your own copy of the file before running.
DATA_PATH = "/Users/zachgolant/Desktop/DF/1 - Project Data.xlsx"

# Keep the raw import untouched in `swan_og` and do all further work on a copy.
swan_og = pd.read_excel(DATA_PATH)
# Copy from `swan_og`: `swan` does not exist yet at this point, so copying
# `swan` itself would raise NameError on a fresh kernel.
swan = swan_og.copy()
swan.head()

FileNotFoundError: [Errno 2] No such file or directory: '/Users/zachgolant/Desktop/DF/1 - Project Data.xlsx'

In [None]:
# Summarise the working frame: column names, dtypes and non-null counts.
swan.info()

In [None]:
# Inspect every column from 'Tenure Months' through 'Total Charges'
# (label-based slices with .loc include both end points).
swan.loc[:,'Tenure Months':'Total Charges']

## Feature Engineering

In [None]:
def yes_no(column):
    """Encode one cell value as a binary flag: 1 for exactly 'Yes', 0 for anything else."""
    return 1 if column == 'Yes' else 0
        
# Churn reasons grouped into broader categories; consumed by churn_reason().
products = [
    'Product dissatisfaction',
    'Network reliability',
    'Lack of self-service on Website',
    'Limited range of services',
    'Long distance charges',
    'Lack of affordable download/upload speed',
]
swan_services = [
    'Attitude of support person',
    'Attitude of service provider',
    'Price too high',
    'Service dissatisfaction',
    'Extra data charges',
    'Poor expertise of phone support',
    'Poor expertise of online support',
]
competitor_services = [
    'Competitor offered higher download speeds',
    'Competitor offered more data',
    'Competitor made better offer',
    'Competitor had better devices',
]


def churn_reason(reason):
    """Map a detailed churn reason onto one of four broad categories.

    The reason is looked up in `products`, `swan_services` and
    `competitor_services`, in that order; the first list that contains it
    decides the category. Anything found in none of them is 'Other'.
    """
    categories = (
        (products, 'Issue with products'),
        (swan_services, 'Issue with Swan Services'),
        (competitor_services, 'Competitor offered better services'),
    )
    for members, label in categories:
        if reason in members:
            return label
    return 'Other'

# Some customers are in the database but never actually used the product
# (they also have Tenure=0); their 'Total Charges' is a single blank string.
# Drop those rows. The .copy() makes `swan` an independent frame so the
# column assignment below does not trigger SettingWithCopyWarning.
swan = swan[swan['Total Charges'] != ' '].copy()
# With the blank strings gone the column can be stored as a real numeric dtype
# instead of object. Re-running this cell is harmless.
swan['Total Charges'] = pd.to_numeric(swan['Total Charges'])

In [None]:
# List the columns of the engineered frame.
# NOTE(review): feature_eng is defined in the cell below this one, so on a
# fresh kernel (Restart & Run All) this cell raises NameError.
feature_eng(swan).columns

In [None]:
def feature_eng(x):
    """Return a model-ready copy of the customer frame ``x``.

    The input frame is not modified. On the copy:

    * each Yes/No column becomes an integer flag (1 for exactly 'Yes',
      0 for any other value, including missing values);
    * 'Contract', 'Payment Method' and 'Internet Service' are one-hot
      encoded into integer columns prefixed 'Contract_', 'Payment_' and
      'Internet_' (the original three columns are dropped);
    * 'Gender' is mapped to 0 for 'Male' and 1 for 'Female'; any other
      value becomes NaN.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame containing every column named above.

    Returns
    -------
    pandas.DataFrame
        The transformed copy.
    """
    df = x.copy()

    yes_no_columns = ['Senior Citizen', 'Partner', 'Dependents', 'Phone Service', 'Multiple Lines',
                      'Online Security', 'Online Backup', 'Device Protection',
                      'Tech Support', 'Streaming TV', 'Streaming Movies', 'Paperless Billing']
    # Vectorised comparison instead of a per-element .apply: same 1/0 result,
    # and no dependency on a helper defined in another cell.
    for column in yes_no_columns:
        df[column] = df[column].eq('Yes').astype(int)

    encoding_columns = ['Contract', 'Payment Method', 'Internet Service']
    encoding_prefix = ['Contract', 'Payment', 'Internet']
    df = pd.get_dummies(data=df,
                        columns=encoding_columns,
                        prefix=encoding_prefix,
                        dtype=int)

    df['Gender'] = df['Gender'].map({'Male': 0, 'Female': 1})

    return df

In [None]:
# Check dtypes and non-null counts of the engineered frame.
feature_eng(swan).info()

In [None]:
# NOTE(review): X_train is created by the train/test split in a later cell,
# so this cell raises NameError on a fresh kernel (Restart & Run All).
X_train.columns

In [None]:
# Pair each feature name with the fitted forest's importance score and list
# them from most to least important (column 0 = name, column 1 = importance).
# NOTE(review): feature_cols and rf are both defined in later cells, so this
# cell raises NameError on a fresh kernel (Restart & Run All).
pd.DataFrame(list(zip(feature_cols, list(rf.feature_importances_)))).sort_values(by=[1], ascending= False)

In [None]:
# Columns fed to the models, in the exact order the models see them.
feature_cols = [
    'Gender',
    'Senior Citizen',
    'Partner',
    'Dependents',
    'Tenure Months',
    'Phone Service',
    'Multiple Lines',
    'Online Security',
    'Online Backup',
    'Device Protection',
    'Tech Support',
    'Streaming TV',
    'Streaming Movies',
    'Paperless Billing',
    'Monthly Charges',
    'Total Charges',
    # One-hot columns created by feature_eng (prefixes Contract_ / Payment_ / Internet_).
    'Contract_Month-to-month',
    'Contract_One year',
    'Contract_Two year',
    'Payment_Bank transfer (automatic)',
    'Payment_Credit card (automatic)',
    'Payment_Electronic check',
    'Payment_Mailed check',
    'Internet_DSL',
    'Internet_Fiber optic',
    'Internet_No',
]
target = 'Churn Value'

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    feature_eng(swan)[feature_cols],
    swan[target],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Confirm the training features' dtypes and that none contain missing values.
X_train.info()

## Model Fit and Evaluation

We are going to compare two tree-based ensemble models for predicting churn risk: a Random Forest Classifier and an Extra Trees Classifier. (Logistic Regression is not fitted in this notebook.)

In [None]:
# Baseline random forest. random_state is fixed so the scores below are
# reproducible across kernel restarts (the train/test split is already seeded).
rf = RandomForestClassifier(n_estimators=150, max_depth=5, min_samples_split=2, random_state=42)
rf.fit(X_train, y_train)

# Score on the training data first, then on the held-out data; the gap
# between the two reports indicates how much the model overfits.
print("Random Forest - train")
rf_train_pred = rf.predict(X_train)
apr(rf_train_pred, y_train)

print("Random Forest - test")
rf_test_pred = rf.predict(X_test)
apr(rf_test_pred, y_test)

In [None]:
# Baseline extra-trees model. random_state is fixed so the scores below are
# reproducible across kernel restarts (the train/test split is already seeded).
et = ExtraTreesClassifier(n_estimators=150, max_depth=5, min_samples_split=3, random_state=42)
et.fit(X_train, y_train)

# Score on the training data first, then on the held-out data.
print("Extra Trees - train")
et_train_pred = et.predict(X_train)
apr(et_train_pred, y_train)

print("Extra Trees - test")
et_pred = et.predict(X_test)
apr(et_pred, y_test)

As we can see, the best baseline model seems to be the Random Forest. Now we can move forward and optimise the model by tuning its hyperparameters and selecting variables.

## Grid Search Optimisation

In [None]:
# Estimator to tune. random_state is fixed so that repeated searches select
# the same best parameters instead of varying with the forest's randomness.
rf = RandomForestClassifier(n_estimators=50, class_weight=None, random_state=42)

# 5 x 5 grid over tree depth and the minimum node size required to split.
rf_params = {
    'max_depth': [2, 4, 6, 8, 10],
    'min_samples_split': [2, 4, 6, 8, 10],
}

# 5-fold cross-validation on the training data only. No `scoring` is given,
# so candidates are ranked by the estimator's own score (accuracy).
gs = GridSearchCV(rf, param_grid=rf_params, cv=5, verbose=1)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

In [None]:
# Tuned random forest. random_state is fixed so the reported scores, and the
# churn probabilities derived from this model later on, are reproducible.
# NOTE(review): max_depth / min_samples_split are hard-coded rather than read
# from gs.best_params_ -- presumably copied from an earlier grid-search run;
# confirm they still match the current search result.
rf = RandomForestClassifier(n_estimators=250,
                            max_depth=6,
                            min_samples_split=6,
                            class_weight=None,
                            random_state=42)
rf.fit(X_train, y_train)

# Score on the training data first, then on the held-out data.
print("Tuned Random Forest - train")
rf_train_pred = rf.predict(X_train)
apr(rf_train_pred, y_train)

print("Tuned Random Forest - test")
rf_test_pred = rf.predict(X_test)
apr(rf_test_pred, y_test)

In [None]:
# Feature importances of the fitted forest, least important first
# (column 0 = feature name, column 1 = importance).
(
    pd.DataFrame(list(zip(feature_cols, rf.feature_importances_)))
    .sort_values(by=1)
)

In [None]:
# NOTE(review): swan1 is created in the next cell, so this cell raises
# NameError on a fresh kernel (Restart & Run All).
swan1.columns

## Evaluating Churn Risk

In [None]:
# Score every customer with the fitted forest: column 1 of predict_proba is
# the probability of the positive (churn) class. `assign` returns a new frame,
# so `swan` itself is left untouched.
swan1 = swan.assign(
    probability_churn=rf.predict_proba(feature_eng(swan)[feature_cols])[:, 1]
)

# Among customers who have not churned, keep the 500 with the highest
# predicted churn probability.
top_500_risk = (
    swan1.loc[swan1['Churn Label'] == 'No', ['CustomerID', 'probability_churn']]
    .sort_values(by='probability_churn', ascending=False)
    .head(500)
    .reset_index(drop=True)
)
top_500_risk


In [None]:
# Predicted churn probability for every customer, keyed by CustomerID.
churn_risk = swan1.loc[:, ['CustomerID', 'probability_churn']]
churn_risk