# Modelling Using Random Forest

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
tc_data = pd.read_csv("/home2/MyNotes/Python/DataSets/CustomerChurn/telecomChurn.csv",
                             sep=",",
                             header=0)
tc_data.head(5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,...,TechSupportYes,StreamingTVNo internet service,StreamingTVYes,StreamingMoviesNo internet service,StreamingMoviesYes,ContractOne year,ContractTwo year,PaymentMethodCredit card (automatic),PaymentMethodElectronic check,PaymentMethodMailed check
0,0,0,1,1,45,1,1,81.0,3533.6,No,...,1,0,1,0,1,0,1,0,0,1
1,1,0,0,0,4,1,1,81.0,340.85,Yes,...,0,0,1,0,0,0,0,0,1,0
2,0,0,1,1,18,1,1,20.25,401.95,No,...,0,1,0,1,0,0,0,0,0,1
3,0,0,1,0,53,1,1,93.45,4872.2,No,...,1,0,1,0,1,1,0,0,1,0
4,0,1,1,0,3,1,1,74.6,239.05,No,...,0,0,0,0,0,0,0,0,0,0


In [2]:
# Looks like these are new customers as the tenure is zero. We can update the MonthlyCharges into TotalCharges
tc_data["TotalCharges"] = tc_data["TotalCharges"].fillna(value=tc_data["MonthlyCharges"])

In [3]:
tc_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4930 entries, 0 to 4929
Data columns (total 31 columns):
gender                                  4930 non-null int64
SeniorCitizen                           4930 non-null int64
Partner                                 4930 non-null int64
Dependents                              4930 non-null int64
tenure                                  4930 non-null int64
PhoneService                            4930 non-null int64
PaperlessBilling                        4930 non-null int64
MonthlyCharges                          4930 non-null float64
TotalCharges                            4930 non-null float64
Churn                                   4930 non-null object
MultipleLinesNo phone service           4930 non-null int64
MultipleLinesYes                        4930 non-null int64
InternetServiceFiber optic              4930 non-null int64
InternetServiceNo                       4930 non-null int64
OnlineSecurityNo internet service       4930 non

In [4]:
# Looks like these are new customers as the tenure is zero. We can update the MonthlyCharges into TotalCharges
tc_data["TotalCharges"] = tc_data["TotalCharges"].fillna(value=tc_data["MonthlyCharges"])

In [5]:
tc_data.head(10)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,...,TechSupportYes,StreamingTVNo internet service,StreamingTVYes,StreamingMoviesNo internet service,StreamingMoviesYes,ContractOne year,ContractTwo year,PaymentMethodCredit card (automatic),PaymentMethodElectronic check,PaymentMethodMailed check
0,0,0,1,1,45,1,1,81.0,3533.6,No,...,1,0,1,0,1,0,1,0,0,1
1,1,0,0,0,4,1,1,81.0,340.85,Yes,...,0,0,1,0,0,0,0,0,1,0
2,0,0,1,1,18,1,1,20.25,401.95,No,...,0,1,0,1,0,0,0,0,0,1
3,0,0,1,0,53,1,1,93.45,4872.2,No,...,1,0,1,0,1,1,0,0,1,0
4,0,1,1,0,3,1,1,74.6,239.05,No,...,0,0,0,0,0,0,0,0,0,0
5,1,0,1,1,28,0,0,50.8,1386.8,No,...,1,0,1,0,0,1,0,0,0,1
6,0,0,0,1,8,1,0,19.95,170.9,No,...,0,1,0,1,0,0,0,0,0,1
7,0,0,1,0,47,1,0,19.65,921.55,No,...,0,1,0,1,0,0,1,0,0,0
8,0,0,0,0,5,0,1,50.95,229.4,No,...,0,0,1,0,1,0,0,1,0,0
9,0,0,1,1,21,1,1,74.05,1565.7,Yes,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Treating categorical variables
Churn_f = pd.get_dummies(tc_data["Churn"],drop_first=True,prefix="Churn")
tc_data_clean = pd.concat([tc_data,Churn_f],axis=1)

In [8]:
tc_data_features = tc_data_clean.drop("Churn",axis=1)
tc_data_target = tc_data_clean["Churn_Yes"]

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(tc_data_features,tc_data_target, test_size=0.30, random_state=101)

In [10]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)

  from numpy.core.umath_tests import inner1d


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [12]:
predictions = rfc.predict(X_test)

In [13]:
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(y_test,predictions))
print(confusion_matrix(y_test,predictions))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00      1096
          1       1.00      1.00      1.00       383

avg / total       1.00      1.00      1.00      1479

[[1096    0]
 [   0  383]]


In [23]:
# Using grid search
# We can have various values of C, max_iter,tol, solver
from sklearn.model_selection import GridSearchCV
param_grid = {'n_estimators':[10,50,100,250],'min_samples_leaf':[1,2,3,4],'min_samples_split':[2,3,4]} 

In [24]:
grid = GridSearchCV(RandomForestClassifier(),param_grid,refit=True,verbose=3)

In [25]:
grid.fit(X_train,y_train)

Fitting 3 folds for each of 48 candidates, totalling 144 fits
[CV] min_samples_leaf=1, min_samples_split=2, n_estimators=10 ........
[CV]  min_samples_leaf=1, min_samples_split=2, n_estimators=10, score=0.9930495221546481, total=   0.0s
[CV] min_samples_leaf=1, min_samples_split=2, n_estimators=10 ........
[CV]  min_samples_leaf=1, min_samples_split=2, n_estimators=10, score=0.996524761077324, total=   0.0s
[CV] min_samples_leaf=1, min_samples_split=2, n_estimators=10 ........
[CV]  min_samples_leaf=1, min_samples_split=2, n_estimators=10, score=1.0, total=   0.0s
[CV] min_samples_leaf=1, min_samples_split=2, n_estimators=50 ........


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s


[CV]  min_samples_leaf=1, min_samples_split=2, n_estimators=50, score=1.0, total=   0.1s
[CV] min_samples_leaf=1, min_samples_split=2, n_estimators=50 ........
[CV]  min_samples_leaf=1, min_samples_split=2, n_estimators=50, score=1.0, total=   0.1s
[CV] min_samples_leaf=1, min_samples_split=2, n_estimators=50 ........
[CV]  min_samples_leaf=1, min_samples_split=2, n_estimators=50, score=1.0, total=   0.1s
[CV] min_samples_leaf=1, min_samples_split=2, n_estimators=100 .......
[CV]  min_samples_leaf=1, min_samples_split=2, n_estimators=100, score=1.0, total=   0.2s
[CV] min_samples_leaf=1, min_samples_split=2, n_estimators=100 .......
[CV]  min_samples_leaf=1, min_samples_split=2, n_estimators=100, score=1.0, total=   0.2s
[CV] min_samples_leaf=1, min_samples_split=2, n_estimators=100 .......
[CV]  min_samples_leaf=1, min_samples_split=2, n_estimators=100, score=1.0, total=   0.2s
[CV] min_samples_leaf=1, min_samples_split=2, n_estimators=250 .......
[CV]  min_samples_leaf=1, min_samples

[CV]  min_samples_leaf=2, min_samples_split=3, n_estimators=50, score=0.9973890339425587, total=   0.1s
[CV] min_samples_leaf=2, min_samples_split=3, n_estimators=100 .......
[CV]  min_samples_leaf=2, min_samples_split=3, n_estimators=100, score=1.0, total=   0.2s
[CV] min_samples_leaf=2, min_samples_split=3, n_estimators=100 .......
[CV]  min_samples_leaf=2, min_samples_split=3, n_estimators=100, score=1.0, total=   0.2s
[CV] min_samples_leaf=2, min_samples_split=3, n_estimators=100 .......
[CV]  min_samples_leaf=2, min_samples_split=3, n_estimators=100, score=0.9973890339425587, total=   0.2s
[CV] min_samples_leaf=2, min_samples_split=3, n_estimators=250 .......
[CV]  min_samples_leaf=2, min_samples_split=3, n_estimators=250, score=1.0, total=   0.5s
[CV] min_samples_leaf=2, min_samples_split=3, n_estimators=250 .......
[CV]  min_samples_leaf=2, min_samples_split=3, n_estimators=250, score=1.0, total=   0.5s
[CV] min_samples_leaf=2, min_samples_split=3, n_estimators=250 .......
[CV] 

[CV]  min_samples_leaf=3, min_samples_split=4, n_estimators=100, score=1.0, total=   0.2s
[CV] min_samples_leaf=3, min_samples_split=4, n_estimators=100 .......
[CV]  min_samples_leaf=3, min_samples_split=4, n_estimators=100, score=1.0, total=   0.2s
[CV] min_samples_leaf=3, min_samples_split=4, n_estimators=100 .......
[CV]  min_samples_leaf=3, min_samples_split=4, n_estimators=100, score=1.0, total=   0.2s
[CV] min_samples_leaf=3, min_samples_split=4, n_estimators=250 .......
[CV]  min_samples_leaf=3, min_samples_split=4, n_estimators=250, score=0.9991311902693311, total=   0.5s
[CV] min_samples_leaf=3, min_samples_split=4, n_estimators=250 .......
[CV]  min_samples_leaf=3, min_samples_split=4, n_estimators=250, score=1.0, total=   0.5s
[CV] min_samples_leaf=3, min_samples_split=4, n_estimators=250 .......
[CV]  min_samples_leaf=3, min_samples_split=4, n_estimators=250, score=0.999129677980853, total=   0.5s
[CV] min_samples_leaf=4, min_samples_split=2, n_estimators=10 ........
[CV] 

[Parallel(n_jobs=1)]: Done 144 out of 144 | elapsed:   35.3s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [10, 50, 100, 250], 'min_samples_leaf': [1, 2, 3, 4], 'min_samples_split': [2, 3, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [26]:
grid.best_params_

{'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}

In [27]:
grid.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [28]:
predictions = grid.predict(X_test)

In [29]:
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(predictions,y_test))
print(confusion_matrix(predictions,y_test))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00      1096
          1       1.00      1.00      1.00       383

avg / total       1.00      1.00      1.00      1479

[[1096    0]
 [   0  383]]
