## Predicting Churn Rate in Telecom Industry

### Importing required Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# BaggingClassifier, AdaBoostClassifier, 
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

import warnings
warnings.filterwarnings("ignore")

### Reading the data

In [None]:
# Load the Telco churn CSV; the listed placeholder tokens are read as NaN.
missing_markers = [' ', '/', '?', '$']
tc = pd.read_csv('Telco-Customer-Churn.csv', na_values=missing_markers)

In [2]:
# Lift the column display limit so wide frames are shown without truncation.
pd.set_option('display.max_columns', None)

NameError: name 'pd' is not defined

In [None]:
# Show five randomly sampled rows as a quick sanity check of the load.
tc.sample(5)

In [3]:
# Show the rows whose InternetService value is 'No'.
tc[tc['InternetService']=='No']

NameError: name 'tc' is not defined

In [None]:
# Collapse 'No internet service' into plain 'No' for the internet add-on columns.
replace_cols = [
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies',
]
for column in replace_cols:
    tc[column] = tc[column].replace('No internet service', 'No')

In [None]:
# SeniorCitizen is stored as 0/1; relabel it No/Yes so it is treated as a
# categorical (object) column like the other flags.
senior_labels = {0: 'No', 1: 'Yes'}
tc['SeniorCitizen'] = tc['SeniorCitizen'].replace(senior_labels)

In [None]:
# Column dtypes and non-null counts after the recoding above.
tc.info()

### Exploratory Data Analysis

In [None]:
# customerID is a unique identifier that carries no predictive information,
# so it is removed from the frame in place.
tc.drop(columns=['customerID'], inplace=True)

In [None]:
# Re-check the schema now that customerID has been dropped.
tc.info()

In [None]:
# Print the number of distinct (non-null) values in every column.
for column, distinct in tc.nunique().items():
    print(column, ':', distinct)

In [4]:
# Summary statistics of the numeric columns.
tc.describe()

NameError: name 'tc' is not defined

In [None]:
# Rows whose TotalCharges is missing (NaN).
missing_total = tc['TotalCharges'].isnull()
df = tc.loc[missing_total]
df

In [None]:
# Subset of customers that have dependents; its TotalCharges means are the
# values used to impute the missing charges.
con = tc['Dependents'] == 'Yes'
imp_mean = tc.loc[con]
imp_mean.head()

In [None]:
# Mean TotalCharges per gender among customers with dependents.
# groupby sorts its keys, so the result is ordered Female, Male. Positional
# unpacking into (male, female) therefore swapped the two values; select each
# mean by its label instead.
gender_means = imp_mean['TotalCharges'].groupby(tc['gender']).mean()
male = gender_means.loc['Male']
female = gender_means.loc['Female']
(male, female)

In [None]:
# Impute missing TotalCharges for male customers with zero tenure using `male`.
con1 = (tc['gender'] == 'Male') & (tc['tenure'] == 0)
con2 = tc['TotalCharges'].isnull()
con3 = con1 & con2
# Write only the TotalCharges cell: a row-wide fillna would also overwrite a NaN
# in any other column of these rows with a charge amount.
tc.loc[con3, 'TotalCharges'] = male

In [None]:
# Impute missing TotalCharges for female customers with zero tenure using `female`.
con1 = (tc['gender'] == 'Female') & (tc['tenure'] == 0)
con2 = tc['TotalCharges'].isnull()
con3 = con1 & con2
# Write only the TotalCharges cell: a row-wide fillna would also overwrite a NaN
# in any other column of these rows with a charge amount.
tc.loc[con3, 'TotalCharges'] = female

In [None]:
# Check the non-null counts again after the TotalCharges imputation.
tc.info()

In [None]:
# Numeric summary after the TotalCharges imputation.
tc.describe()

In [None]:
# Keep only the object-typed (categorical) columns.
catcols = tc.select_dtypes(include='object')
catcols

In [6]:
# Categorical columns with more than two categories; these are checked
# against Churn with a chi-square test.
l = [name for name in catcols.columns if catcols[name].nunique() > 2]
print(l)

NameError: name 'catcols' is not defined

In [None]:
# These columns have more than two categories, so test each against Churn with
# chi2_contingency and keep the columns whose p-value is <= 0.05.
from scipy.stats import chi2_contingency

f = []
for column in l:
    contingency = pd.crosstab(catcols['Churn'], catcols[column])
    p_value = chi2_contingency(contingency)[1]
    if p_value <= 0.05:
        f.append(column)
print(f)

In [None]:
# Categorical columns with exactly two categories, to be tested against Churn.
# Churn is the target, so it is excluded by name; popping the last element
# only worked while Churn happened to be the final column.
k = [
    name for name in catcols.columns
    if catcols[name].nunique() == 2 and name != 'Churn'
]
print(k)

In [None]:
# Two-category columns are compared with a two-sample proportions z-test;
# columns whose p-value is <= 0.05 are kept.
from statsmodels.stats.proportion import proportions_ztest

g = []
for column in k:
    table = pd.crosstab(tc['Churn'], tc[column])
    # Counts come from the second crosstab row, totals from each column sum.
    # NOTE(review): presumably the second row is Churn == 'Yes' — confirm.
    counts = [table.iloc[1, 1], table.iloc[1, 0]]
    totals = [table.iloc[:, 1].sum(), table.iloc[:, 0].sum()]
    p_value = proportions_ztest(counts, totals)[1]
    if p_value <= 0.05:
        g.append(column)
print(g)

In [7]:
# Categorical columns that passed either the chi-square or the z-test (p <= 0.05).
f_list = [*f, *g]
print(f_list, '\n')
print('Total number of required columns:', len(f_list))

NameError: name 'f' is not defined

In [None]:
# Class balance of the target. The stray trailing comma was removed because it
# wrapped the result in a one-element tuple.
tc['Churn'].value_counts()

In [None]:
# Display the selected categorical feature names.
f_list

In [None]:
# Pie chart of the churn split. Labels come from the value_counts index so each
# wedge is always labelled with its own class; hard-coded labels could end up
# mismatched because value_counts orders by frequency.
churn_counts = tc['Churn'].value_counts()
plt.pie(churn_counts, explode=(0, 0.09), autopct='%1.1f%%',
        labels=churn_counts.index, shadow=True)
plt.savefig('Churn pct')
plt.show()

In [None]:
# One count plot per selected categorical feature, split by Churn, on a 4x4 grid.
fig, axes = plt.subplots(4, 4, figsize=(20, 20))
axes = axes.flatten()
for i, column in enumerate(f_list):
    # Pass the series as x=: current seaborn takes `data` as the first positional
    # argument, so the old positional call no longer plots the column.
    sns.countplot(x=tc[column], hue=tc['Churn'], ax=axes[i])
plt.tight_layout()
plt.savefig('Attribute wise comparison with Churn')
plt.show()

In [None]:
# One-hot encode the selected categorical columns and report the result's shape.
selected_categoricals = tc[f_list]
tc_dum = pd.get_dummies(selected_categoricals)
print(tc_dum.shape)

In [None]:
# Feature matrix = one-hot columns + numeric columns; target = Churn.
num = tc.select_dtypes(include=['int64', 'float'])
X = pd.concat([tc_dum, num], axis=1)
y = tc['Churn']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=7
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a default logistic regression and report train/test accuracy.
logreg = LogisticRegression()
logreg_result = logreg.fit(X_train, y_train)
train_score = logreg_result.score(X_train, y_train)
print(train_score)
print("Training set score: {:.3f}".format(train_score))
print("Test score: {:.3f}".format(logreg_result.score(X_test, y_test)))

In [None]:
# Display the fitted estimator returned by fit().
logreg_result

In [None]:
# Hard class predictions and per-class probabilities on the test split.
y_pred  = logreg.predict(X_test)
y_proba = logreg.predict_proba(X_test)
y_proba
# y_proba has one probability column per class; the ROC curve uses column 1 (y_proba[:, 1]).
# NOTE(review): presumably column 1 is the 'Yes' (churn) class, following logreg.classes_ order — confirm.

In [None]:
# sns.scatterplot(X_test,logreg.predict(X_test))

In [None]:
from sklearn.metrics import auc, roc_auc_score, roc_curve

# Encode the Yes/No labels as booleans, then compute the ROC points and the
# area under the curve for the logistic regression scores.
label_to_bool = {'Yes': True, 'No': False}
y_test1 = y_test.replace(label_to_bool)
y_train1 = y_train.replace(label_to_bool)
fpr, tpr, thresholds = roc_curve(y_test1, y_proba[:, 1])
areauc = auc(fpr, tpr)
areauc

In [None]:
# ROC curve of the logistic regression model.
plt.plot(fpr, tpr)
plt.title('ROC curve for Logistic Regression')
plt.xlabel('False positive rate (1-Specificity)')
plt.ylabel('True positive rate (Sensitivity)')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.grid(True)

In [None]:
from statsmodels.tools import add_constant
import statsmodels.api as sm

# Fit a statsmodels Logit to get a coefficient summary table.
# statsmodels needs a purely numeric design matrix: cast to float so boolean
# dummy columns do not turn the mixed frame into an object array, and cast the
# boolean target to 0/1.
X_train1 = sm.add_constant(X_train.astype(float))
logit_model = sm.Logit(y_train1.astype(int), X_train1)
result = logit_model.fit()
print(result.summary2())

In [None]:
# Recursive Feature Elimination
#  To find the best columns suitable for the analysis

In [None]:
from sklearn.feature_selection import RFE

logit = LogisticRegression()

# Keep the 10 best features. n_features_to_select is passed by keyword because
# recent scikit-learn releases no longer accept it positionally.
rfe = RFE(logit, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)

# support_ flags the selected columns; ranking_ is 1 for every selected column.
print(rfe.support_)
print(rfe.ranking_)

In [None]:
# Tabulate the RFE outcome per feature, then collect the names of the selected ones.
idc_rfe = pd.DataFrame({
    "rfe_support": rfe.support_,
    "columns": X_train.columns,
    "ranking": rfe.ranking_,
})
selected_rows = idc_rfe.loc[idc_rfe["rfe_support"]]
cols = selected_rows["columns"].tolist()
cols

In [None]:
# Restrict both splits to the columns that RFE marked as supported.
X_train_rfe, X_test_rfe = X_train[cols], X_test[cols]

### Random Forest 

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest on the RFE-selected features, with a fixed seed.
RF_model = RandomForestClassifier(random_state=10)
RF_results = RF_model.fit(X_train_rfe, y_train)

In [None]:
# Baseline forest predictions on the train and test splits.
y_pred_train = RF_model.predict(X_train_rfe)
y_pred_test = RF_model.predict(X_test_rfe)

In [None]:
# Per-class metrics and overall accuracy of the baseline forest on the training split.
train_report = classification_report(y_train, y_pred_train)
train_accuracy = accuracy_score(y_train, y_pred_train)
print(train_report)
print('Train Accuracy Score for Random Forest 1st attempt', train_accuracy)

In [None]:
# Per-class metrics and overall accuracy of the baseline forest on the test split.
test_report = classification_report(y_test, y_pred_test)
test_accuracy = accuracy_score(y_test, y_pred_test)
print(test_report)
print('Test Accuracy Score for Random Forest 1st attempt', test_accuracy)

In [None]:
from sklearn.model_selection import GridSearchCV

# Grid-search the forest size and the per-split feature budget with 2-fold CV.
RF_model = RandomForestClassifier(n_estimators=10, random_state=10)
# 'auto' was only an alias of 'sqrt' for classifiers and has been removed from
# scikit-learn, so compare 'sqrt' against 'log2' instead of a duplicate.
param_grid_rf = {
    'n_estimators': [11, 12, 13, 9],
    'max_features': ['sqrt', 'log2'],
}
clf = GridSearchCV(RF_model, param_grid_rf, cv=2)
clf.fit(X_train_rfe, y_train)

In [None]:
# Hyper-parameter combination that scored best in the grid search.
clf.best_params_

In [None]:
# Build the forest from the grid-search winner instead of a hard-coded
# n_estimators=9, so this model always matches clf.best_params_.
RF_best_model = RandomForestClassifier(random_state=10, **clf.best_params_)
RF_best_model.fit(X_train_rfe, y_train)

In [None]:
# Predictions from the grid search object on both splits.
y_train_pred, y_test_pred = clf.predict(X_train_rfe), clf.predict(X_test_rfe)
(y_train_pred, y_test_pred)

In [None]:

# Training-split metrics for the tuned random forest.
tuned_train_report = classification_report(y_train, y_train_pred)
tuned_train_accuracy = accuracy_score(y_train, y_train_pred)
print(tuned_train_report)
print('Train Accuracy Random Forest after Grid Search CV :', tuned_train_accuracy)

In [None]:
# Test-split metrics for the tuned random forest.
tuned_test_report = classification_report(y_test, y_test_pred)
tuned_test_accuracy = accuracy_score(y_test, y_test_pred)
print(tuned_test_report)
print('Test Accuracy Random Forest after Grid Search CV :', tuned_test_accuracy)

In [None]:
# Test-split labels (wrapped in a DataFrame) and class probabilities from RF_best_model.
rf_test_labels = RF_best_model.predict(X_test_rfe)
y_test_pred = pd.DataFrame(rf_test_labels)
y_proba_rf = RF_best_model.predict_proba(X_test_rfe)

In [None]:
# Boolean-encode the labels and compute the random forest ROC points.
label_to_bool = {'Yes': True, 'No': False}
y_test1 = y_test.replace(label_to_bool)
y_train1 = y_train.replace(label_to_bool)
rf_scores = y_proba_rf[:, 1]
fpr1, tpr1, thresholds = roc_curve(y_test1, rf_scores)
(fpr1, tpr1)

In [None]:
# ROC curve of the random forest (fpr1/tpr1). The title previously said
# "Logistic Regression", which mislabelled this figure.
plt.plot(fpr1, tpr1)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.title('ROC curve for Random Forest')
plt.xlabel('False positive rate (1-Specificity)')
plt.ylabel('True positive rate (Sensitivity)')
plt.grid(True)

### Bagging

In [None]:
# A trailing comma after the last imported name is a SyntaxError unless the
# list is parenthesized, so it has been removed.
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [None]:
# Bagging ensemble built on a seeded decision tree.
base_tree = DecisionTreeClassifier(random_state=1)
bag_model = BaggingClassifier(base_tree)
bag_model.fit(X_train_rfe, y_train)

In [None]:
# Test-split predictions and accuracy of the bagging model.
bag_pred = bag_model.predict(X_test_rfe)
accuracy_score(y_true=y_test, y_pred=bag_pred)

In [None]:
# Confusion matrix of the bagging predictions, drawn as an annotated heatmap.
bag_cm = confusion_matrix(y_test, bag_pred)
heatmap_options = dict(annot=True, fmt='d', cmap='Blues_r')
sns.heatmap(bag_cm, **heatmap_options)
plt.show()

In [None]:
# Per-class precision/recall/F1 report for the bagging predictions on the test split.
print(classification_report(y_test,bag_pred))

In [None]:
# ROC points of the bagging model from its column-1 probabilities.
y_pred_bag = bag_model.predict(X_test_rfe)
y_pred_proba_bag = bag_model.predict_proba(X_test_rfe)
bag_scores = y_pred_proba_bag[:, 1]
fpr3, tpr3, thresholds = roc_curve(y_test1, bag_scores)
(fpr3, tpr3)

In [None]:
# ROC curve of the bagging model (fpr3/tpr3). The title previously said
# "Logistic Regression", which mislabelled this figure.
plt.plot(fpr3, tpr3)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.title('ROC curve for Bagging')
plt.xlabel('False positive rate (1-Specificity)')
plt.ylabel('True positive rate (Sensitivity)')
plt.grid(True)

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# k-nearest-neighbours classifier with default settings, fitted on the RFE features.
knn = KNeighborsClassifier()
knn.fit(X=X_train_rfe, y=y_train)

In [None]:
# Predict here first: y_pred_knn was only defined in a later cell, so this cell
# raised NameError when the notebook ran top to bottom. Arguments are also
# given in the documented (y_true, y_pred) order.
y_pred_knn = knn.predict(X_test_rfe)
accuracy_score(y_test, y_pred_knn)

In [None]:
# ROC points of the KNN model from its column-1 probabilities.
y_pred_knn = knn.predict(X_test_rfe)
y_pred_proba_knn = knn.predict_proba(X_test_rfe)
knn_scores = y_pred_proba_knn[:, 1]
fpr2, tpr2, thresholds = roc_curve(y_test1, knn_scores)

In [None]:
# ROC curve of the KNN model (fpr2/tpr2). The title previously said
# "Logistic Regression", which mislabelled this figure.
plt.plot(fpr2, tpr2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.title('ROC curve for KNN')
plt.xlabel('False positive rate (1-Specificity)')
plt.ylabel('True positive rate (Sensitivity)')
plt.grid(True)

### Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with a small learning rate and a fixed seed.
gb_settings = {'learning_rate': 0.01, 'random_state': 1}
gb_model = GradientBoostingClassifier(**gb_settings)
gb_model.fit(X_train_rfe, y_train)

In [None]:
# Test-split predictions and accuracy of the gradient boosting model.
gb_pred = gb_model.predict(X_test_rfe)
accuracy_score(y_true=y_test, y_pred=gb_pred)

In [None]:
# ROC points of the gradient boosting model from its column-1 probabilities.
y_pred_gb = gb_model.predict(X_test_rfe)
y_pred_proba_gb = gb_model.predict_proba(X_test_rfe)
gb_scores = y_pred_proba_gb[:, 1]
fpr4, tpr4, thresholds = roc_curve(y_test1, gb_scores)

In [None]:
# Overlay the ROC curves of all five models on a single figure.
plt.figure(figsize=(10, 10))
roc_points = [
    (fpr, tpr),    # logistic regression
    (fpr1, tpr1),  # random forest
    (fpr2, tpr2),  # KNN
    (fpr3, tpr3),  # bagging
    (fpr4, tpr4),  # gradient boosting
]
for false_pos, true_pos in roc_points:
    plt.plot(false_pos, true_pos)
plt.grid()
plt.title('Algorithm Wise ROC Comparison')
plt.legend(['Logistic Regression','Random Forest','KNN','Bagging','Gradiant Boosting'])
plt.savefig('ROC Curves')

In [None]:
# (name, unfitted default estimator) pairs compared by cross-validation.
models = list({
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'KNN': KNeighborsClassifier(),
    'Bagging': BaggingClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
}.items())

In [None]:
# 10-fold cross-validated accuracy of every model in `models`.
# NOTE(review): the folds are drawn from the held-out split (X_test_rfe / y_test),
# so each model is refit on test data here — confirm this is intended.
seed = 7
results = []
names = []
from sklearn.model_selection import KFold, cross_val_score
# random_state only takes effect with shuffle=True; current scikit-learn raises
# ValueError when a seed is given without shuffling. The splitter is built once
# so every model is scored on the same folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
for name, model in models:
    cv_results = cross_val_score(model, X_test_rfe, y_test, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    msg = f"{name}: {cv_results.mean():f} ({cv_results.std():f})"
    print(msg)

In [None]:
# Box plot of the per-fold accuracies in `results`, one box per model.
fig, ax = plt.subplots(figsize=(12, 6))
ax.boxplot(results)
ax.set_title('Algorithm Wise Test Accuracy Comparison')
ax.set_xticklabels(names, rotation=45)
ax.set_ylabel('')
fig.savefig('Algorithm Wise Test Accuracy Comparison')
plt.show()

In [None]:
# 10-fold cross-validated accuracy of every model in `models` on the training split.
seed_train = 7
results_train = []
names_train = []
from sklearn.model_selection import KFold, cross_val_score
# random_state only takes effect with shuffle=True; current scikit-learn raises
# ValueError when a seed is given without shuffling. The splitter is built once
# so every model is scored on the same folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed_train)
for name, model in models:
    cv_results = cross_val_score(model, X_train_rfe, y_train, cv=kfold, scoring='accuracy')
    results_train.append(cv_results)
    names_train.append(name)
    msg = f"{name}: {cv_results.mean():f} ({cv_results.std():f})"
    print(msg)

In [None]:
# Box plot of the per-fold accuracies in `results_train`, one box per model.
# plt.subplots already creates the figure; the extra plt.figure call that used
# to precede it only produced a second, empty figure.
fig, ax = plt.subplots(figsize=(12, 6))
plt.boxplot(results_train)
plt.title('Algorithm Wise Train Accuracy Comparison')
ax.set_xticklabels(names_train, rotation=45)
plt.ylabel('')
plt.savefig('Algorithm Wise Train Accuracy Comparison')
plt.show()

In [8]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [9]:
# Variance inflation factor of every training feature.
vif = pd.DataFrame()
# Cast to float first: with boolean dummy columns mixed in, X_train.values is
# an object array, which the VIF regression cannot work with.
vif_input = X_train.astype(float)
vif["VIF Factor"] = [
    variance_inflation_factor(vif_input.values, i)
    for i in range(vif_input.shape[1])
]
# Record which column each factor belongs to so the table is readable.
vif["features"] = vif_input.columns

NameError: name 'pd' is not defined

In [None]:
# Display the raw array underlying the training features.
X_train.values

In [None]:
# Display the VIF table.
vif