In [None]:
#Importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import scale
%matplotlib inline
# Supress Warnings

import warnings
warnings.filterwarnings('ignore')

In [None]:
# read data
churn = pd.read_csv("telecom_churn_data.csv")

In [None]:
churn.shape

In [None]:
churn.info()

In [None]:
churn.describe()

In [None]:

#list of columns
pd.set_option('display.max_columns', 500)
pd.DataFrame(churn.columns)


In [None]:
len(churn.mobile_number.unique())

In [None]:
# feature type summary
churn.info(verbose=1)

In [None]:
# look at data statistics
churn.describe(include='all')

###### Data Cleaning

In [None]:
pd.set_option("display.max_rows", None)
(churn.isna().sum()/churn.shape[0])*100



In [None]:
pd.set_option("display.max_rows", None)
(churn.isna().sum()/churn.shape[0])*100

In [None]:
# Split the columns into date-like ones (name contains 'date') and the rest,
# parse the former as datetimes and zero-fill missing values in the latter.
all_columns = churn.columns
date_columns = []
numeric_columns = []
for column_name in all_columns:
    if 'date' in column_name:
        date_columns.append(column_name)
    else:
        numeric_columns.append(column_name)

# Date strings are expected in month/day/year form
for date_col in date_columns:
    churn[date_col] = pd.to_datetime(churn[date_col], format='%m/%d/%Y')

churn[numeric_columns] = churn[numeric_columns].fillna(0)

In [None]:
# Keep only the columns that are populated in at least 60% of the rows.
# `how` and `thresh` are mutually exclusive in recent pandas (passing both raises
# TypeError), and `thresh` is documented as an integer, so only an integer
# threshold is passed. Rounding up keeps the original ">= 0.6 * rows" cut-off.
min_non_null = int(np.ceil(churn.shape[0] * 0.6))
churn = churn.dropna(thresh=min_non_null, axis=1).reset_index(drop=True)

In [None]:
(churn.isna().sum()/churn.shape[0])*100

##### A few columns have more than 75 percent missing values, so we can drop those columns

In [None]:
churn.shape

#### date_of_last_rech_data_6, date_of_last_rech_data_7, date_of_last_rech_data_8 , date_of_last_rech_data_9 removed by using this iteration

##### Impute with the actual month-end date: last_date_of_month_7, last_date_of_month_8, last_date_of_month_9

In [None]:
### Fill missing month-end dates with the real last day of each month
from datetime import date,datetime

month_end_fill = {
    'last_date_of_month_7': '7/31/2014',
    'last_date_of_month_8': '8/31/2014',
    'last_date_of_month_9': '9/30/2014',
}
for month_col, month_end in month_end_fill.items():
    churn[month_col] = churn[month_col].fillna(month_end)

In [None]:
## Impute each last-recharge date column with its OWN most frequent value.
## (Month 8 was previously filled with the mode of month 6, which put June
## dates into an August column.)
for month in (6, 7, 8, 9):
    rech_col = 'date_of_last_rech_{}'.format(month)
    churn[rech_col] = churn[rech_col].fillna(churn[rech_col].mode()[0])


In [None]:
(churn.isna().sum()/churn.shape[0])*100

In [None]:
churn.shape

### 2) Derive New Features

In [None]:
# Let us first extract list of columns containing recharge amount
amt_recharge_columns =  churn.columns[churn.columns.str.contains('rech_amt|rech_data')]
print(amt_recharge_columns)

In [None]:
pd.DataFrame(churn.columns)

In [None]:
# look at initial rows of the data
churn.head(10)

In [None]:
# Total data-recharge amount per month = average data recharge amount x number
# of data recharges; used below to identify high-value customers.
for month in (6, 7, 8):
    avg_amount = churn['av_rech_amt_data_{}'.format(month)]
    recharge_count = churn['total_rech_data_{}'.format(month)]
    churn['total_rech_amt_data_{}'.format(month)] = avg_amount * recharge_count

In [None]:
churn['total_avg_rech_amnt_6_7_GPhase'] = (churn.total_rech_amt_6 + churn.total_rech_amt_data_6 \
                                               + churn.total_rech_amt_7+ churn.total_rech_amt_data_7)/2

In [None]:
# Threshold: 70th percentile of the good-phase (months 6 and 7) average recharge amount
high_value_filter = churn.total_avg_rech_amnt_6_7_GPhase.quantile(0.7)

print('70 percentile of 6th and 7th months avg recharge amount: '+str(high_value_filter))

# .copy() makes the high-value subset an independent frame: later cells add and
# drop columns on it, which is ambiguous (SettingWithCopyWarning) on a slice of `churn`.
telecom_df_high_val_cust = churn[churn.total_avg_rech_amnt_6_7_GPhase > high_value_filter].copy()
print('Dataframe Shape after Filtering High Value Customers: ' + str(telecom_df_high_val_cust.shape))

###### 3) Tag churners and remove attributes of the churn phase

In [None]:
# Churn flag: 1 when month-9 incoming + outgoing minutes and 2G + 3G volume sum to zero, else 0
month9_usage = (telecom_df_high_val_cust['total_ic_mou_9']
                + telecom_df_high_val_cust['total_og_mou_9']
                + telecom_df_high_val_cust['vol_2g_mb_9']
                + telecom_df_high_val_cust['vol_3g_mb_9'])
telecom_df_high_val_cust['churn'] = (month9_usage == 0).astype('int64')

In [None]:
# let us check what's the % of churned customers
100*telecom_df_high_val_cust.churn.sum()/len(telecom_df_high_val_cust)

In [None]:
telecom_df_high_val_cust.churn.value_counts()

##### After tagging churners, remove all the attributes corresponding to the churn phase (all attributes having ‘ _9’, etc. in their names)

In [None]:
# Churn-phase columns: every column whose name contains '_9'.
# NOTE(review): a churn-month column named without the '_9' marker would not be
# matched here - confirm that none exist in the dataset.
churn_month_columns =  telecom_df_high_val_cust.columns[telecom_df_high_val_cust.columns.str.contains('_9')]

In [None]:
# Remove the churn-phase columns now that the churn label has been derived
telecom_df_high_val_cust = telecom_df_high_val_cust.drop(columns=churn_month_columns)

## EDA

In [None]:
print(any(telecom_df_high_val_cust['mobile_number'].duplicated()))

In [None]:

# Non-date columns only, for the outlier check on continuous variables
num_df = telecom_df_high_val_cust.loc[:, ~telecom_df_high_val_cust.columns.isin(date_columns)]

In [None]:
# Checking outliers at 25%,50%,75%,90%,95% and 99%
num_df.describe(percentiles=[0.02,.25,.5,.75,.90,.95,.99])

###### We can see there are outliers in several columns, and we can treat them by capping the columns. We can also remove columns like circle_id, loc_og_t2o_mou, std_og_t2o_mou, loc_ic_t2o_mou, std_ic_t2o_mou_6, std_ic_t2o_mou_7, std_ic_t2o_mou_8

In [None]:

# Per-column 0.1% and 99.9% quantiles of the non-date columns
non_date_mask = ~telecom_df_high_val_cust.columns.isin(date_columns)
filt_df = telecom_df_high_val_cust.loc[:, non_date_mask]
low, high = .001, .999
quant_df = filt_df.quantile([low, high])
quant_df

In [None]:
# For every column keep only the values inside its [low, high] quantile range.
# Each column comes back shorter; when apply() reassembles the frame the removed
# positions are realigned as NaN. This trims values rather than capping them, so
# a later dropna() removes every row holding at least one out-of-range value.
filt_df = filt_df.apply(lambda x: x[(x>=quant_df.loc[low,x.name]) & 
                                    (x <= quant_df.loc[high,x.name])], axis=0)

In [None]:
# Put the date columns back next to the outlier-filtered non-date columns
date_part = telecom_df_high_val_cust.loc[:, telecom_df_high_val_cust.columns.isin(date_columns)]
telecom_df_high_val_cust = pd.concat([date_part, filt_df], axis=1)

In [None]:

# Remove every row that still contains a missing value, then show the new size
telecom_df_high_val_cust = telecom_df_high_val_cust.dropna()
telecom_df_high_val_cust.shape

In [None]:
telecom_df_high_val_cust.churn.value_counts()


In [None]:
# let us check what's the % of churned customers
100*telecom_df_high_val_cust.churn.sum()/len(telecom_df_high_val_cust)

In [None]:

# Let's see the correlation matrix.
# The frame still holds datetime columns at this point; recent pandas no longer
# drops non-numeric columns silently in corr(), so select the numeric ones explicitly.

telecom_df_high_val_cust.select_dtypes(include='number').corr()

###### We can see here that there are variables that are highly correlated. PCA will take care of this collinearity

In [None]:
## Removing variables that are of no use, since they carry almost no variance (a few were mentioned above
## in describe, like circle_id, loc_og_t2o_mou, std_og_t2o_mou, loc_ic_t2o_mou, std_ic_t2o_mou_6,
## std_ic_t2o_mou_7, std_ic_t2o_mou_8), along with the last-date-of-month columns.
## `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.

telecom_df_high_val_cust = telecom_df_high_val_cust.drop(columns=[
    'circle_id', 'loc_og_t2o_mou', 'std_og_t2o_mou', 'loc_ic_t2o_mou',
    'std_ic_t2o_mou_6', 'std_ic_t2o_mou_7', 'std_ic_t2o_mou_8',
    'last_date_of_month_6', 'last_date_of_month_7', 'last_date_of_month_8',
])


In [None]:
# create box plot for  6th, 7th and 8th month
def plot_box_chart(attribute):
    """Draw churn vs. non-churn box plots of one attribute for months 6, 7 and 8.

    Parameters
    ----------
    attribute : str
        Column-name prefix. The columns ``<attribute>_6``, ``<attribute>_7`` and
        ``<attribute>_8`` of ``telecom_df_high_val_cust`` are plotted side by side,
        grouped by the ``churn`` column, with outlier points hidden.
    """
    plt.figure(figsize=(20,16))
    df = telecom_df_high_val_cust
    for position, month_suffix in enumerate(("_6", "_7", "_8"), start=1):
        plt.subplot(2,3,position)
        sns.boxplot(data=df, y=attribute+month_suffix, x="churn", hue="churn",
                    showfliers=False, palette=("plasma"))
    # Must be inside the function: at module level it ran once when the cell was
    # defined and never when a chart was actually drawn.
    plt.show()

In [None]:
recharge_amnt_columns =  telecom_df_high_val_cust.columns[telecom_df_high_val_cust.columns.str.contains('rech_amt')]
recharge_amnt_columns.tolist()

In [None]:
# Ploting for total recharge amount:
plot_box_chart('total_rech_amt')

###### We can see a drop in the total recharge amount for churned customers in the 8th Month (Action Phase).



In [None]:
# Ploting for total recharge amount for data:
plot_box_chart('total_rech_amt_data')

###### We can see that there is a huge drop in total recharge amount for data in the 8th month (action phase) for churned customers.

In [None]:
# Ploting for maximum recharge amount for data:
plot_box_chart('max_rech_amt')

###### We can see that there is a huge drop in maximum recharge amount in the 8th month (action phase) for churned customers

In [None]:
# Ploting for Total recharge for Number:
plot_box_chart('total_rech_num')

###### We can see that there is a huge drop in total recharge number also in the 8th month (action phase) for churned customers.

###### As expected, recharge-related activity went down in the 8th month for churned customers. We may expect a similar pattern in other recharge-related variables too


###### 2G and 3G Data related Attributes

In [None]:
usge_2g_and_3g = telecom_df_high_val_cust.columns[telecom_df_high_val_cust.columns.str.contains('2g|3g',regex=True)]

In [None]:
# Ploting for volume of 2G and 3G usage columns:
plot_box_chart('vol_2g_mb')

In [None]:
plot_box_chart('vol_3g_mb')

###### Similarly, there is also a drop in 2G and 3G usage. We can also see that a larger proportion of non-churned customers use internet services, so the non-availability of better services may be contributing to customer churn

##### Average Revenue Per User

In [None]:
# Checking columns for average revenue per user
arpu_cols = telecom_df_high_val_cust.columns[telecom_df_high_val_cust.columns.str.contains('arpu_')]

# Plotting arpu
plot_box_chart('arpu')

###### Similar pattern as expected , where we can see drop in arpu in 8th month for churned customer

######  Offnet usage 

In [None]:
offnet_usage_service_col = telecom_df_high_val_cust.columns[telecom_df_high_val_cust.columns.str.contains('offnet.*mou',regex=True)]

In [None]:
# Offnet mou values for churned and non churned customers
plot_box_chart('offnet_mou')

##### Customers distribution of the age on network

In [None]:
print(telecom_df_high_val_cust.aon.describe())


###### Minimum age on network is 181 days
###### Average age on network for customers is 1212 days (3.2 years).
###### Around 25% of the HV users are in their 2nd year with the network.
###### Around 75% users have Age on network less than 4 years.
###### Around 15% users are with the network from over 7 years

### Modelling

##### Handling Class Imbalance using SMOTE

In [None]:
telecom_df_high_val_cust['churn'].value_counts().plot(kind = 'bar').set_title('churned')

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split


In [None]:
# Turn the last-recharge dates into categoricals, then replace every categorical
# column with its integer category codes so the models receive numeric input.
for rech_col in ('date_of_last_rech_6', 'date_of_last_rech_7', 'date_of_last_rech_8'):
    telecom_df_high_val_cust[rech_col] = telecom_df_high_val_cust[rech_col].astype('category')

cat_columns = telecom_df_high_val_cust.select_dtypes(['category']).columns
telecom_df_high_val_cust[cat_columns] = telecom_df_high_val_cust[cat_columns].apply(
    lambda categorical: categorical.cat.codes
)

In [None]:

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate input features and target.
# mobile_number is a customer identifier, not a behavioural signal, so it is
# excluded from the feature matrix.
y = telecom_df_high_val_cust.churn
X = telecom_df_high_val_cust.drop(columns=['churn', 'mobile_number'])

# Split BEFORE scaling so the test rows never influence the scaling statistics
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=100)

# Standardise with the mean/std learned on the training rows only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Oversample the minority class on the training data only (1:1 class ratio)
sm = SMOTE(random_state=100, sampling_strategy=1.0)
X_train, y_train = sm.fit_resample(X_train, y_train)

In [None]:
y_train.value_counts().plot(kind = 'bar').set_title('churned')

###### Performing PCA for feature reduction

In [None]:
X_train.shape

In [None]:

#Improting the PCA module
from sklearn.decomposition import PCA
pca = PCA(svd_solver='randomized', random_state=42)

In [None]:
#Doing the PCA on the train data
pca.fit(X_train)

In [None]:
pca.components_

In [None]:

pca.explained_variance_ratio_

In [None]:
#Making the screeplot - plotting the cumulative variance against the number of components
%matplotlib inline
fig = plt.figure(figsize = (12,8))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.show()

In [None]:
np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4)*100)

######  Around 60 components are enough to describe 95% of the variance in the dataset. So We'll choose 60 components for our modeling

In [None]:
#Using incremental PCA for efficiency - saves a lot of time on larger datasets
from sklearn.decomposition import IncrementalPCA
pca_final = IncrementalPCA(n_components=60)

In [None]:
df_train_pca = pca_final.fit_transform(X_train)
df_train_pca.shape

In [None]:

#creating correlation matrix for the principal components
corrmat = np.corrcoef(df_train_pca.transpose())

In [None]:
corrmat

In [None]:
# 1s -> 0s in diagonals
corrmat_nodiag = corrmat - np.diagflat(corrmat.diagonal())
print("max corr:",corrmat_nodiag.max(), ", min corr: ", corrmat_nodiag.min(),)

In [None]:
# Applying the fitted PCA (60 components) to the test data
df_test_pca = pca_final.transform(X_test)
df_test_pca.shape

### Logistic Regression

In [None]:
#Training the model on the train data
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

learner_pca = LogisticRegression()
model_pca = learner_pca.fit(df_train_pca,y_train)

In [None]:

y_train_pred = model_pca.predict_proba(df_train_pca)[:,1]
y_train_pred

In [None]:
y_train_pred_final = pd.DataFrame({'Churn':y_train, 'Churn_Prob':y_train_pred})
y_train_pred_final.head()

In [None]:
# One 0/1 prediction column per probability cutoff 0.0, 0.1, ..., 0.9
numbers = [x / 10.0 for x in range(10)]

for i in numbers:
    y_train_pred_final[i] = (y_train_pred_final.Churn_Prob > i).astype('int64')
y_train_pred_final.head()

In [None]:

# Accuracy, sensitivity and specificity of the training predictions at each cutoff.
cutoff_df = pd.DataFrame( columns = ['prob','accuracy','sensi','speci'])
from sklearn.metrics import confusion_matrix

num = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
for i in num:
    cm1 = metrics.confusion_matrix(y_train_pred_final.Churn, y_train_pred_final[i] )
    # confusion-matrix layout: rows = actual class, columns = predicted class
    true_neg, false_pos = cm1[0,0], cm1[0,1]
    false_neg, true_pos = cm1[1,0], cm1[1,1]
    total1 = cm1.sum()

    accuracy = (true_neg + true_pos)/total1
    speci = true_neg/(true_neg + false_pos)
    sensi = true_pos/(false_neg + true_pos)
    cutoff_df.loc[i] =[ i ,accuracy,sensi,speci]
print(cutoff_df)

In [None]:
# Let's plot accuracy sensitivity and specificity for various probabilities.
cutoff_df.plot.line(x='prob', y=['accuracy','sensi','speci'])
plt.show()

###### From the curve above, the optimum cutoff probability lies between 0.50 and 0.60. Let's take 0.55

In [None]:

y_train_pred_final['final_predicted'] = y_train_pred_final.Churn_Prob.map( lambda x: 1 if x > 0.55 else 0)

y_train_pred_final.head()

In [None]:

confusion = metrics.confusion_matrix(y_train_pred_final.Churn, y_train_pred_final.final_predicted )
confusion


In [None]:
def getModelMetrics(actual_churn=False,pred_churn=False):
    """Print classification metrics derived from the 2x2 confusion matrix.

    Parameters
    ----------
    actual_churn : array-like
        True 0/1 churn labels.
    pred_churn : array-like
        Predicted 0/1 churn labels. The ROC AUC line is computed from these
        values exactly as passed.

    Prints ROC AUC, sensitivity, specificity, false positive rate, positive and
    negative predictive value and sklearn's precision score. Returns nothing.
    """
    cm = metrics.confusion_matrix(actual_churn, pred_churn)

    # rows = actual class, columns = predicted class
    true_neg, false_pos = cm[0,0], cm[0,1]
    false_neg, true_pos = cm[1,0], cm[1,1]

    def _emit(template, value):
        print(template.format(value))

    _emit("Roc_auc_score : {}", metrics.roc_auc_score(actual_churn,pred_churn))
    # share of actual churners that were caught
    _emit('Sensitivity/Recall : {}', true_pos / float(true_pos+false_neg))
    # share of actual non-churners that were left alone
    _emit('Specificity: {}', true_neg / float(true_neg+false_pos))
    # share of actual non-churners that were flagged as churners
    _emit('False Positive Rate: {}', false_pos/ float(true_neg+false_pos))
    # share of churn predictions that were right
    _emit('Positive predictive value: {}', true_pos / float(true_pos+false_pos))
    # share of non-churn predictions that were right
    _emit('Negative Predictive value: {}', true_neg / float(true_neg+ false_neg))
    _emit('sklearn precision score value: {}', metrics.precision_score(actual_churn, pred_churn ))

In [None]:
getModelMetrics(y_train_pred_final.Churn, y_train_pred_final.final_predicted )

In [None]:

# Churn probabilities for the test set, placed next to the true labels.
# Both frames get a fresh 0..n-1 index so they line up row by row.
pred_probs_test = model_pca.predict_proba(df_test_pca)[:,1]
y_test_df = pd.DataFrame(y_test)
y_test_df = y_test_df.reset_index(drop=True)
y_pred_df = pd.DataFrame(pred_probs_test).reset_index(drop=True)
y_test_pred_final = pd.concat([y_test_df, y_pred_df], axis=1)

In [None]:
# Renaming the column 
y_test_pred_final= y_test_pred_final.rename(columns={ 0 : 'Churn_prob'})

In [None]:

# Renaming the column (same statement as the previous cell; it changes nothing
# once column 0 has already been renamed to 'Churn_prob')
y_test_pred_final= y_test_pred_final.rename(columns={ 0 : 'Churn_prob'})

In [None]:
#y_test_pred_final.head()
y_test_pred_final['final_predicted'] = y_test_pred_final.Churn_prob.map(lambda x: 1 if x > 0.55 else 0)

In [None]:
confusion2 = metrics.confusion_matrix(y_test_pred_final.churn, y_test_pred_final.final_predicted )

In [None]:
getModelMetrics(y_test_pred_final.churn, y_test_pred_final.final_predicted )

In [None]:
fpr, tpr, thresholds = metrics.roc_curve( y_test, pred_probs_test, drop_intermediate = False )

In [None]:
from sklearn import metrics

In [None]:

def draw_roc( actual, probs ):
    """Plot the ROC curve for the given labels and scores, and return its points.

    Parameters
    ----------
    actual : array-like
        True binary labels.
    probs : array-like
        Scores or probabilities for the positive class.

    Returns
    -------
    tuple
        ``(fpr, tpr, thresholds)`` as produced by ``metrics.roc_curve`` with
        ``drop_intermediate=False``.
    """
    roc_points = metrics.roc_curve(actual, probs, drop_intermediate=False)
    fpr, tpr, thresholds = roc_points
    auc_score = metrics.roc_auc_score(actual, probs)

    fig, ax = plt.subplots(figsize=(6, 6))
    ax.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % auc_score)
    ax.plot([0, 1], [0, 1], 'k--')  # chance diagonal for reference
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate or [1 - True Negative Rate]')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver operating characteristic example')
    ax.legend(loc="lower right")
    plt.show()

    return fpr, tpr, thresholds

In [None]:
# Capture the returned curve points so that only the figure is shown, instead
# of the raw fpr / tpr / threshold arrays being dumped into the cell output.
fpr, tpr, thresholds = draw_roc(y_test, pred_probs_test)

## SVM Model

In [None]:
# using rbf kernel, C=1, default value of gamma

model = SVC(C = 1, kernel='rbf')
model.fit(df_train_pca,y_train)
y_pred = model.predict(df_test_pca)

In [None]:
# confusion matrix
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:

# accuracy
print("accuracy", metrics.accuracy_score(y_test, y_pred))

# precision
print("precision", metrics.precision_score(y_test, y_pred))

# recall/sensitivity
print("recall", metrics.recall_score(y_test, y_pred))

###### Hyperparameter Tuning

In [None]:
# creating a KFold object with 5 splits 
folds = KFold(n_splits = 5, shuffle = True, random_state = 4)

# specify range of hyperparameters
# Set the parameters by cross-validation
hyper_params = [ {'gamma': [1e-2, 1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]}]


# specify model
model = SVC(kernel="rbf")

# set up GridSearchCV()
model_cv = GridSearchCV(estimator = model, 
                        param_grid = hyper_params, 
                        scoring= 'roc_auc', 
                        cv = folds,
                        n_jobs = -1,
                        verbose = 1,
                        return_train_score=True)      

# fit the model
model_cv.fit(df_train_pca,y_train)

In [None]:

# cv results
cv_results = pd.DataFrame(model_cv.cv_results_)
cv_results

In [None]:
# printing the optimal roc score and hyperparameters
best_score = model_cv.best_score_
best_hyperparams = model_cv.best_params_

print("The best test score is {0} corresponding to hyperparameters {1}".format(best_score, best_hyperparams))

In [None]:
# specify optimal hyperparameters
best_params = {"C": 100, "gamma": 0.01, "kernel":"rbf"}

# model - built from the dict above so the two cannot drift apart
model = SVC(**best_params)

model.fit(df_train_pca,y_train)
y_pred = model.predict(df_test_pca)
# Continuous decision scores: ROC AUC is a ranking metric and needs scores,
# not the thresholded 0/1 labels, to be comparable with the grid-search 'roc_auc'.
y_score = model.decision_function(df_test_pca)

# metrics
print(metrics.confusion_matrix(y_test, y_pred), "\n")
print("accuracy", metrics.accuracy_score(y_test, y_pred))
print("precision", metrics.precision_score(y_test, y_pred))
print("sensitivity/recall", metrics.recall_score(y_test, y_pred))
print("roc_auc_score", metrics.roc_auc_score(y_test, y_score))

## Random Forest Model

In [None]:
# Importing random forest classifier from sklearn library
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyperparameters; the seed is fixed so that the
# reported metrics are reproducible across runs.
rfc = RandomForestClassifier(random_state=100)
# fit
rfc.fit(df_train_pca,y_train)

In [None]:
# Making predictions
predictions = rfc.predict(df_test_pca)

In [None]:
# Importing classification report and confusion matrix from sklearn metrics
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score

In [None]:
# Let's check the report of our default model
print(classification_report(y_test,predictions))

In [None]:

# GridSearchCV over the random-forest hyperparameters (3-fold CV, ROC AUC scoring)
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'max_depth': [4,8,10],
    'min_samples_leaf': range(100, 400, 200),
    'min_samples_split': range(200, 500, 200),
    'n_estimators': [100,200, 300], 
    'max_features': [3, 6]
}
# Base model; fixed seed so every candidate is compared on the same randomness
# and the selected hyperparameters are reproducible
rf = RandomForestClassifier(random_state=100)
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid,scoring= 'roc_auc', 
                          cv = 3, n_jobs = -1,verbose = 1)

# Fit the grid search to the data
grid_search.fit(df_train_pca,y_train)

In [None]:
# printing the optimal accuracy score and hyperparameters
print('We can get best score of',grid_search.best_score_,'using',grid_search.best_params_)

In [None]:

# model with the best hyperparameters (seeded for reproducible results)
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(bootstrap=True,
                             max_depth=10,
                             min_samples_leaf=100, 
                             min_samples_split=200,
                             max_features=6,
                             n_estimators=200,
                             random_state=100)

In [None]:
# fit
rfc.fit(df_train_pca,y_train)
# predict
predictions = rfc.predict(df_test_pca)

In [None]:
print(classification_report(y_test,predictions))

In [None]:
# metrics
# ROC AUC is computed from the predicted churn probability (class 1), not from
# the thresholded 0/1 predictions, so it matches the 'roc_auc' used in tuning.
churn_scores = rfc.predict_proba(df_test_pca)[:, 1]

print(metrics.confusion_matrix(y_test, predictions), "\n")
print("accuracy", metrics.accuracy_score(y_test, predictions))
print("precision", metrics.precision_score(y_test, predictions))
print("sensitivity/recall", metrics.recall_score(y_test, predictions))
print("roc_auc_score", metrics.roc_auc_score(y_test, churn_scores))

## Conclusion: Logistic regression is the best model to use here, since it achieves Sensitivity/Recall : 0.7804107424960506 and
## Specificity: 0.8686150265097633.
## Random forest and SVM reach a good overall accuracy of around 90%, but their sensitivity is not as good as the logistic model's. Sensitivity (true positive rate - identifying the customers who are going to churn) is the more crucial metric here, so we should opt for logistic regression based on the business case, explainability and the metrics achieved

In [None]:

from sklearn.preprocessing import StandardScaler

# Separate input features and target.
# mobile_number is a customer identifier, not a behavioural signal, so it is
# excluded from the feature matrix.
y = telecom_df_high_val_cust.churn
X = telecom_df_high_val_cust.drop(columns=['churn', 'mobile_number'])

# Split first so the scaler never sees the test rows
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=100)

# Fit the scaler on the training rows only. The results are wrapped back into
# DataFrames that keep the original column names and row index, so they stay
# aligned with y_train / y_test.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns, index=X_test.index)

In [None]:

from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

# Recursive feature elimination down to 60 features with logistic regression
# as the ranking estimator. n_features_to_select must be passed by keyword:
# recent scikit-learn releases reject it as a positional argument.
logreg = LogisticRegression()
rfe = RFE(logreg, n_features_to_select=60)
rfe = rfe.fit(X_train, y_train)

In [None]:
rfe.support_


In [None]:
list(zip(X_train.columns, rfe.support_, rfe.ranking_))

In [None]:
X_train.columns[~rfe.support_]

In [None]:
X_train.reset_index()

In [None]:
# Importing random forest classifier from sklearn library
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyperparameters; seeded for reproducible results.
rfc = RandomForestClassifier(random_state=100)


In [None]:
rfc.fit(X_train,y_train)

In [None]:
# Making predictions
predictions = rfc.predict(X_test)

In [None]:
# Let's check the report of our default model
print(classification_report(y_test,predictions))

In [None]:
# GridSearchCV over the random-forest hyperparameters (3-fold CV, ROC AUC scoring)
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'max_depth': [8,12,16],
    'min_samples_leaf': range(100, 800, 200),
    'min_samples_split': range(200, 1000, 200),
    'n_estimators': [100,200, 300], 
    'max_features': [6,9,12]
}
# Base model; fixed seed so every candidate is compared on the same randomness
# and the selected hyperparameters are reproducible
rf = RandomForestClassifier(random_state=100)
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid,scoring= 'roc_auc', 
                          cv = 3, n_jobs = -1,verbose = 1)

# Fit the grid search to the data
grid_search.fit(X_train,y_train)

In [None]:
# printing the optimal accuracy score and hyperparameters
print('We can get best score of',grid_search.best_score_,'using',grid_search.best_params_)

In [None]:
# model with the best hyperparameters (seeded so that the fitted forest and its
# feature importances are reproducible)
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(bootstrap=True,
                             max_depth=12,
                             min_samples_leaf=100, 
                             min_samples_split=200,
                             max_features=12,
                             n_estimators=200,
                             random_state=100)

In [None]:
# fit
rfc.fit(X_train,y_train)
# predict
predictions = rfc.predict(X_test)

In [None]:
print(classification_report(y_test,predictions))

In [None]:
# Horizontal bar chart of every feature's importance in the fitted forest,
# ordered so the most important feature ends up at the top.
feat_importances = pd.Series(rfc.feature_importances_, index=X.columns)
ranked_importances = feat_importances.nlargest(len(X.columns)).sort_values()

fig, ax = plt.subplots(figsize=(15,40))
ranked_importances.plot(kind='barh', align='center', ax=ax)

###### The plot above shows the important factors for predicting churn. The top 5 features are: total incoming, local incoming, average revenue per user, maximum recharge, local incoming T2T