In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score,classification_report, confusion_matrix, f1_score
from sklearn.model_selection import RandomizedSearchCV
%matplotlib inline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


In [2]:
df = pd.read_csv('churn_data.csv')
df

Unnamed: 0,customerID,tenure,PhoneService,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,1,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,34,Yes,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,2,Yes,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,45,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,2,Yes,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...
7037,2569-WGERO,72,Yes,Two year,Yes,Bank transfer (automatic),21.15,1419.4,No
7038,6840-RESVB,24,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,72,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,11,No,Month-to-month,Yes,Electronic check,29.60,346.45,No


In [3]:
churdf = df.copy()

In [4]:
churdf.drop(['customerID'], axis=1, inplace=True)

In [5]:
churdf

Unnamed: 0,tenure,PhoneService,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,1,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,34,Yes,One year,No,Mailed check,56.95,1889.5,No
2,2,Yes,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,45,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,2,Yes,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...
7037,72,Yes,Two year,Yes,Bank transfer (automatic),21.15,1419.4,No
7038,24,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,72,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,11,No,Month-to-month,Yes,Electronic check,29.60,346.45,No


In [6]:
churdf.isna().sum()

tenure              0
PhoneService        0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [7]:
churdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7042 entries, 0 to 7041
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   tenure            7042 non-null   int64  
 1   PhoneService      7042 non-null   object 
 2   Contract          7042 non-null   object 
 3   PaperlessBilling  7042 non-null   object 
 4   PaymentMethod     7042 non-null   object 
 5   MonthlyCharges    7042 non-null   float64
 6   TotalCharges      7042 non-null   object 
 7   Churn             7042 non-null   object 
dtypes: float64(1), int64(1), object(6)
memory usage: 440.2+ KB


In [8]:
churdf['PaperlessBilling'].value_counts()

Yes    4170
No     2872
Name: PaperlessBilling, dtype: int64

In [9]:
churdf.replace({'PhoneService':{'No':0, 'Yes':1},
                      'PaperlessBilling':{'Yes':1,'No':0}},inplace = True)

In [10]:
churdf['Contract'].unique()

array(['Month-to-month', 'One year', 'Two year'], dtype=object)

In [11]:
churdf1 = pd.get_dummies(churdf['Contract'],drop_first = True)
churdf

Unnamed: 0,tenure,PhoneService,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,1,0,Month-to-month,1,Electronic check,29.85,29.85,No
1,34,1,One year,0,Mailed check,56.95,1889.5,No
2,2,1,Month-to-month,1,Mailed check,53.85,108.15,Yes
3,45,0,One year,0,Bank transfer (automatic),42.30,1840.75,No
4,2,1,Month-to-month,1,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...
7037,72,1,Two year,1,Bank transfer (automatic),21.15,1419.4,No
7038,24,1,One year,1,Mailed check,84.80,1990.5,No
7039,72,1,One year,1,Credit card (automatic),103.20,7362.9,No
7040,11,0,Month-to-month,1,Electronic check,29.60,346.45,No


In [12]:
churdf = pd.concat([churdf,churdf1],axis = 1)
churdf

Unnamed: 0,tenure,PhoneService,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,One year,Two year
0,1,0,Month-to-month,1,Electronic check,29.85,29.85,No,0,0
1,34,1,One year,0,Mailed check,56.95,1889.5,No,1,0
2,2,1,Month-to-month,1,Mailed check,53.85,108.15,Yes,0,0
3,45,0,One year,0,Bank transfer (automatic),42.30,1840.75,No,1,0
4,2,1,Month-to-month,1,Electronic check,70.70,151.65,Yes,0,0
...,...,...,...,...,...,...,...,...,...,...
7037,72,1,Two year,1,Bank transfer (automatic),21.15,1419.4,No,0,1
7038,24,1,One year,1,Mailed check,84.80,1990.5,No,1,0
7039,72,1,One year,1,Credit card (automatic),103.20,7362.9,No,1,0
7040,11,0,Month-to-month,1,Electronic check,29.60,346.45,No,0,0


In [13]:
churdf.drop(['Contract'], axis=1, inplace=True)

In [14]:
churdf['PaymentMethod'].unique()

array(['Electronic check', 'Mailed check', 'Bank transfer (automatic)',
       'Credit card (automatic)'], dtype=object)

In [15]:
paymntdf1 = pd.get_dummies(churdf['PaymentMethod'],drop_first = True)
paymntdf1

Unnamed: 0,Credit card (automatic),Electronic check,Mailed check
0,0,1,0
1,0,0,1
2,0,0,1
3,0,0,0
4,0,1,0
...,...,...,...
7037,0,0,0
7038,0,0,1
7039,1,0,0
7040,0,1,0


In [16]:
churdf = pd.concat([churdf,paymntdf1],axis = 1)
churdf

Unnamed: 0,tenure,PhoneService,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,One year,Two year,Credit card (automatic),Electronic check,Mailed check
0,1,0,1,Electronic check,29.85,29.85,No,0,0,0,1,0
1,34,1,0,Mailed check,56.95,1889.5,No,1,0,0,0,1
2,2,1,1,Mailed check,53.85,108.15,Yes,0,0,0,0,1
3,45,0,0,Bank transfer (automatic),42.30,1840.75,No,1,0,0,0,0
4,2,1,1,Electronic check,70.70,151.65,Yes,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
7037,72,1,1,Bank transfer (automatic),21.15,1419.4,No,0,1,0,0,0
7038,24,1,1,Mailed check,84.80,1990.5,No,1,0,0,0,1
7039,72,1,1,Credit card (automatic),103.20,7362.9,No,1,0,1,0,0
7040,11,0,1,Electronic check,29.60,346.45,No,0,0,0,1,0


In [17]:
churdf.drop(['PaymentMethod'], axis=1, inplace=True)

In [18]:
# churdf.replace({'Churn':{'No':0, 'Yes':1}},inplace = True)
# churdf

In [19]:
# churdf['TotalCharges']= churdf['TotalCharges']

df2 = churdf.apply(pd.to_numeric, errors='coerce')

# pd.to_numeric(s, errors='coerce').fillna(0, downcast='infer')
# df
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7042 entries, 0 to 7041
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   tenure                   7042 non-null   int64  
 1   PhoneService             7042 non-null   int64  
 2   PaperlessBilling         7042 non-null   int64  
 3   MonthlyCharges           7042 non-null   float64
 4   TotalCharges             7031 non-null   float64
 5   Churn                    0 non-null      float64
 6   One year                 7042 non-null   uint8  
 7   Two year                 7042 non-null   uint8  
 8   Credit card (automatic)  7042 non-null   uint8  
 9   Electronic check         7042 non-null   uint8  
 10  Mailed check             7042 non-null   uint8  
dtypes: float64(3), int64(3), uint8(5)
memory usage: 364.6 KB


In [20]:
df2['Churn'] = churdf['Churn']


In [21]:
df2.isna().sum()

tenure                      0
PhoneService                0
PaperlessBilling            0
MonthlyCharges              0
TotalCharges               11
Churn                       0
One year                    0
Two year                    0
Credit card (automatic)     0
Electronic check            0
Mailed check                0
dtype: int64

In [22]:
df2['TotalCharges'] = df2['TotalCharges'].fillna(df2['TotalCharges'].median())

In [23]:
x = df2.drop(['Churn'], axis=1)
y = df2['Churn']

In [24]:
x_train, x_test, y_train, y_test = train_test_split(x,y,random_state=1,test_size=0.2)

In [25]:
RandmFClas = RandomForestClassifier(random_state=1)

In [26]:
RandmFClas.fit(x_train, y_train)

RandomForestClassifier(random_state=1)

In [27]:
# y_pred = RandmFClas.predict()
# Testing Data Accuracy
y_pred = RandmFClas.predict(x_test)
testing_accuracy = accuracy_score(y_test,y_pred)
print('Testing Accuracy is :',testing_accuracy)

Testing Accuracy is : 0.7814052519517388


In [28]:
testing_class = classification_report(y_test,y_pred)


In [29]:
print(testing_class)

              precision    recall  f1-score   support

          No       0.83      0.88      0.86      1043
         Yes       0.60      0.49      0.54       366

    accuracy                           0.78      1409
   macro avg       0.71      0.69      0.70      1409
weighted avg       0.77      0.78      0.77      1409



# Now Training the dataset by passing hyperparameters

In [30]:
hyperparameters = {"n_estimators": np.arange(5,100),
                  "criterion":['gini','entropy'],
                  'max_depth': np.arange(2,10),
                  'min_samples_split':np.arange(2,15),
                  'min_samples_leaf':np.arange(1,10)}

RFclas_model = RandomForestClassifier(random_state=1)
best_RFclas_model = RandomizedSearchCV(RFclas_model,hyperparameters, cv = 5)
best_RFclas_model.fit(x_train,y_train)

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(random_state=1),
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': array([2, 3, 4, 5, 6, 7, 8, 9]),
                                        'min_samples_leaf': array([1, 2, 3, 4, 5, 6, 7, 8, 9]),
                                        'min_samples_split': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]),
                                        'n_estimators': array([ 5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
       22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
       39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
       56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72,
       73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
       90, 91, 92, 93, 94, 95, 96, 97, 98, 99])})

In [31]:
best_RFclas_model.best_params_

{'n_estimators': 77,
 'min_samples_split': 3,
 'min_samples_leaf': 8,
 'max_depth': 9,
 'criterion': 'gini'}

In [32]:
hyperparameters = {"n_estimators": [31],
                  "criterion":['gini'],
                  'max_depth': [7],
                  'min_samples_split':[4],
                  'min_samples_leaf':[1]}

RFclas_model = RandomForestClassifier(random_state=1)
best_RFclas_model = RandomizedSearchCV(RFclas_model,hyperparameters, cv = 5)
best_RFclas_model.fit(x_train,y_train)



RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(random_state=1),
                   param_distributions={'criterion': ['gini'], 'max_depth': [7],
                                        'min_samples_leaf': [1],
                                        'min_samples_split': [4],
                                        'n_estimators': [31]})

In [33]:
y_pred = best_RFclas_model.predict(x_test)
testing_accuracy = accuracy_score(y_test,y_pred)
print('Testing Accuracy is :',testing_accuracy)

Testing Accuracy is : 0.8026969481902059


# building model with Regression 

### considering target column ast Total charges

In [34]:
# first replace churn values to int or float

In [35]:
df2['Churn'] = df2['Churn'].replace({'Yes':1,'No':0})
# churdf.replace({'Churn':{'No':0, 'Yes':1}},inplace = True)
df2

Unnamed: 0,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,One year,Two year,Credit card (automatic),Electronic check,Mailed check
0,1,0,1,29.85,29.85,0,0,0,0,1,0
1,34,1,0,56.95,1889.50,0,1,0,0,0,1
2,2,1,1,53.85,108.15,1,0,0,0,0,1
3,45,0,0,42.30,1840.75,0,1,0,0,0,0
4,2,1,1,70.70,151.65,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
7037,72,1,1,21.15,1419.40,0,0,1,0,0,0
7038,24,1,1,84.80,1990.50,0,1,0,0,0,1
7039,72,1,1,103.20,7362.90,0,1,0,1,0,0
7040,11,0,1,29.60,346.45,0,0,0,0,1,0


In [36]:
x = df2.drop(['TotalCharges'], axis=1)
y = df2['TotalCharges']

In [53]:
y_train


4169    1457.25
3571    2096.10
1352    4549.05
1278    2234.55
938     7118.90
         ...   
6443     150.35
3606    7069.30
5704    1564.40
6637    3804.40
2575     617.65
Name: TotalCharges, Length: 5633, dtype: float64

In [50]:
x_train

Unnamed: 0,tenure,PhoneService,PaperlessBilling,MonthlyCharges,Churn,One year,Two year,Credit card (automatic),Electronic check,Mailed check
4169,15,1,1,101.25,1,0,0,0,1,0
3571,35,1,1,62.10,0,0,0,0,1,0
1352,72,0,0,60.95,0,0,1,0,0,0
1278,36,0,0,60.70,0,1,0,1,0,0
938,65,1,1,108.05,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
6443,9,1,0,19.50,0,0,0,0,0,0
3606,70,1,1,101.75,0,1,0,0,1,0
5704,19,1,1,86.85,0,0,0,0,1,0
6637,69,1,1,53.65,0,1,0,1,0,0


In [37]:
x_train, x_test, y_train, y_test = train_test_split(x,y,random_state=2,test_size=0.2)

In [38]:
RandmFReg= RandomForestRegressor(random_state=2)

In [39]:
RandmFReg.fit(x_train, y_train)

RandomForestRegressor(random_state=2)

In [40]:
y_pred = RandmFReg.predict(x_test)

In [41]:
y_pred[0:5]

array([  38.09155, 1350.0705 , 3432.402  , 1626.168  ,  511.9965 ])

In [42]:
y_test[0:5]

5806      35.85
3678    1398.25
4060    3409.10
1577    1704.95
5007     541.50
Name: TotalCharges, dtype: float64

In [43]:

mse_test = mean_squared_error(y_pred,y_test)
print("Mean squared error for testing data is :",mse_test)
print("Root Mean squared error for testing data is :",np.sqrt(mse_test))
print("R2 score is : ",r2_score(y_pred,y_test))

Mean squared error for testing data is : 7123.478310039
Root Mean squared error for testing data is : 84.40070088594643
R2 score is :  0.9985581700663435


# now passing hyperparameters for regression training

In [44]:
hyperparameters = {"n_estimators": np.arange(5,100),
                  "criterion":['mse','mae'],
                  'max_depth': np.arange(2,10),
                  'min_samples_split':np.arange(2,15),
                  'min_samples_leaf':np.arange(1,10)}

RFreg_model = RandomForestRegressor(random_state=2)
best_RFreg_model = RandomizedSearchCV(RFreg_model,hyperparameters, cv = 5)
best_RFreg_model.fit(x_train,y_train)

RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(random_state=2),
                   param_distributions={'criterion': ['mse', 'mae'],
                                        'max_depth': array([2, 3, 4, 5, 6, 7, 8, 9]),
                                        'min_samples_leaf': array([1, 2, 3, 4, 5, 6, 7, 8, 9]),
                                        'min_samples_split': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]),
                                        'n_estimators': array([ 5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
       22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
       39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
       56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72,
       73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
       90, 91, 92, 93, 94, 95, 96, 97, 98, 99])})

In [45]:
best_RFreg_model.best_params_

{'n_estimators': 71,
 'min_samples_split': 10,
 'min_samples_leaf': 4,
 'max_depth': 8,
 'criterion': 'mae'}

In [46]:
hyperparameters = {"n_estimators": [86],
                  "criterion":['mse'],
                  'max_depth': [8],
                  'min_samples_split':[14],
                  'min_samples_leaf':[3]}

RFreg_model = RandomForestRegressor(random_state=1)
best_RFreg_model = RandomizedSearchCV(RFreg_model,hyperparameters, cv = 5)
best_RFreg_model.fit(x_train,y_train)



RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(random_state=1),
                   param_distributions={'criterion': ['mse'], 'max_depth': [8],
                                        'min_samples_leaf': [3],
                                        'min_samples_split': [14],
                                        'n_estimators': [86]})

In [47]:
y_pred_hyp = best_RFreg_model.predict(x_test)
mse_test = mean_squared_error(y_pred_hyp,y_test)
print("Mean squared error for testing data is :",mse_test)
print("Root Mean squared error for testing data is :",np.sqrt(mse_test))
print("R2 score is : ",r2_score(y_pred_hyp,y_test))

Mean squared error for testing data is : 8426.205134823873
Root Mean squared error for testing data is : 91.79436330638103
R2 score is :  0.9982909076830674


# Pickle File regression


In [48]:
import pickle

file = 'ML_Model.pkl'
with open(file, 'wb') as f:
    pickle.dump(RandmFReg, f)

In [49]:
with open(file, 'rb') as f:
    k = pickle.load(f)
    

In [51]:
cy = k.predict([[15,1,1,101.25,1,0,0,0,1,0]])
print(cy)

[1495.0045]


# Pickle File classification



In [None]:
import pickle

file = 'cls_model.pkl'
with open(file, 'wb') as f:
    pickle.dump(RandmFReg, f)