# Modelling using Gaussian Naive Bayes

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
tc_data = pd.read_csv("/home2/MyNotes/Python/DataSets/CustomerChurn/telecomChurn.csv",
                             sep=",",
                             header=0)
tc_data.head(5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,...,TechSupportYes,StreamingTVNo internet service,StreamingTVYes,StreamingMoviesNo internet service,StreamingMoviesYes,ContractOne year,ContractTwo year,PaymentMethodCredit card (automatic),PaymentMethodElectronic check,PaymentMethodMailed check
0,0,0,1,1,45,1,1,81.0,3533.6,No,...,1,0,1,0,1,0,1,0,0,1
1,1,0,0,0,4,1,1,81.0,340.85,Yes,...,0,0,1,0,0,0,0,0,1,0
2,0,0,1,1,18,1,1,20.25,401.95,No,...,0,1,0,1,0,0,0,0,0,1
3,0,0,1,0,53,1,1,93.45,4872.2,No,...,1,0,1,0,1,1,0,0,1,0
4,0,1,1,0,3,1,1,74.6,239.05,No,...,0,0,0,0,0,0,0,0,0,0


In [2]:
# Looks like these are new customers as the tenure is zero. We can update the MonthlyCharges into TotalCharges
tc_data["TotalCharges"] = tc_data["TotalCharges"].fillna(value=tc_data["MonthlyCharges"])

In [3]:
tc_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4930 entries, 0 to 4929
Data columns (total 31 columns):
gender                                  4930 non-null int64
SeniorCitizen                           4930 non-null int64
Partner                                 4930 non-null int64
Dependents                              4930 non-null int64
tenure                                  4930 non-null int64
PhoneService                            4930 non-null int64
PaperlessBilling                        4930 non-null int64
MonthlyCharges                          4930 non-null float64
TotalCharges                            4930 non-null float64
Churn                                   4930 non-null object
MultipleLinesNo phone service           4930 non-null int64
MultipleLinesYes                        4930 non-null int64
InternetServiceFiber optic              4930 non-null int64
InternetServiceNo                       4930 non-null int64
OnlineSecurityNo internet service       4930 non

In [4]:
tc_data.head(5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,...,TechSupportYes,StreamingTVNo internet service,StreamingTVYes,StreamingMoviesNo internet service,StreamingMoviesYes,ContractOne year,ContractTwo year,PaymentMethodCredit card (automatic),PaymentMethodElectronic check,PaymentMethodMailed check
0,0,0,1,1,45,1,1,81.0,3533.6,No,...,1,0,1,0,1,0,1,0,0,1
1,1,0,0,0,4,1,1,81.0,340.85,Yes,...,0,0,1,0,0,0,0,0,1,0
2,0,0,1,1,18,1,1,20.25,401.95,No,...,0,1,0,1,0,0,0,0,0,1
3,0,0,1,0,53,1,1,93.45,4872.2,No,...,1,0,1,0,1,1,0,0,1,0
4,0,1,1,0,3,1,1,74.6,239.05,No,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Treating categorical variables
Churn_f = pd.get_dummies(tc_data["Churn"],drop_first=True,prefix="Churn")
tc_data_clean = pd.concat([tc_data,Churn_f],axis=1)

In [6]:
tc_data_clean.head(5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,...,StreamingTVNo internet service,StreamingTVYes,StreamingMoviesNo internet service,StreamingMoviesYes,ContractOne year,ContractTwo year,PaymentMethodCredit card (automatic),PaymentMethodElectronic check,PaymentMethodMailed check,Churn_Yes
0,0,0,1,1,45,1,1,81.0,3533.6,No,...,0,1,0,1,0,1,0,0,1,0
1,1,0,0,0,4,1,1,81.0,340.85,Yes,...,0,1,0,0,0,0,0,1,0,1
2,0,0,1,1,18,1,1,20.25,401.95,No,...,1,0,1,0,0,0,0,0,1,0
3,0,0,1,0,53,1,1,93.45,4872.2,No,...,0,1,0,1,1,0,0,1,0,0
4,0,1,1,0,3,1,1,74.6,239.05,No,...,0,0,0,0,0,0,0,0,0,0


In [7]:
tc_data_features = tc_data_clean.drop("Churn",axis=1)
tc_data_target = tc_data_clean["Churn_Yes"]

In [8]:
print("Sizes:{} {}".format(tc_data_features["gender"].count(),tc_data_target.size))
print("Types:{} {}".format(type(tc_data_features),type(tc_data_target)))

Sizes:4930 4930
Types:<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'>


In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(tc_data_features,tc_data_target, test_size=0.30, random_state=101)

In [10]:
type(tc_data_features),type(tc_data_target)


(pandas.core.frame.DataFrame, pandas.core.series.Series)

In [11]:
tc_data_features_ = tc_data_features.values
tc_data_target_ = tc_data_target.values

In [12]:
from sklearn.model_selection import StratifiedShuffleSplit 
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42) # Want a balanced split for all the classes
for train_index, test_index in sss.split(tc_data_features_, tc_data_target_):
    print("Using {} for training and {} for validation".format(len(train_index), len(test_index)))
    X_train, X_test = tc_data_features_[train_index], tc_data_features_[test_index]
    y_train, y_test = tc_data_target_[train_index], tc_data_target_[test_index]

Using 3697 for training and 1233 for validation


In [14]:
from sklearn.naive_bayes import GaussianNB
NBModel = GaussianNB()
NBModel.fit(X_train,y_train)

GaussianNB(priors=None)

In [15]:
predictions = NBModel.predict(X_test)

In [16]:
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(y_test,predictions))
print(confusion_matrix(y_test,predictions))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00       913
          1       1.00      1.00      1.00       320

avg / total       1.00      1.00      1.00      1233

[[913   0]
 [  0 320]]
