 ## Steps involved to successfully predict customer churn:
   ### 1. Data Preprocessing
   ### 2. Data Evaluation
   ### 3. Model Selection
   ### 4. Model Evaluation
   ### 5. Model Improvement

### Data Preprocessing

In [1]:
# Importing the libraries

import pandas as pd
import seaborn as sn
import numpy as np
import matplotlib.pylab as pylab
import matplotlib.pyplot as plt

import random
import os
import string
import time
from datetime import datetime
from time import time

#SimpleImuter class for handling the missing data in predictive model dataset. Replaces NaN values with a specified placeholder.
from sklearn.impute import SimpleImputer
#To convert categorical text data into model-understandable numerical data, we use the Label Encoder class
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [2]:
data = pd.read_csv("churn_data.csv")
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
data.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [4]:
data.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


##### From the above details we can say that customers stay for around 32 months and pay 64 dollars monthly.

In [5]:
data.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [6]:
data.columns.to_series().groupby(data.dtypes).groups

{int64: ['SeniorCitizen', 'tenure'], float64: ['MonthlyCharges'], object: ['customerID', 'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'TotalCharges', 'Churn']}

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [8]:
data.isna().any()

customerID          False
gender              False
SeniorCitizen       False
Partner             False
Dependents          False
tenure              False
PhoneService        False
MultipleLines       False
InternetService     False
OnlineSecurity      False
OnlineBackup        False
DeviceProtection    False
TechSupport         False
StreamingTV         False
StreamingMovies     False
Contract            False
PaperlessBilling    False
PaymentMethod       False
MonthlyCharges      False
TotalCharges        False
Churn               False
dtype: bool

In [9]:
data["PaymentMethod"].nunique()

4

In [10]:
data["PaymentMethod"].unique()

array(['Electronic check', 'Mailed check', 'Bank transfer (automatic)',
       'Credit card (automatic)'], dtype=object)

In [11]:
data["Contract"].nunique()

3

In [12]:
data["Contract"].unique()

array(['Month-to-month', 'One year', 'Two year'], dtype=object)

In [13]:
data["Churn"].value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

### Cleaning the dataset
##### Converting object to float64(numeric datatype)

In [14]:
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'],errors='coerce')

In [15]:
data['TotalCharges']

0         29.85
1       1889.50
2        108.15
3       1840.75
4        151.65
         ...   
7038    1990.50
7039    7362.90
7040     346.45
7041     306.60
7042    6844.50
Name: TotalCharges, Length: 7043, dtype: float64

In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [17]:
data.isna().any()

customerID          False
gender              False
SeniorCitizen       False
Partner             False
Dependents          False
tenure              False
PhoneService        False
MultipleLines       False
InternetService     False
OnlineSecurity      False
OnlineBackup        False
DeviceProtection    False
TechSupport         False
StreamingTV         False
StreamingMovies     False
Contract            False
PaperlessBilling    False
PaymentMethod       False
MonthlyCharges      False
TotalCharges         True
Churn               False
dtype: bool

## Finding the missing values and filling it with the mean

In [18]:
na_columns = data.isna().any()
na_columns
na_columns = na_columns[na_columns == True].reset_index()
na_columns = na_columns["index"].tolist()

In [19]:
for col in data.columns[1:]:
    if col in na_columns:
        if data[col].dtype != 'object':
            data[col] = data[col].fillna(data[col].mean()).round(0)

In [20]:
data.isna().any()

customerID          False
gender              False
SeniorCitizen       False
Partner             False
Dependents          False
tenure              False
PhoneService        False
MultipleLines       False
InternetService     False
OnlineSecurity      False
OnlineBackup        False
DeviceProtection    False
TechSupport         False
StreamingTV         False
StreamingMovies     False
Contract            False
PaperlessBilling    False
PaymentMethod       False
MonthlyCharges      False
TotalCharges        False
Churn               False
dtype: bool

## Label Encoding the Binary data

In [21]:
#Create a label encoder object
le = LabelEncoder()
# Label Encoding will be used for columns with 2 or less unique 
le_count = 0
for col in data.columns[1:]:
    if data[col].dtype == 'object':
        if len(list(data[col].unique())) <= 2:
            le.fit(data[col])
            data[col] = le.transform(data[col])
            le_count += 1
print('{} columns were label encoded.'.format(le_count))

6 columns were label encoded.


In [22]:
print(l_count, "columns were label encoded")

NameError: name 'l_count' is not defined

## Data Evaluation/ Exploratory data analysis

### 1. Plotting histogram of Numeric columns

In [None]:
data.dtypes

In [None]:
data[['SeniorCitizen','tenure','MonthlyCharges','TotalCharges']].describe()

#### From the above data we can say that people after a tenure of 32 months, having monthly charges around 64 get churned.

In [None]:
# Finding the number of customers that churned
data['Churn'].value_counts()

#### Here 5174 people have not churned while 1869 people have churned.Let’s show this count visually using a bar plot.

In [None]:
sn.countplot(data['Churn'])

In [None]:
# Getting the percentage of people who left.

retained_cust = data[data.Churn == 0]
left_cust = data[data.Churn == 1]

retained = retained_cust.shape[0]
left = left_cust.shape[0]

retained_per = (retained/(retained + left)) * 100
left_per = (left/(retained + left)) * 100

In [None]:
print(retained_per, "people did not leave.")
print(left_per, "People left the company.")


#### So, about 73.46% of the customers stayed or were retained and about 26.54% of the customers churned. 

In [None]:
sn.countplot(x='gender', hue="Churn", data=data)


#### From the plot above, it looks like gender does not play a role in customer churn.

In [None]:
sn.countplot(x='InternetService', hue='Churn', data=data)

#### The conclusion that can be made here is Maybe the company should only provide DSL as the internet service or stop providing Fiber optics for it’s internet service.

In [None]:
numerical_features = ['tenure','MonthlyCharges']
fig, ax = plt.subplots(2,1, figsize=(20, 18))
data[data.Churn == 1][numerical_features].hist(bins=20, color="red",alpha=0.5, ax=ax)
data[data.Churn == 0][numerical_features].hist(bins=20, color="blue", alpha=0.3, ax=ax)

#### The monthly charges chart shows that most of the loyal customers that stayed with the company had a monthly charge between $20 and $30. Most of the customers that churned had a monthly charge of $70 to $100. Maybe the company should lower the monthly charges to retain customers.
#### From the chart, I can see that most of the customers that churned had between 1 and 9 months with the company, while most of the retained customers had a tenure between 24 and 72 months which is 2 to 6 years. So, it may be in the companies best interest to try everything they can to keep their customers for at least 2 years.

In [None]:
data.shape

In [None]:
data.dtypes

## Converting all the non-numeric columns to numerical data types

In [None]:
cleaned_data = data.drop('customerID',axis=1)

In [None]:
for column in cleaned_data.columns:
    if cleaned_data[column].dtypes == np.number:
        continue
    else:
        cleaned_data[column] = LabelEncoder().fit_transform(cleaned_data[column])
        

In [None]:
cleaned_data.head(10)

In [None]:
cleaned_data.dtypes

## Scaling the data

In [None]:
X = cleaned_data.drop('Churn',axis =1)
Y = cleaned_data["Churn"]

In [None]:
X = StandardScaler().fit_transform(X)

In [None]:
x_train, x_test, y_train, y_test = train_test_split(X, Y,test_size = 0.2, random_state = 42)

In [None]:
# Create the model
model = LogisticRegression()
model.fit(x_train, y_train)

In [None]:
predictions = model.predict(x_test)
print(predictions)

In [None]:
print(classification_report(y_test, predictions))