<a href="https://colab.research.google.com/github/gupta4327/telecomChurnPredictionmodel/blob/main/Telecom_churn_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# <u><b> Objective </b></u>
## <b>To predict whether a customer will churn, based on the variables available in the Telco customer churn data. </b>



In [None]:
# Import every library and module used by the notebook in one place.
# Numerical arrays and tabular data handling.
import numpy as np
import pandas as pd
# Plotting backend.
from matplotlib import pyplot as plt
# scikit-learn: classifier, train/test splitting, evaluation metrics and feature scaling.
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,roc_curve,auc
from sklearn.preprocessing import StandardScaler
import math
import seaborn as sns
import warnings
# NOTE(review): this silences every warning category for the whole notebook,
# including pandas FutureWarning/DeprecationWarning messages that flag code which
# will stop working in newer versions; consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
# Attach Google Drive to the Colab runtime so files stored there can be read.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the Telco customer churn CSV from the mounted Drive into a DataFrame.
file_path = '/content/drive/MyDrive/Colab Notebooks/pandas/dat/Telco-Customer-Churn.csv'
dataset = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Preview the first five observations of the dataset.
dataset.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


***Data Dictionary***

1.   *Gender - Self-explanatory: whether the end user is male or female.*

2.   *Senior Citizen - Whether the end user is a senior citizen or not.*
 
3.   *Partner - Whether the user is single or has a partner (married/live-in)*

4.   *Dependents - If user has any dependents(yes or no)*

5. *Tenure - Number of months for which the user has been using the service*

6. *Phone Service - If user is also using phone services or only internet services*

7. *Multiple Lines - Whether the user has multiple lines or not (Yes, No or No phone service)*

8. *Internet Service - Kind of internet service the user is using (DSL, Fiber optic, No)*

9. *OnlineSecurity - Whether the customer has opted for online security or not (Yes or No)*

10. *OnlineBackup - Whether the customer has chosen online backup or not (Yes or No)*

11. *DeviceProtection - Customer has device protection or not (Yes or No)*

12. *TechSupport - If customer has taken the help of Tech Support* 

13. *Streaming TV - If customer has opted for TV streaming option* 

14. *Streaming Movies - If customer has opted for movie streaming option*

15. *Contract - Whether the user has chosen a month-to-month, one-year or two-year contract.*

16. *Paperless billing - Whether the customer has paperless billing or not (yes or no)*

17. *Payment method - Payment method customer uses - electronic check, mailed check, bank transfer or credit card*

18. *Monthly Charges - Monthly charge that a user is paying* 

19. *Total charges - Total charge that a customer has paid till now.*

20. *Churn - Whether the customer has churned or not*






In [None]:
# Print a per-column summary of the dataset: column names, non-null counts and dtypes.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [None]:
# Tally the missing (NaN/None) entries in every column.
dataset.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

*So there are no null values in our dataset. Time to look for duplicates.*

In [None]:
# Number of rows that are exact duplicates of an earlier row.
int(dataset.duplicated().sum())

0

*No duplicate rows are there*

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the dataset.
# describe() summarises only the numeric columns by default, so columns stored as
# object dtype do not appear in the resulting table.
dataset.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


*# If we check dataset.head() and dataset.info(), the feature TotalCharges should be of float type, but it is stored as object type, so we will convert it into float type.*

In [None]:
# Shape of the subset of rows whose TotalCharges entry is a single blank space.
blank_total_charges = dataset['TotalCharges'].eq(' ')
dataset.loc[blank_total_charges].shape

(11, 21)

*# Replacing space values with np.nan*

In [None]:
# Turn the blank (' ') placeholders in TotalCharges into real missing values (NaN).
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on dataset['TotalCharges']: that chained in-place call
# works on a temporary Series and leaves `dataset` unchanged when pandas
# Copy-on-Write is active (the default behaviour from pandas 3.0 onwards).
dataset['TotalCharges'] = dataset['TotalCharges'].replace({' ': np.nan})

In [None]:
# Re-run the blank-space check; zero matching rows confirms the blanks are gone.
remaining_blanks = dataset['TotalCharges'].eq(' ')
dataset.loc[remaining_blanks].shape

(0, 21)

*# so now we have dealt with space values. Now we'll convert the feature into float type*

In [None]:
# Cast the TotalCharges column to 64-bit floats and store it back on the frame.
total_charges_float = dataset['TotalCharges'].astype(np.float64)
dataset['TotalCharges'] = total_charges_float

In [None]:
# Share of rows with a missing TotalCharges value, expressed as a percentage.
missing_total_charges = dataset['TotalCharges'].isnull().sum()
row_count = len(dataset)
print('percent of Null values in TotalCharges : ', (missing_total_charges/row_count)*100)

percent of Null values in TotalCharges :  0.1561834445548772


*# As the percentage of null values is very small (about 0.16 percent), I'll go with dropping the rows that contain null values.*

In [None]:
# Remove, in place, every row that has a missing value in any column.
dataset.dropna(axis=0, how='any', inplace=True)

*# Now we'll visualize the stats table once again.*

In [None]:
# Descriptive statistics of the numeric columns, recomputed on the current state of
# the dataset (after the float conversion and the removal of rows with missing values).
dataset.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
count,7032.0,7032.0,7032.0,7032.0
mean,0.1624,32.421786,64.798208,2283.300441
std,0.368844,24.54526,30.085974,2266.771362
min,0.0,1.0,18.25,18.8
25%,0.0,9.0,35.5875,401.45
50%,0.0,29.0,70.35,1397.475
75%,0.0,55.0,89.8625,3794.7375
max,1.0,72.0,118.75,8684.8


In [None]:
# Dimensions of the dataset as a (rows, columns) tuple at this point in the notebook.
dataset.shape

(7032, 21)

*# Although SeniorCitizen appears as a numerical feature, it is actually a categorical feature with values 0 and 1.*

In [None]:
# Frequency of each distinct value found in the SeniorCitizen column.
senior_citizen_flags = dataset['SeniorCitizen']
senior_citizen_flags.value_counts()

0    5890
1    1142
Name: SeniorCitizen, dtype: int64

*# Now we are done with the basic data cleaning process, although techniques like outlier treatment haven't been applied yet. We'll look into that while moving forward. Now we will shift our focus to feature engineering and analysis.*