# Telecom Customer Churn  Project

## Data: 
* We have each row representing each customer.
* Customers who left the service is categorized as customer churn
* Services that each customer has signed up for – phone, multiple lines, internet, online * security, online backup, device protection, tech support, and streaming TV and movies
* Customer account information – how long they’ve been a customer, contract, payment method, paperless billing, monthly charges, and total charges
* Demographic info about customers – gender, age range, and if they have partners and dependents
* Churn - dependent feature ('Yes' denotes customers left, 'No' denotes customer stay here)

source: https://www.kaggle.com/datasets/blastchar/telco-customer-churn

## Importing Libraries

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import scipy.stats as stats
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import recall_score, accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN
from sklearn.feature_selection import SelectKBest
#from collections import Counter

# ignore warning
import warnings
warnings.filterwarnings('ignore')
import matplotlib.ticker as mtick

## Loading the Dataset

In [2]:
data = pd.read_csv("data/Telco-Customer-Churn.csv")


### Data Lookup

In [3]:
data.head()


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
print(data.dtypes)
print(data.shape)

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object
(7043, 21)


## Exploratory Data Analysis

In [5]:
# Lets look at the target variable

sns.countplot(data,x="Churn")
plt.title("Distribution of Churn")
plt.show()

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
# Get the percentage of churn
data["Churn"].value_counts()/data["Churn"].value_counts().sum()

### Numerical Analysis

In [None]:
print(data["TotalCharges"].value_counts())
# We spotted a blank in total_charges 

data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
# We change the datatype to numeric

# from the data, getting the numerical columns
n_cols = [i for i in data.columns if data[i].dtypes != 'O']
print(n_cols)

print(data['SeniorCitizen'].value_counts())
# We saw that Senior Citizen is 0 and 1 representing no and yes, so it is not the numerical variable, we will remove it.

n_cols.remove('SeniorCitizen')
print(n_cols)

In [None]:
# Observing the distribution underneath the Continuous variables

for feature in n_cols:
    plt.figure(figsize=(15,7))

    plt.subplot(1, 3, 1)
    sns.histplot(data=data, x=feature, bins=30, kde=True)
    plt.title('Histogram')

    plt.subplot(1, 3, 2)
    stats.probplot(data[feature], dist="norm", plot=plt)
    plt.ylabel('RM quantiles')

    plt.subplot(1, 3, 3)
    sns.boxplot(y=data[feature])
    plt.title('Boxplot')
    
plt.show()

In [None]:
# distribution with our target variable
sns.pairplot(data.drop(columns='SeniorCitizen'),hue='Churn', kind='scatter')
plt.show()

### Inference:
* Majority of the data has either tenure of 0 or 70. Distribution is not normal
* Majority of users pay 20 as a Monthly and roughtly 1500 as Total charges
* No Major outliers detected in the dataset

In [None]:
# pie chart for Count of Senior citizens
ax = (data['SeniorCitizen'].value_counts()/len(data)).plot.pie(autopct='%.1f%%', labels = ['No Churn', 'Churned'],figsize =(5,5), fontsize = 12)                                                                           
ax.yaxis.set_major_formatter(mtick.PercentFormatter())

ax.set_title('% of Senior Citizens', fontsize = 12)

In [None]:
### Bivariate analysis

# categorical columns
c_cols = [i for i in data.columns if data[i].dtype == 'O']
c_cols.remove('customerID')

for i, feature in enumerate(c_cols):
    plt.figure(i)
    plt.figure(figsize=(12,6))
    sns.countplot(data=data, x=feature, hue='Churn')
plt.show()

# We will remove churn since it is a categorical variable
c_cols.remove("Churn")

### Inference:
* Some features that are having dispropotionate churn  -- Partner, dependent, phoneservice, fiberoptics, deviceprotection, techsupport electroniccheck, and contract. It is key to observe these features.

## Missing Values

In [None]:
data.isnull().sum()

In [None]:
# Let's analyze total charges
sns.distplot(data["TotalCharges"])

# We will fill in the missing values by Median since the distribution is not the same.
data["TotalCharges"] = data["TotalCharges"].fillna(data["TotalCharges"].median())



In [None]:
Churn = data[data.Churn=="Yes"]
Not_Churn = data[data.Churn=="No"]


fig, ax = plt.subplots()
sns.kdeplot(Churn["TotalCharges"],label = "Churn", ax= ax)
sns.kdeplot(Not_Churn["TotalCharges"], label = "Not Churn", ax=ax)
ax.set_xlabel("Total Charges");



In [None]:
sns.kdeplot(Churn["MonthlyCharges"], label = "Churn")
sns.kdeplot(Not_Churn["MonthlyCharges"], label = "Not Churn")

In [None]:
data.head()

In [None]:
# To observe a dependencies on the categorical vs categorical variables, we will use chi-square test
# We will convert two columns into numpy array

# We will store all the p_values from chisquare test
p_vals={}
for i in c_cols:
    observed = pd.crosstab(data[i], data['Churn'])
    
    chi2, p, dof, expected = stats.chi2_contingency(observed)
    
    p_vals[i] = [p]
p_vals = pd.DataFrame(p_vals)
p_vals = p_vals.T
p_vals = p_vals.rename(columns={0:"p_val"})
x = p_vals.sort_values(by="p_val", ascending=True)
x


In [None]:
df = data.copy()
def dummies(x,df):
    temp = pd.get_dummies(df[x], prefix=x, drop_first = True)
    df = pd.concat([df, temp], axis = 1)
    df.drop([x], axis = 1, inplace = True)
    return df

for i in c_cols:
    df = dummies(i, df)

for i in data.columns:
    df = df.rename(columns={i:i.replace(" ","_")})

In [None]:
df.dtypes

In [None]:
n_cols = [x for x in df.columns if df[x].dtype == "bool"]

In [None]:
p_vals={}
for i in n_cols:
    observed = pd.crosstab(df[i], df['Churn'])
    
    chi2, p, dof, expected = stats.chi2_contingency(observed)
    
    p_vals[i] = [p]
p_vals = pd.DataFrame(p_vals)
p_vals = p_vals.T
p_vals = p_vals.rename(columns={0:"p_val"})
x = p_vals.sort_values(by="p_val", ascending=True)
x

In [None]:
# Output Final Model after EDA
def datapreparation(filepath):
    
    df = pd.read_csv(filepath)
    df.drop(["customerID"], inplace = True, axis = 1)
    
    df.TotalCharges = df.TotalCharges.replace(" ",np.nan)
    df.TotalCharges.fillna(0, inplace = True)
    df.TotalCharges = df.TotalCharges.astype(float)
    
    cols1 = ['Partner', 'Dependents', 'PaperlessBilling', 'Churn', 'PhoneService']
    for col in cols1:
        df[col] = df[col].apply(lambda x: 0 if x == "No" else 1)
   
    df.gender = df.gender.apply(lambda x: 0 if x == "Male" else 1)
    df.MultipleLines = df.MultipleLines.map({'No phone service': 0, 'No': 0, 'Yes': 1})
    
    cols2 = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']
    for col in cols2:
        df[col] = df[col].map({'No internet service': 0, 'No': 0, 'Yes': 1})
    
    df = pd.get_dummies(df, columns=['InternetService', 'Contract', 'PaymentMethod'], drop_first=True)
    
    return df

df = datapreparation(filepath = "data/Telco-Customer-Churn.csv")
df.to_csv("data/cleaned_data.csv", index=False)