# Project Overview:
(Kaggle - credits to MEHMET URUC)

## Business Problem

### Context


The Telco customer churn dataset contains information about a telecom company in California that provides home phone and Internet services. It records which customers have left, stayed, or signed up for the service.



### Content

Each row represents a customer; each column contains one of the customer’s attributes, as described in the column metadata below.




### About Data


customerID: Customer ID
Demographic Information

gender: Whether the customer is a male or a female
SeniorCitizen: Whether the customer is a senior citizen or not (1, 0)
Partner: Whether the customer has a partner or not (Yes, No)
Dependents: Whether the customer has dependents or not (Yes, No)
Service Information

PhoneService: Whether the customer has a phone service or not (Yes, No)
MultipleLines: Whether the customer has multiple lines or not (Yes, No, No phone service)
InternetService: Customer’s internet service provider (DSL, Fiber optic, No)
OnlineSecurity: Whether the customer has online security or not (Yes, No, No internet service)
OnlineBackup: Whether the customer has online backup or not (Yes, No, No internet service)
DeviceProtection: Whether the customer has device protection or not (Yes, No, No internet service)
TechSupport: Whether the customer has tech support or not (Yes, No, No internet service)
StreamingTV: Whether the customer has streaming TV or not (Yes, No, No internet service)
StreamingMovies: Whether the customer has streaming movies or not (Yes, No, No internet service)
Customer account information

Contract: The contract term of the customer (Month-to-month, One year, Two year)
PaperlessBilling: Whether the customer has paperless billing or not (Yes, No)
PaymentMethod: The customer’s payment method (Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic))
MonthlyCharges: The amount charged to the customer monthly
TotalCharges: The total amount charged to the customer
tenure: Number of months the customer has stayed with the company
Churn

Churn: Whether the customer churned or not (Yes or No)





In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, cross_validate
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import warnings
# Silence every warning raised from here on. NOTE: this hides all warning
# categories, so remove it when debugging.
warnings.simplefilter(action="ignore")


# Display options: show all columns and all rows, use a wide console, and
# print floats with three decimals.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

ModuleNotFoundError: No module named 'lightgbm'

In [None]:
# Load the raw churn data; the relative path means the CSV is expected in the
# current working directory.
df = pd.read_csv("Telco-Customer-Churn.csv")
# Preview the first rows.
df.head()

## Exploratory Data Analysis

In [None]:
def check_df(dataframe, head=5):
    """Print a quick structural overview of ``dataframe``.

    The report contains, in order: shape, column dtypes, the first and last
    ``head`` rows, the number of missing values per column, a table of
    quantiles for the numeric columns, and the number of distinct values per
    column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to describe.
    head : int, default 5
        Number of rows shown in the head and tail previews.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print("############################################################## Shape ###############################################################")
    print(dataframe.shape)
    print("############################################################### Types ###############################################################")
    print(dataframe.dtypes)
    print("############################################################### Head ###############################################################")
    print(dataframe.head(head))
    print("############################################################### Tail ###############################################################")
    print(dataframe.tail(head))
    print("############################################################### NA ##############################################################")
    print(dataframe.isnull().sum())
    print("############################################################### Quantiles ###############################################################")
    # Quantiles are only defined for numeric data. Since pandas 2.0 the default
    # is numeric_only=False, so object columns (e.g. customerID) would raise a
    # TypeError here unless they are excluded explicitly.
    quantile_levels = [0.05, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 0.99]
    print(dataframe.quantile(quantile_levels, numeric_only=True).T)
    print("############################################################### NUMBERUNIQE ###############################################################")
    print(dataframe.nunique())

In [None]:
# Print the structural overview of the raw data.
check_df(df)


In [None]:
# Convert TotalCharges from object to a numeric dtype. With errors='coerce',
# values that cannot be parsed as numbers become NaN instead of raising.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'],errors='coerce')


In [None]:
# Encode the target as integers: "Yes" -> 1, anything else -> 0.
# NOTE: not idempotent - re-running this cell on the already encoded column
# sets every row to 0.
df["Churn"] = df["Churn"].eq("Yes").astype("int64")

In [None]:
# Drop, in place, every row that has at least one missing value (this includes
# rows whose TotalCharges was coerced to NaN during the numeric conversion).
df.dropna(inplace=True)

In [None]:
# Show dtypes and non-null counts after the conversions above.
df.info()

In [None]:
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """Split the columns of ``dataframe`` into categorical, numeric and
    high-cardinality groups, and print the size of each group.

    Rules, applied per column:

    * object dtype -> categorical, unless it has more than ``car_th`` distinct
      values, in which case it is reported as "categorical but cardinal";
    * non-object dtype with fewer than ``cat_th`` distinct values ->
      categorical ("numeric but categorical");
    * every other non-object column -> numeric.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are classified.
    cat_th : int, default 10
        Non-object columns with fewer distinct values than this count as
        categorical.
    car_th : int, default 20
        Object columns with more distinct values than this count as cardinal.

    Returns
    -------
    tuple of (list, list, list)
        ``cat_cols``, ``num_cols`` and ``cat_but_car``. Each list follows the
        column order of the frame; in ``cat_cols`` the object columns come
        first, followed by the numeric-but-categorical ones.
    """
    object_cols, non_object_cols = [], []
    num_but_cat, cat_but_car = [], []

    # One pass over the columns, bucketing each by dtype and cardinality.
    for col in dataframe.columns:
        n_distinct = dataframe[col].nunique()
        if dataframe[col].dtypes == "O":
            object_cols.append(col)
            if n_distinct > car_th:
                cat_but_car.append(col)
        else:
            non_object_cols.append(col)
            if n_distinct < cat_th:
                num_but_cat.append(col)

    cat_cols = [col for col in object_cols + num_but_cat if col not in cat_but_car]
    num_cols = [col for col in non_object_cols if col not in num_but_cat]

    n_rows, n_columns = dataframe.shape[0], dataframe.shape[1]
    print(f"Observations: {n_rows}")
    print(f"Variables: {n_columns}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')
    return cat_cols, num_cols, cat_but_car

In [None]:
# Classify the columns of the cleaned frame with the default thresholds.
cat_cols, num_cols, cat_but_car = grab_col_names(df)


#### Categoric Variables Analysis

In [None]:
def cat_summary(dataframe, col_name, plot=True):
    """Print the frequency table of a categorical column and optionally plot it.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that holds the column.
    col_name : str
        Column to summarise.
    plot : bool, default True
        When true, also draw a seaborn count plot and show it (blocking).

    Returns
    -------
    None
        The table (absolute counts plus percentage of all rows) and a
        separator line are written to standard output.
    """
    counts = dataframe[col_name].value_counts()
    percentages = 100 * counts / len(dataframe)
    print(pd.DataFrame({col_name: counts, "Ratio": percentages}))
    print("##########################################")

    if not plot:
        return
    sns.countplot(x=dataframe[col_name], data=dataframe)
    plt.show(block=True)

In [None]:
# Frequency table and count plot for every categorical column.
for col in cat_cols:
    cat_summary(df,col,plot=True)

#### Numeric Variables Analysis

In [None]:
def num_summary(dataframe, numerical_col, plot=False):
    """Print descriptive statistics of a numeric column and optionally plot it.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that holds the column.
    numerical_col : str
        Column to describe.
    plot : bool, default False
        When true, also show a 20-bin histogram labelled with the column name.

    Returns
    -------
    None
        The ``describe`` output (with the percentiles listed below) is written
        to standard output.
    """
    percentiles = [0.05, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 0.99]
    column = dataframe[numerical_col]
    print(column.describe(percentiles).T)

    if not plot:
        return
    column.hist(bins=20)
    plt.xlabel(numerical_col)
    plt.title(numerical_col)
    plt.show()

In [None]:
# Descriptive statistics and histogram for every numeric column.
for col in num_cols:
    num_summary(df, col, plot=True)

#### ANALYSIS OF NUMERICAL VARIABLES ACCORDING TO TARGET



In [None]:
def target_summary_with_num(dataframe, target, numerical_col):
    """Print the mean of ``numerical_col`` for each value of ``target``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that holds both columns.
    target : str
        Column to group by.
    numerical_col : str
        Column whose per-group mean is reported.

    Returns
    -------
    None
        The table is written to standard output, followed by blank lines.
    """
    per_group = dataframe.groupby(target)
    group_means = per_group.agg({numerical_col: "mean"})
    print(group_means, end="\n\n\n")

In [None]:
# Mean of every numeric column, split by churn status.
for col in num_cols:
    target_summary_with_num(df, "Churn", col)

def target_summary_with_cat(dataframe, target, categorical_col):
    """Print, for each level of ``categorical_col``, the mean of ``target``
    together with the level's row count and its share of all rows.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that holds both columns.
    target : str
        Column averaged within each level.
    categorical_col : str
        Column whose levels define the groups.

    Returns
    -------
    None
        The column name and the table are written to standard output,
        followed by blank lines.
    """
    print(categorical_col)
    level_counts = dataframe[categorical_col].value_counts()
    summary = pd.DataFrame({
        "TARGET_MEAN": dataframe.groupby(categorical_col)[target].mean(),
        "Count": level_counts,
        "Ratio": 100 * level_counts / len(dataframe),
    })
    print(summary, end="\n\n\n")

In [None]:
# Churn rate, count and share for every level of every categorical column.
for col in cat_cols:
    target_summary_with_cat(df, "Churn", col)

In [None]:
# Pairwise correlation between the numeric columns.
df[num_cols].corr()

In [None]:
# Correlation of every numeric/boolean column with the target, strongest
# positive first. The frame still contains object columns at this point
# (e.g. customerID, gender), and since pandas 2.0 `corrwith` no longer skips
# them silently, so the non-numeric columns are excluded explicitly.
df.select_dtypes(include=["number", "bool"]).corrwith(df["Churn"]).sort_values(ascending=False)

In [None]:
def correlation_matrix(df, cols):
    """Show an annotated heatmap of the pairwise correlations among ``cols``.

    Only numeric and boolean columns take part in the correlation; any other
    column listed in ``cols`` is ignored. This lets the function be called with
    ``df.columns`` even while the frame still holds object columns, which would
    otherwise make ``DataFrame.corr`` raise on pandas >= 2.0.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    cols : list-like of column labels
        Candidate columns for the matrix.

    Returns
    -------
    None
        The figure is shown with a blocking ``plt.show``.
    """
    correlations = df[cols].select_dtypes(include=["number", "bool"]).corr()

    fig = plt.gcf()
    fig.set_size_inches(12, 10)
    plt.xticks(fontsize=10)
    plt.yticks(fontsize=10)
    sns.heatmap(correlations, annot=True, linewidths=0.5, annot_kws={'size': 12}, linecolor='w', cmap='RdBu')
    plt.show(block=True)

In [None]:
# Heatmap over all columns of the frame.
correlation_matrix(df, df.columns)

## Data Preprocessing & Feature Engineering


In [None]:
def outlier_thresholds(dataframe, col_name, q1=0.05, q3=0.95):
    """Return the lower and upper outlier limits of a column.

    The limits are the ``q1`` / ``q3`` quantiles pushed outwards by 1.5 times
    the distance between those two quantiles.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that holds the column.
    col_name : str
        Numeric column to analyse.
    q1, q3 : float, default 0.05 and 0.95
        Lower and upper quantile levels.

    Returns
    -------
    tuple of (float, float)
        ``(low_limit, up_limit)``.
    """
    values = dataframe[col_name]
    lower_quantile, upper_quantile = (values.quantile(level) for level in (q1, q3))
    margin = 1.5 * (upper_quantile - lower_quantile)
    return lower_quantile - margin, upper_quantile + margin

In [None]:
def replace_with_thresholds(dataframe, variable, q1=0.05, q3=0.95):
    """Clip the outliers of ``variable`` to the outlier limits, in place.

    Values below the lower limit are set to the lower limit and values above
    the upper limit are set to the upper limit. The limits come from
    ``outlier_thresholds`` and are computed with the caller's ``q1`` / ``q3``
    (previously the defaults were always used, whatever was passed in).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify.
    variable : str
        Numeric column to clip.
    q1, q3 : float, default 0.05 and 0.95
        Quantile levels forwarded to ``outlier_thresholds``.

    Returns
    -------
    None
        ``dataframe`` is mutated.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, variable, q1=q1, q3=q3)
    # NOTE(review): the limits are floats; writing them into an integer column
    # may warn or fail on recent pandas versions - confirm for integer features.
    dataframe.loc[(dataframe[variable] < low_limit), variable] = low_limit
    dataframe.loc[(dataframe[variable] > up_limit), variable] = up_limit

In [None]:
def check_outlier(dataframe, col_name):
    """Report whether ``col_name`` has any value outside its outlier limits.

    The limits come from ``outlier_thresholds`` with its default quantiles.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that holds the column.
    col_name : str
        Numeric column to test.

    Returns
    -------
    bool
        True when at least one value lies above the upper or below the lower
        limit, False otherwise (including for an empty column).
    """
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    column = dataframe[col_name]
    # Reduce the row mask itself. Calling .any(axis=None) on the filtered frame
    # tests whether the matching rows contain truthy cell values, not whether
    # any matching rows exist.
    is_outlier = (column > up_limit) | (column < low_limit)
    return bool(is_outlier.any())

In [None]:
# Report, per numeric column, whether it has values outside the outlier limits.
for col in num_cols:
    print(col, check_outlier(df, col))
   

#### BASE MODEL



In [None]:
# Keep the target out of the list of columns to be one-hot encoded.
cat_cols = [col for col in cat_cols if col not in ["Churn"]]


In [None]:
# Work on a copy for the baseline models so `df` stays untouched for the
# feature engineering further down.
df_base = df.copy()


In [None]:
def one_hot_encoder(dataframe, categorical_cols, drop_first=False):
    """Return a copy of ``dataframe`` with ``categorical_cols`` one-hot encoded.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame; it is not modified.
    categorical_cols : list of str
        Columns replaced by indicator columns.
    drop_first : bool, default False
        Drop the first level of every encoded column.

    Returns
    -------
    pandas.DataFrame
        The encoded frame produced by ``pandas.get_dummies``.
    """
    return pd.get_dummies(dataframe, drop_first=drop_first, columns=categorical_cols)

In [None]:
# One-hot encode the categorical columns of the baseline frame, dropping the
# first level of each.
df_base = one_hot_encoder(df_base, cat_cols, drop_first=True)

In [None]:
# Target vector and feature matrix for the baseline models; customerID is
# dropped together with the target.
y = df_base["Churn"]
X = df_base.drop(["Churn","customerID"], axis=1)

# Candidate classifiers as (label, estimator) pairs. Every estimator that
# accepts one gets the same random_state.
models = [('LR', LogisticRegression(random_state=12345)),
          ('KNN', KNeighborsClassifier()),
          ('CART', DecisionTreeClassifier(random_state=12345)),
          ('RF', RandomForestClassifier(random_state=12345)),
          ('XGB', XGBClassifier(random_state=12345)),
          ("LightGBM", LGBMClassifier(random_state=12345)),
          ("CatBoost", CatBoostClassifier(verbose=False, random_state=12345))]

In [None]:
# 10-fold cross-validation of every baseline model; the mean of each metric is
# printed rounded to four decimals.
scoring_metrics = ["accuracy", "f1", "roc_auc", "precision", "recall"]
# (printed label, key in the cross_validate result), in output order.
report_rows = [("Accuracy", "test_accuracy"),
               ("Auc", "test_roc_auc"),
               ("Recall", "test_recall"),
               ("Precision", "test_precision"),
               ("F1", "test_f1")]

for name, model in models:
    cv_results = cross_validate(model, X, y, cv=10, scoring=scoring_metrics)
    print(f"########## {name} ##########")
    for label, result_key in report_rows:
        print(f"{label}: {round(cv_results[result_key].mean(), 4)}")

In [None]:
# --- Payment method ---------------------------------------------------------
# Collapse the four payment methods into "Check" (electronic or mailed check)
# versus "Automatic" (everything else). The mask is built once and negated as a
# whole: `~a | b` parses as `(~a) | b`, which previously relabelled the
# "Mailed check" rows as "Automatic".
is_check_payment = df["PaymentMethod"].isin(["Electronic check", "Mailed check"])
df.loc[is_check_payment, "PaymentMethodUnified"] = "Check"
df.loc[~is_check_payment, "PaymentMethodUnified"] = "Automatic"

# --- Number of subscribed services ------------------------------------------
# The yes/no service columns count when they equal 'Yes'. InternetService holds
# the provider type (DSL / Fiber optic / No) and is never 'Yes', so it is
# counted separately as "anything other than 'No'".
yes_no_service_cols = ['PhoneService', 'OnlineSecurity', 'OnlineBackup',
                       'DeviceProtection', 'TechSupport',
                       'StreamingTV', 'StreamingMovies']
df['NEW_TotalServices'] = ((df[yes_no_service_cols] == 'Yes').sum(axis=1)
                           + (df['InternetService'] != 'No').astype(int))


# --- Customer account information -------------------------------------------
# Month-to-month customers have no long-term commitment; one- and two-year
# contracts count as a subscription.
# NOTE(review): the column name "Subscrition" is misspelled; it is kept as-is
# because code outside this cell may reference it.
df.loc[(df["Contract"] == "Month-to-month"), "Subscrition"] = "NoSubs"
df.loc[((df["Contract"] == "One year") | (df["Contract"] == "Two year")), "Subscrition"] = "Subs"

# Quartile buckets for tenure and charges.
df["NewTenure"] = pd.qcut(df["tenure"], 4, labels=["Level1", "Level2", "Level3", "Level4"])

# NOTE(review): the next two column names end with a stray ":"; kept as-is for
# the same compatibility reason as above.
df["NewMonthlyCharges:"] = pd.qcut(df["MonthlyCharges"], 4, labels=["Level1", "Level2", "Level3", "Level4"])

df["NewTotalCharges:"] = pd.qcut(df["TotalCharges"], 4, labels=["Level1", "Level2", "Level3", "Level4"])

# Tenure in whole-year bands; tenures outside 0-72 months stay missing.
df.loc[(df["tenure"]>=0) & (df["tenure"]<=12),"NewTenureYear"] = "0-1 Year"
df.loc[(df["tenure"]>12) & (df["tenure"]<=24),"NewTenureYear"] = "1-2 Year"
df.loc[(df["tenure"]>24) & (df["tenure"]<=36),"NewTenureYear"] = "2-3 Year"
df.loc[(df["tenure"]>36) & (df["tenure"]<=48),"NewTenureYear"] = "3-4 Year"
df.loc[(df["tenure"]>48) & (df["tenure"]<=60),"NewTenureYear"] = "4-5 Year"
df.loc[(df["tenure"]>60) & (df["tenure"]<=72),"NewTenureYear"] = "5-6 Year"

# 1 when the customer pays by an automatic method, 0 otherwise.
df["NewAutoPayment"] = df["PaymentMethod"].apply(lambda x: 1 if x in ["Bank transfer (automatic)","Credit card (automatic)"] else 0)


In [None]:
# Re-classify the columns so the newly engineered features are included.
cat_cols, num_cols, cat_but_car = grab_col_names(df)


In [None]:
def label_encoder(dataframe, binary_col):
    """Replace ``binary_col`` with the integer codes of a fresh ``LabelEncoder``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify; the column is overwritten in place.
    binary_col : str
        Column to encode.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object, returned for convenience.
    """
    encoded_values = LabelEncoder().fit_transform(dataframe[binary_col])
    dataframe[binary_col] = encoded_values
    return dataframe


In [None]:
# Object columns with exactly two distinct values are label-encoded to integers.
binary_cols = [col for col in df.columns if df[col].dtypes == "O" and df[col].nunique() == 2]


for col in binary_cols:
    df = label_encoder(df, col)

# Check the resulting dtypes.
df.info()

In [None]:
# Keep the target out of the columns that will be one-hot encoded.
# Disabled alternative that would also skip the label-encoded binary columns:
#cat_cols = [col for col in cat_cols if col not in binary_cols and col not in ["Churn"]]
#cat_cols
cat_cols = [col for col in cat_cols if  col not in ["Churn"]]
cat_cols

In [None]:
# One-hot encode the categorical columns of the engineered frame, dropping the
# first level of each, and preview the result.
df = one_hot_encoder(df, cat_cols, drop_first=True)
df.head()