Import Libraries

In [2]:
# data processing
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

# cross validation
from sklearn.model_selection import train_test_split

#ML algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.svm import SVC
# NOTE(review): the modelling cell also references XGBClassifier (xgboost),
# which is not imported anywhere in the visible cells -- add
# `from xgboost import XGBClassifier` here if that package is available.

# import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score

# model optimization
from sklearn.model_selection import GridSearchCV

In [6]:
# Location of the raw CSV, relative to the notebook's working directory.
csv = r"../raw/Test.csv"

# Parse the file into the working frame that the following cells transform.
df = pd.read_csv(filepath_or_buffer=csv)

# Bare last expression: rich display of the frame (pandas truncates long frames).
df


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [None]:
# Values that cannot be parsed as numbers become NaN (errors='coerce') instead of raising.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors='coerce')

# Object dtype so this flag is selected together with the other object columns
# by the later `select_dtypes(include=['object'])` call.
df["SeniorCitizen"] = df["SeniorCitizen"].astype("O")

# Encode the target: "Yes" -> 1, everything else -> 0.
# An already-encoded 1 is accepted as well, so re-running this cell keeps the
# labels intact instead of collapsing every row to 0.
df["Churn"] = df["Churn"].isin(["Yes", 1]).astype("int64")

In [None]:
def check_col(name, data):
    """Return the distinct values of ``data[name]`` in order of first appearance.

    Parameters
    ----------
    name : hashable
        Key (e.g. a column label) looked up with ``data[name]``.
    data : mapping-like
        Any object supporting ``data[name]`` whose result can be iterated and
        yields hashable values.

    Returns
    -------
    list
        Each distinct value once, ordered by its first occurrence.

    Raises
    ------
    KeyError
        If ``name`` is not present in ``data`` (propagated from the lookup).
    TypeError
        If a value is unhashable.
    """
    # dict keys keep insertion order, giving an order-preserving de-duplication
    # in a single pass (no repeated list scans, no unused counters).
    return list(dict.fromkeys(data[name]))

In [None]:
# Columns removed from the analysis.
to_drop = ["tenure", "OnlineBackup", "DeviceProtection","Contract", "PaperlessBilling", "PaymentMethod"]
# errors="ignore" lets this cell be re-run after the columns are already gone.
df = df.drop(columns=to_drop, errors="ignore")

# Must run before the frame-wide replace below, otherwise "No" would become 0.
# NOTE(review): 0.7 for "No" is an unexplained magic weight -- confirm it is intended.
df["MultipleLines"] = df["MultipleLines"].replace({"No": 0.7, "No phone service": 0})

# Frame-wide binary encoding of the Yes/No and gender labels.
# NOTE(review): other labels (e.g. "No internet service", if present) are not
# mapped here and stay as strings.
df = df.replace({"Yes": 1, "No": 0, "Male": 1, "Female": 0})

# Distinct InternetService values after the replace above (kept for inspection).
types = check_col("InternetService", df)

# Collapse both service kinds into a single "has internet" flag. The labels are
# named explicitly and applied to this column only: looking them up by position
# (types[0], types[1]) in a frame-wide replace depends on row order and could
# rewrite every 0 in the frame -- including the target -- if the encoded "No"
# were among the first two distinct values.
df["InternetService"] = df["InternetService"].replace({"DSL": 1, "Fiber optic": 1})

# Missing values per column (TotalCharges may hold NaN from the numeric coercion).
df.isna().sum()

In [None]:
# Object-dtype columns are treated as categorical; this list is reused by later cells.
categorical_features = df.select_dtypes(include=['object']).columns

# countplot draws the category counts of ONE variable, so each column gets its
# own axes. Columns with many distinct values (identifier-like) are skipped
# because a bar per value is unreadable.
MAX_PLOT_CATEGORIES = 20
plot_cols = [col for col in categorical_features if df[col].nunique() <= MAX_PLOT_CATEGORIES]

if plot_cols:
    n_cols = 3
    n_rows = -(-len(plot_cols) // n_cols)  # ceiling division
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows), squeeze=False)
    for ax, col in zip(axes.flat, plot_cols):
        # Cast to str so columns mixing numbers and strings plot as plain categories.
        sns.countplot(x=df[col].astype(str), ax=ax)
        ax.set_title(col)
        ax.set_xlabel("")
        ax.tick_params(axis="x", rotation=30)
    # Hide the unused axes of the grid.
    for ax in axes.flat[len(plot_cols):]:
        ax.set_visible(False)
    fig.tight_layout()
    plt.show()

In [None]:
# Every int64/float64 column counts as numeric for the pairwise scatter matrix.
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns

# One panel per pair of numeric columns, with distributions on the diagonal.
sns.pairplot(data=df.loc[:, numeric_features])
plt.show()

In [None]:
# Drop categorical columns whose rarest category covers less than
# MIN_CATEGORY_SHARE of the rows. Columns where every value is rare
# (e.g. near-unique identifiers) are removed by the same rule.
MIN_CATEGORY_SHARE = 0.1

to_drop = [
    col
    for col in categorical_features
    # Skip columns that are already gone so the cell can be re-run safely.
    if col in df.columns
    and df[col].value_counts(normalize=True).min() < MIN_CATEGORY_SHARE
]
df = df.drop(columns=to_drop)

In [None]:
# Target vector and feature matrix (everything except the target column).
y = df['Churn']
X = df.drop(columns='Churn')

# Hold out 20% of the rows for evaluation; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Scaling applies to numeric columns and one-hot encoding to the remaining
# (object) columns. Scaling the whole frame fails on string values, and one-hot
# encoding scaled continuous values would create one category per distinct number.
numeric_cols = X_train.select_dtypes(include="number").columns.tolist()
categorical_cols = X_train.select_dtypes(exclude="number").columns.tolist()

# Numeric branch: fill NaN (e.g. from the TotalCharges coercion) with the
# training median, then standardise. Both steps are fitted on the training split only.
imputer = SimpleImputer(strategy="median")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(imputer.fit_transform(X_train[numeric_cols]))
X_test_scaled = scaler.transform(imputer.transform(X_test[numeric_cols]))

# Categorical branch: cast to str first because the encoder rejects columns
# that mix numbers and strings; unseen test categories encode as all zeros.
encoder = OneHotEncoder(handle_unknown='ignore')
X_train_cat = encoder.fit_transform(X_train[categorical_cols].astype(str)).toarray()
X_test_cat = encoder.transform(X_test[categorical_cols].astype(str)).toarray()

# Final design matrices: scaled numeric features followed by the one-hot columns.
X_train_encoded = np.hstack([X_train_scaled, X_train_cat])
X_test_encoded = np.hstack([X_test_scaled, X_test_cat])

In [None]:
# Seed for the stochastic estimators so repeated runs give the same scores.
RANDOM_STATE = 42

models = [
    ('lr', LogisticRegression()),
    # probability=True is required for SVC.predict_proba, which the ROC AUC below needs.
    ('svm', SVC(probability=True, random_state=RANDOM_STATE)),
    ('dt', DecisionTreeClassifier(random_state=RANDOM_STATE)),
    ('rf', RandomForestClassifier(random_state=RANDOM_STATE)),
]

# XGBClassifier is not imported in the visible cells; include it only when the
# name is available instead of failing the whole cell with a NameError.
try:
    models.append(('xgb', XGBClassifier()))
except NameError:
    print("XGBClassifier is not imported; skipping the 'xgb' model.")

# Fit every model on the training matrix and score it on the held-out split.
model_results = {}
for name, model in models:
    model.fit(X_train_encoded, y_train)
    y_pred = model.predict(X_test_encoded)
    # Probability of the positive class (column 1) drives the ROC AUC.
    y_score = model.predict_proba(X_test_encoded)[:, 1]
    model_results[name] = {
        'accuracy': accuracy_score(y_test, y_pred),
        'roc_auc': roc_auc_score(y_test, y_score),
    }

# One row per model, one column per metric.
pd.DataFrame(model_results).T