<h1><b>TELECOM CHURN PREDICTION</b></h1>

<H2>IMPORTING USEFUL LIBRARIES</H2>

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectFromModel
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.decomposition import PCA

<H2>LOADING DATASET</H2>

In [2]:
df = pd.read_csv("Telco-Customer-Churn.csv")
pd.set_option("display.max_columns", None)
df.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


<H2>DATA PREPROCESSING</H2>

In [3]:
# Checking for missing value
df.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

<p>SO, NO missing values found.</p>

In [4]:
# maximum value of tenure
df['tenure'].max()

72

<p>PROCESSING TENURE COLUMN</p>

In [5]:
def cohort(tenure):
    if tenure <= 12:
        return 0
    elif tenure <= 24:
        return 1
    elif tenure <= 36:
        return 2
    elif tenure <= 48:
        return 3
    elif tenure <= 60:
        return 4
    elif tenure <= 72:
        return 5
    else:
        return 6
    
#  It apply a function to every data element of a column
df['tenure'] = df['tenure'].apply(cohort)
df.head(4)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,0,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,2,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,0,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,3,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No


SETTING FEATURE AND TARGET COLUMNS

In [6]:
X = df.drop(['Churn', 'customerID'],axis=1)

# Converting target categorical column into numerical  
df['Churn'] = df['Churn'].map( {'Yes':1 ,'No':0})

y = df['Churn']
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,0,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,5575-GNVDE,Male,0,No,No,2,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0
2,3668-QPYBK,Male,0,No,No,0,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
3,7795-CFOCW,Male,0,No,No,3,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0
4,9237-HQITU,Female,0,No,No,0,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1


<H2>TRANSFORMATION</H2>

In [7]:
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

<H2>STACKING BASE CLASSIFIER MODELS</H2>

In [8]:
# Define stacking classifier base learners
knn = KNeighborsClassifier()
svc = SVC(probability=True)
rfc = RandomForestClassifier()
gbc = GradientBoostingClassifier()
abc = AdaBoostClassifier()
lr = LogisticRegression()

# Define stacking classifier
stacking_estimators = [('knn', knn), ('svc', svc), ('rfc', rfc), ('gbc', gbc), ('abc', abc)]
stacking_classifier = StackingClassifier(estimators=stacking_estimators, final_estimator=lr)

<h2>PIPELINE UPDATE</h2>

In [9]:
# Update pipeline
stacking_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('feature_selection', SelectFromModel(RandomForestClassifier())),
    ('classifier', stacking_classifier)
])

<h2>HYPERPARAMETER TUNING</h2>

In [10]:
# Hyperparameter tuning
param_grid = {
    "classifier__knn__n_neighbors": [5, 11],
    "classifier__svc__C": [0.1, 0.4, 1],
    "classifier__svc__kernel": ['poly', 'rbf'],
    "classifier__svc__degree": [2, 3],
    "classifier__rfc__n_estimators": [64, 128],
    "classifier__rfc__max_depth": [None, 8, 12],    
    "classifier__rfc__min_samples_split": [15, 20, 30],
    "classifier__gbc__learning_rate": [0.2, 0.4, 0.6],
    "classifier__gbc__max_features": [8, 12],
    "classifier__gbc__n_estimators": [128, 256],
    "classifier__gbc__min_samples_split": [15, 25, 30],
    "classifier__gbc__max_depth": [None, 7, 10],
    "feature_selection__threshold": ['mean', 'median', '1.25*median'],
    "classifier__final_estimator__C": [0.1, 0.5, 1.0],
    "classifier__final_estimator__solver": ['liblinear', 'saga'],
    "classifier__final_estimator__penalty": ['l1', 'l2']
}

<h2>RANDOMISED SEARCH OBJECT</h2>

In [11]:
# Create RandomizedSearchCV object
search = RandomizedSearchCV(stacking_pipeline, param_grid, n_jobs=-1, n_iter=2, scoring='accuracy', cv=5)

<h2>SPLITTING DATA INTO TRAIN AND TEST SETS</h2>

In [12]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

<h3>PERFORMING HYPERPARAMETER TUNING</h3>

In [13]:
# Perform hyperparameter tuning
search.fit(X_train, y_train)



<h3>BEST PARAMETER</h3>

In [14]:
# Print best parameters and scores
print("Best Parameters:")
print(search.best_params_)
print("Best Scores:")
print(search.best_score_)

Best Parameters:
{'feature_selection__threshold': 'mean', 'classifier__svc__kernel': 'poly', 'classifier__svc__degree': 3, 'classifier__svc__C': 1, 'classifier__rfc__n_estimators': 128, 'classifier__rfc__min_samples_split': 30, 'classifier__rfc__max_depth': None, 'classifier__knn__n_neighbors': 11, 'classifier__gbc__n_estimators': 256, 'classifier__gbc__min_samples_split': 25, 'classifier__gbc__max_features': 12, 'classifier__gbc__max_depth': 10, 'classifier__gbc__learning_rate': 0.4, 'classifier__final_estimator__solver': 'saga', 'classifier__final_estimator__penalty': 'l1', 'classifier__final_estimator__C': 0.5}
Best Scores:
0.7779555555555555


<h3>EVALUATION ON TEST SET</h3>

In [15]:
# Evaluate on test set
y_pred = search.predict(X_test)
print("Test Set Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))

Test Set Performance:
Accuracy: 0.7853589196872779
Precision: 0.5664160401002506
Recall: 0.6366197183098592
