In [24]:
import pandas as pd

df = pd.read_csv("../data/raw/Telco-Customer-Churn.csv")
df.head()
df.info()
df['Churn'].value_counts()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


Churn
No     5174
Yes    1869
Name: count, dtype: int64

In [25]:
df.isnull().sum()


customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [26]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df = df.dropna(subset=['TotalCharges'])

In [5]:
#df.drop(columns=['customerID'], inplace=True)

In [27]:
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
categorical_cols


['customerID',
 'gender',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'Churn']

In [28]:
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})


In [29]:
df['avg_monthly_spend'] = df['TotalCharges'] / df['tenure'].replace(0, 1)
median_charge = df['MonthlyCharges'].median()
df['is_high_value'] = (df['MonthlyCharges'] > median_charge).astype(int)
df['long_tenure_month_to_month'] = (
    (df['tenure'] > 12) & (df['Contract'] == 'Month-to-month')
).astype(int)


In [30]:
service_cols = [
    'PhoneService','MultipleLines','OnlineSecurity','OnlineBackup',
    'DeviceProtection','TechSupport','StreamingTV','StreamingMovies'
]

for col in service_cols:
    df[col] = df[col].replace({
        'No phone service': 'No',
        'No internet service': 'No'
    })


In [31]:
df['num_services'] = (df[service_cols] == 'Yes').sum(axis=1)

#Paperless + electronic payment combo (higher churn):
df['paperless_and_electronic'] = (
    (df['PaperlessBilling'] == 'Yes') &
    (df['PaymentMethod'].str.contains('electronic', case=False))
).astype(int)



In [32]:
#train test and split:
from sklearn.model_selection import train_test_split

X = df.drop(columns=['Churn'])
y = df['Churn']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
#pipeline Categorical + Numerical
numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()



In [33]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)


In [34]:
#here training baseline models
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

log_reg_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

log_reg_model.fit(X_train, y_train)


0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [35]:
#evaluation of the base model
from sklearn.metrics import classification_report, roc_auc_score

y_pred = log_reg_model.predict(X_test)
y_prob = log_reg_model.predict_proba(X_test)[:,1]

print(classification_report(y_test, y_pred))
print("AUC:", roc_auc_score(y_test, y_prob))


              precision    recall  f1-score   support

           0       0.84      0.89      0.86      1033
           1       0.63      0.52      0.57       374

    accuracy                           0.79      1407
   macro avg       0.73      0.71      0.72      1407
weighted avg       0.78      0.79      0.78      1407

AUC: 0.8345701476929767


In [36]:
#randomforest
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

rf_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        random_state=42
    ))
])

rf_model.fit(X_train, y_train)

rf_pred = rf_model.predict(X_test)
rf_prob = rf_model.predict_proba(X_test)[:,1]

print(classification_report(y_test, rf_pred))
print("AUC (Random Forest):", roc_auc_score(y_test, rf_prob))


              precision    recall  f1-score   support

           0       0.83      0.90      0.86      1033
           1       0.64      0.48      0.55       374

    accuracy                           0.79      1407
   macro avg       0.73      0.69      0.71      1407
weighted avg       0.78      0.79      0.78      1407

AUC (Random Forest): 0.8229353784988431


In [37]:
#using xgboost model
from xgboost import XGBClassifier

xgb_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=4,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="logloss"
    ))
])

xgb_model.fit(X_train, y_train)

xgb_pred = xgb_model.predict(X_test)
xgb_prob = xgb_model.predict_proba(X_test)[:,1]

print(classification_report(y_test, xgb_pred))
print("AUC (XGBoost):", roc_auc_score(y_test, xgb_prob))


              precision    recall  f1-score   support

           0       0.84      0.88      0.86      1033
           1       0.61      0.52      0.56       374

    accuracy                           0.79      1407
   macro avg       0.72      0.70      0.71      1407
weighted avg       0.78      0.79      0.78      1407

AUC (XGBoost): 0.8310874820754668


In [38]:
#Comparison of the base, randomforest and the xgboost model accuarcy:
print("AUC Scores Comparison:")
print("Logistic Regression:", roc_auc_score(y_test, y_prob))
print("Random Forest     :", roc_auc_score(y_test, rf_prob))
print("XGBoost           :", roc_auc_score(y_test, xgb_prob))


AUC Scores Comparison:
Logistic Regression: 0.8345701476929767
Random Forest     : 0.8229353784988431
XGBoost           : 0.8310874820754668


In [39]:
df_for_pred = X.copy()
final_churn_prob = xgb_model.predict_proba(df_for_pred)[:,1]

df['churn_probability'] = final_churn_prob
df[['churn_probability']].head()


Unnamed: 0,churn_probability
0,0.727026
1,0.030406
2,0.409991
3,0.029474
4,0.610692


In [40]:
df.sort_values('churn_probability', ascending=False).head(10)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,PaymentMethod,MonthlyCharges,TotalCharges,Churn,avg_monthly_spend,is_high_value,long_tenure_month_to_month,num_services,paperless_and_electronic,churn_probability
4517,2012-NWRPA,Female,1,Yes,No,11,Yes,Yes,Fiber optic,No,...,Electronic check,99.55,1131.2,1,102.836364,1,0,5,1,0.962659
2631,6861-XWTWQ,Male,1,Yes,No,7,Yes,Yes,Fiber optic,No,...,Electronic check,99.25,665.45,1,95.064286,1,0,5,1,0.956781
2577,4910-GMJOT,Female,0,No,No,1,Yes,Yes,Fiber optic,No,...,Electronic check,94.6,94.6,1,94.6,1,0,4,1,0.946119
5,9305-CDSKC,Female,0,No,No,8,Yes,Yes,Fiber optic,No,...,Electronic check,99.65,820.5,1,102.5625,1,0,5,1,0.94491
3349,0897-FEGMU,Female,0,Yes,No,11,Yes,Yes,Fiber optic,No,...,Electronic check,99.5,1056.95,1,96.086364,1,0,5,1,0.944446
2208,7216-EWTRS,Female,1,Yes,No,1,Yes,Yes,Fiber optic,No,...,Electronic check,100.8,100.8,1,100.8,1,0,5,1,0.94142
3380,5178-LMXOP,Male,1,Yes,No,1,Yes,Yes,Fiber optic,No,...,Electronic check,95.1,95.1,1,95.1,1,0,4,1,0.940677
3209,8149-RSOUN,Female,0,No,No,1,Yes,Yes,Fiber optic,No,...,Electronic check,93.85,93.85,1,93.85,1,0,4,1,0.940089
6866,0295-PPHDO,Male,0,No,No,1,Yes,Yes,Fiber optic,No,...,Electronic check,95.45,95.45,1,95.45,1,0,4,1,0.937393
4800,9300-AGZNL,Male,1,No,No,1,Yes,Yes,Fiber optic,No,...,Electronic check,94.0,94.0,1,94.0,1,0,4,1,0.935743


In [41]:
#Recommendation logic
def recommend_action(row):
    p = row['churn_probability']
    num_services = row['num_services']
    high_value = row['is_high_value']
    contract = row['Contract']
    long_tenure_mtm = row['long_tenure_month_to_month']
    
    # Low risk
    if p < 0.30:
        return "No action needed — send engagement email."
    
    # High risk customers
    if p >= 0.70:
        if high_value:
            return "Offer loyalty discount + personal support call."
        if long_tenure_mtm:
            return "Offer upgrade from month-to-month to annual plan."
        if num_services <= 2:
            return "Cross-sell a bundle (internet + TV/streaming)."
        return "Call customer and provide retention offer."
    
    # Medium risk
    if 0.30 <= p < 0.70:
        if contract == "Month-to-month":
            return "Send discount for 12-month contract."
        if num_services >= 4:
            return "Promote premium support & benefits."
        return "Send satisfaction survey + small voucher."


In [42]:
df['recommended_action'] = df.apply(recommend_action, axis=1)

df[['churn_probability', 'recommended_action']].head(10)

Unnamed: 0,churn_probability,recommended_action
0,0.727026,Cross-sell a bundle (internet + TV/streaming).
1,0.030406,No action needed — send engagement email.
2,0.409991,Send discount for 12-month contract.
3,0.029474,No action needed — send engagement email.
4,0.610692,Send discount for 12-month contract.
5,0.94491,Offer loyalty discount + personal support call.
6,0.349959,Send discount for 12-month contract.
7,0.222585,No action needed — send engagement email.
8,0.768946,Offer loyalty discount + personal support call.
9,0.014099,No action needed — send engagement email.


In [43]:
def risk_bucket(p):
    if p < 0.30:
        return "Low"
    elif p < 0.70:
        return "Medium"
    else:
        return "High"

df['risk_bucket'] = df['churn_probability'].apply(risk_bucket)




In [45]:
df['RevenueAtRisk'] = df['MonthlyCharges'] * df['churn_probability']

df.to_csv("churn_predictions_with_actions.csv", index=False)