In [1]:
#import statements
import pandas as pd

#### 1. Problem framing — target, metric, baseline you’re beating.

"Predict behavior to retain customers. You can analyze all relevant customer data and develop focused customer retention programs."

**I will build a correlation plot across all variables to identify which attributes are most strongly correlated with churn (yes/no). From there, I'll develop focused customer retention programs based on which demographics show high churn, which services churned customers subscribe to, and their account information.**

#### 2. Data hygiene — types, missingness, outliers, leakage check; document all fixes.

In [2]:
# Load the Telco customer-churn CSV into a DataFrame and inspect the inferred dtypes.
filepath = 'WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(filepath)
df.dtypes # most columns are object (categorical); SeniorCitizen, tenure and MonthlyCharges are numeric. TotalCharges is read as object and still needs converting to numeric.

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [3]:
# Preview the first five rows to sanity-check how the columns were parsed.
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [None]:
# need to turn TotalCharges into numeric
# errors='coerce' replaces every entry that cannot be parsed as a number with NaN,
# so the conversion can introduce missing values without raising an error.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Surface how many rows are now NaN so the missingness is documented rather than hidden.
# Counting after the assignment keeps this cell idempotent on re-run.
n_missing_total_charges = int(df['TotalCharges'].isna().sum())
print(f"TotalCharges rows that are NaN after conversion: {n_missing_total_charges}")

In [None]:
df.dtypes # confirm the conversion: TotalCharges is now float64

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object

In [38]:
# NOTE(review): full non-null counts here do not prove the data is complete.
# info() only counts NaN as missing, and TotalCharges is converted with
# errors='coerce', which yields NaN for unparseable entries. Confirm with
# df['TotalCharges'].isna().sum() before concluding nothing is missing.
df.info() # non-null count per column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [39]:
# Distinct values per column. customerID has as many distinct values as there are
# rows (7043), so it is unique per row and carries no grouping information.
df.nunique()

customerID          7043
gender                 2
SeniorCitizen          2
Partner                2
Dependents             2
tenure                73
PhoneService           2
MultipleLines          3
InternetService        3
OnlineSecurity         3
OnlineBackup           3
DeviceProtection       3
TechSupport            3
StreamingTV            3
StreamingMovies        3
Contract               3
PaperlessBilling       2
PaymentMethod          4
MonthlyCharges      1585
TotalCharges        6530
Churn                  2
dtype: int64

#### 3. EDA — 3–4 plots max; tie each to a hypothesis.

In [41]:
# Build the numeric / categorical column lists used by the EDA cells below.
# NOTE(review): SeniorCitizen is stored as int64 but has only two distinct values,
# so it ends up in num_vars even though it behaves like a yes/no flag.
num_vars = list(df.select_dtypes(include='number').columns)

# Everything non-numeric is treated as categorical; customerID is unique per row,
# so it is removed from the list.
cat_vars = list(df.select_dtypes(exclude='number').columns)
cat_vars.remove('customerID')

In [42]:
# Show the share of rows in each level of every categorical column (rounded to 3 dp).
for column_name in cat_vars:
    level_shares = df[column_name].value_counts(normalize=True).round(3)
    print(f"\n{column_name}")
    display(level_shares)


gender


Male      0.505
Female    0.495
Name: gender, dtype: float64


Partner


No     0.517
Yes    0.483
Name: Partner, dtype: float64


Dependents


No     0.7
Yes    0.3
Name: Dependents, dtype: float64


PhoneService


Yes    0.903
No     0.097
Name: PhoneService, dtype: float64


MultipleLines


No                  0.481
Yes                 0.422
No phone service    0.097
Name: MultipleLines, dtype: float64


InternetService


Fiber optic    0.440
DSL            0.344
No             0.217
Name: InternetService, dtype: float64


OnlineSecurity


No                     0.497
Yes                    0.287
No internet service    0.217
Name: OnlineSecurity, dtype: float64


OnlineBackup


No                     0.438
Yes                    0.345
No internet service    0.217
Name: OnlineBackup, dtype: float64


DeviceProtection


No                     0.439
Yes                    0.344
No internet service    0.217
Name: DeviceProtection, dtype: float64


TechSupport


No                     0.493
Yes                    0.290
No internet service    0.217
Name: TechSupport, dtype: float64


StreamingTV


No                     0.399
Yes                    0.384
No internet service    0.217
Name: StreamingTV, dtype: float64


StreamingMovies


No                     0.395
Yes                    0.388
No internet service    0.217
Name: StreamingMovies, dtype: float64


Contract


Month-to-month    0.550
Two year          0.241
One year          0.209
Name: Contract, dtype: float64


PaperlessBilling


Yes    0.592
No     0.408
Name: PaperlessBilling, dtype: float64


PaymentMethod


Electronic check             0.336
Mailed check                 0.229
Bank transfer (automatic)    0.219
Credit card (automatic)      0.216
Name: PaymentMethod, dtype: float64


Churn


No     0.735
Yes    0.265
Name: Churn, dtype: float64

In [None]:
# numerical vars plots

[insert verdict on whether outliers or not]
[insert insights on what we notice about category distribution]

In [None]:
# plot heat map for all variables
# TODO: render corr_matrix as a heat map once a plotting library is imported.
# Until then the correlations are computed and shown as a table, so this cell
# runs on a fresh kernel instead of failing on a placeholder name.

# One-hot encode the categorical columns so every feature is numeric.
# customerID is unique per row and is dropped. drop_first=True keeps a single
# indicator for two-level columns, so Churn becomes the column 'Churn_Yes'.
# Assumes TotalCharges has already been converted to numeric by the earlier cell.
encoded = pd.get_dummies(df.drop(columns='customerID'), drop_first=True, dtype=float)
corr_matrix = encoded.corr()

# Correlation of each encoded feature with churn, strongest positive first.
corr_matrix['Churn_Yes'].drop('Churn_Yes').sort_values(ascending=False)

[insights on what are strongly correlated with churn --> think of some strategies]

# good stopping point for tonight --> look into code for running models etc.

#### 4. Baseline — simple logistic/linear or heuristic.

#### 5. Models — add RF/GBM (SVM if small); justify choices.

#### 6. Validation — proper split/CV; report ROC-AUC (class) or MAE/RMSE (reg).

#### 7. Explainability — feature importance + 2–3 “so-what” insights.

#### 8. Business recs — 3 insights, 2 actions, risks, next data.

#### 9. Reproducibility — README (how to run), requirements, clear section headers.

#### 10. Communication — concise slides: Problem/Data → Baseline vs Model → Drivers → Recs.

In [None]:
# TODO: finish or delete this scratch cell. It previously held the incomplete
# expression `df.`, which is a SyntaxError and stops the notebook from running end to end.