# Churn Prediction


Install packages


In [1]:
!uv pip install -q \
    python-dotenv==1.2.1 \
    pandas==2.3.2 \
    pandas-stubs==2.3.2.250827 \
    numpy==2.3.2 \
    matplotlib==3.10.6 \
    seaborn==0.13.2 \
    scikit-learn==1.7.1 \
    requests==2.32.5

Append the directory that contains the `notebooks` package to `sys.path`


In [None]:
import sys

# Path (relative to the kernel's working directory) from which the
# `notebooks` package imported later in this notebook must be resolvable.
notebooks_parent = "../../../.."

# Guard against duplicate entries so that re-running this cell is idempotent.
if notebooks_parent not in sys.path:
    sys.path.append(notebooks_parent)

Import packages


In [None]:
import os
import pathlib
import pickle

import pandas as pd
import requests
from dotenv import load_dotenv
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

from notebooks.python.utils.data_extraction.data_extraction import (
    KaggleDataExtractor,
    KaggleExtractionConfig,
)

# Show every column when a DataFrame is displayed (no truncation).
pd.options.display.max_columns = None

# Load environment variables from the .env file (expected in the repository
# root). Left as the last expression so the cell displays whether a file
# was loaded.
load_dotenv()

True

## Utility scripts:

**KaggleDataExtractor**:

```py
--8<-- "docs/notebooks/python/utils/data_extraction/data_extraction.py"
```


Create data directory


In [None]:
# All paths are relative to the notebook's working directory.
BASE_PATH = pathlib.Path("../../machine-learning")
DATA_DIR = BASE_PATH / "data/predicting-customer-churn"  # raw dataset location
OUTPUT_DIR = BASE_PATH / "artifacts/predicting-customer-churn"  # model artifacts

# parents=True creates missing intermediate directories (e.g. "data",
# "artifacts") so the cell also works on a fresh checkout; exist_ok=True
# keeps the cell safe to re-run.
DATA_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

Download dataset from Kaggle


In [None]:
# Name of the file inside the Kaggle dataset.
file_name = "WA_Fn-UseC_-Telco-Customer-Churn.csv"

# Kaggle credentials are read from the environment; os.getenv returns None
# when a variable is not set.
username = os.getenv("KAGGLE_USERNAME")
api_token = os.getenv("KAGGLE_API_TOKEN")

extractor = KaggleDataExtractor(username=username, api_token=api_token)

config = KaggleExtractionConfig(
    dataset_slug="blastchar/telco-customer-churn",
    file_name=file_name,
    destination_path=DATA_DIR,
    output_file_name="churn.csv",
)

# Skip the download when the CSV is already present locally.
churn_csv = DATA_DIR / "churn.csv"
if not churn_csv.is_file():
    extractor.download_dataset(config)

## Preprocess Data


In [None]:
df = pd.read_csv(DATA_DIR / "churn.csv")

# Normalise column names: lower case, spaces replaced by underscores.
df = df.rename(columns=lambda name: name.lower().replace(" ", "_"))

# Apply the same normalisation to the values of every string (object) column.
categorical_columns = df.select_dtypes(include="object").columns.tolist()

for name in categorical_columns:
    lowered = df[name].str.lower()
    df[name] = lowered.str.replace(" ", "_")

# Values that cannot be parsed as numbers become NaN, then 0.
df["totalcharges"] = pd.to_numeric(df["totalcharges"], errors="coerce").fillna(0)

# Encode the target as 1 (churned) / 0 (did not churn).
df["churn"] = (df["churn"] == "yes").astype(int)

Set target


In [None]:
# Binary churn label for every row of the dataset (no hold-out split here).
y_train = df["churn"]

Set features


In [None]:
# Numeric feature columns.
numerical = ["tenure", "monthlycharges", "totalcharges"]

# Categorical feature columns, grouped by theme.
categorical = [
    # Customer profile
    "gender", "seniorcitizen", "partner", "dependents",
    # Phone and internet service
    "phoneservice", "multiplelines", "internetservice",
    # Internet add-ons
    "onlinesecurity", "onlinebackup", "deviceprotection", "techsupport",
    "streamingtv", "streamingmovies",
    # Contract and billing
    "contract", "paperlessbilling", "paymentmethod",
]

One-hot encode the features and train the logistic regression model


In [None]:
# Vectorizer that turns per-row feature dicts into a feature matrix.
dv = DictVectorizer()

# One dict per row, restricted to the selected feature columns.
feature_columns = categorical + numerical
train_dict = df[feature_columns].to_dict(orient="records")
X_train = dv.fit_transform(train_dict)

# fit() returns the estimator itself, so the fitted model can be bound directly.
model = LogisticRegression(solver="liblinear").fit(X_train, y_train)

print(model)

LogisticRegression(solver='liblinear')


Mock datapoint for testing


In [None]:
# A single hand-written customer record using the same keys as the training
# feature columns.
datapoint = {
    # Customer profile
    "gender": "female", "seniorcitizen": 0,
    "partner": "yes", "dependents": "no",
    # Phone and internet service
    "phoneservice": "no", "multiplelines": "no_phone_service",
    "internetservice": "dsl",
    # Internet add-ons
    "onlinesecurity": "no", "onlinebackup": "yes",
    "deviceprotection": "no", "techsupport": "no",
    "streamingtv": "no", "streamingmovies": "no",
    # Contract and billing
    "contract": "month-to-month", "paperlessbilling": "yes",
    "paymentmethod": "electronic_check",
    # Numeric features
    "tenure": 1, "monthlycharges": 29.85, "totalcharges": 29.85,
}

Transform mocked datapoint


In [None]:
# Encode the single record with the already-fitted vectorizer; wrapping it in
# a list passes it as a one-sample batch.
X = dv.transform([datapoint])

Predict value


In [None]:
# Row 0 is the single datapoint; column 1 is the probability of class 1 (churn).
model.predict_proba(X)[0][1]

np.float64(0.6638167617162171)

Serialize and save model


In [None]:
# Persist the vectorizer and the model together so that both can be restored
# as a pair.
model_path = OUTPUT_DIR / "model.bin"
with model_path.open("wb") as f_out:
    pickle.dump((dv, model), f_out)

Load model


In [None]:
# NOTE: pickle.load can execute arbitrary code; only load artifacts that come
# from a trusted source (here, the file written by this notebook).
with (OUTPUT_DIR / "model.bin").open("rb") as f_in:
    dv, model = pickle.load(f_in)

Create a prediction pipeline


In [None]:
# Chain vectorization and classification into a single estimator so raw
# feature dicts can be passed straight to fit / predict_proba.
vectorizer_step = DictVectorizer()
classifier_step = LogisticRegression(solver="liblinear")
pipeline = make_pipeline(vectorizer_step, classifier_step)

Fit the pipeline and predict the churn probability


In [None]:
# Fit the vectorizer and classifier in one step on the raw feature dicts.
pipeline.fit(train_dict, y_train)

# Probability of class 1 (churn) for the single mock datapoint.
pipeline.predict_proba([datapoint])[0][1]

np.float64(0.6638167617162171)