In [1]:
import pandas as pd
import numpy as np

# Location of the raw Telco churn export, relative to this notebook.
PATH = "../data/raw/WA_Fn-UseC_-Telco-Customer-Churn.csv"

# Read the CSV and derive both cleaned columns in a single chained step:
#   * Churn: "Yes"/"No" labels become 1/0 (any other label maps to NaN).
#   * TotalCharges: parsed as numeric; entries that fail to parse become NaN.
df = pd.read_csv(PATH).assign(
    Churn=lambda frame: frame["Churn"].map({"Yes": 1, "No": 0}),
    TotalCharges=lambda frame: pd.to_numeric(frame["TotalCharges"], errors="coerce"),
)


In [2]:
def make_note(row):
    """Build a synthetic support-note string from one customer record.

    Each rule below inspects one field of ``row`` and, when it fires, appends
    a canned sentence. The sentences are joined with single spaces in rule
    order: tenure, contract, monthly charges, payment method, internet
    service, tech support.

    Parameters
    ----------
    row : mapping-like
        Any object exposing ``.get(key, default)`` (for example a ``dict`` or
        a ``pandas.Series``). Fields read: ``tenure``, ``Contract``,
        ``MonthlyCharges``, ``PaymentMethod``, ``InternetService``,
        ``TechSupport``. Every field is optional; a missing, ``None``, NaN or
        non-numeric value simply means the corresponding rule does not fire.

    Returns
    -------
    str
        The space-joined sentences, or ``"No major issues reported recently."``
        when no rule fired.
    """

    def _number(key):
        """Return ``row[key]`` as a float, or None if absent, NaN or non-numeric."""
        try:
            value = float(row.get(key))
        except (TypeError, ValueError):
            return None
        # NaN is the only float that is not equal to itself.
        return None if value != value else value

    notes = []

    # Tenure is read through the same tolerant accessor as every other field,
    # so a record lacking it no longer raises KeyError.
    tenure = _number("tenure")
    if tenure is not None:
        if tenure <= 6:
            notes.append("New customer still evaluating the service.")
        elif tenure >= 48:
            notes.append("Long-term customer with stable history.")

    contract = row.get("Contract", "")
    if contract == "Month-to-month":
        notes.append("Month-to-month plan; higher sensitivity to dissatisfaction.")
    elif contract == "Two year":
        notes.append("Two-year contract; lower churn tendency.")

    # A None / NA monthly charge previously raised TypeError on comparison.
    mc = _number("MonthlyCharges")
    if mc is not None:
        if mc >= 90:
            notes.append("Customer mentions high monthly cost concerns.")
        elif mc <= 30:
            notes.append("Low monthly cost; fewer billing complaints.")

    # Case-insensitive substring match, so "Electronic check" also matches.
    pm = str(row.get("PaymentMethod", "")).lower()
    if "electronic check" in pm:
        notes.append("Payment via electronic check; occasional billing friction noted.")

    if row.get("InternetService", "") == "Fiber optic":
        notes.append("Fiber service; intermittent performance complaints reported.")

    if row.get("TechSupport", "") == "No":
        notes.append("No tech support; support resolution concerns mentioned.")

    # Fallback so that every record gets a non-empty note.
    if not notes:
        notes.append("No major issues reported recently.")

    return " ".join(notes)

# Generate one note per customer by running make_note across each row.
df["support_notes"] = df.apply(make_note, axis="columns")

# Preview the fields the note rules read, next to the generated text.
df.loc[
    :,
    [
        "tenure",
        "Contract",
        "MonthlyCharges",
        "PaymentMethod",
        "InternetService",
        "TechSupport",
        "support_notes",
    ],
].head(10)


Unnamed: 0,tenure,Contract,MonthlyCharges,PaymentMethod,InternetService,TechSupport,support_notes
0,1,Month-to-month,29.85,Electronic check,DSL,No,New customer still evaluating the service. Mon...
1,34,One year,56.95,Mailed check,DSL,No,No tech support; support resolution concerns m...
2,2,Month-to-month,53.85,Mailed check,DSL,No,New customer still evaluating the service. Mon...
3,45,One year,42.3,Bank transfer (automatic),DSL,Yes,No major issues reported recently.
4,2,Month-to-month,70.7,Electronic check,Fiber optic,No,New customer still evaluating the service. Mon...
5,8,Month-to-month,99.65,Electronic check,Fiber optic,No,Month-to-month plan; higher sensitivity to dis...
6,22,Month-to-month,89.1,Credit card (automatic),Fiber optic,No,Month-to-month plan; higher sensitivity to dis...
7,10,Month-to-month,29.75,Mailed check,DSL,No,Month-to-month plan; higher sensitivity to dis...
8,28,Month-to-month,104.8,Electronic check,Fiber optic,Yes,Month-to-month plan; higher sensitivity to dis...
9,62,One year,56.15,Bank transfer (automatic),DSL,No,Long-term customer with stable history. No tec...


In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# Target vector: the churn flag cast to integers.
y = df["Churn"].astype(int)

# Structured features: everything except the identifier, the target and the
# free-text notes column.
X_struct = df.drop(["customerID", "Churn", "support_notes"], axis=1)

# Split columns by dtype; anything that is not numeric is handled as categorical.
num_cols = list(X_struct.select_dtypes(include=["number"]).columns)
cat_cols = [col for col in X_struct.columns if col not in num_cols]

# Numeric columns: median imputation only.
# Categorical columns: most-frequent imputation, then dense one-hot encoding
# that ignores categories unseen at fit time.
preprocess_struct = ColumnTransformer(
    transformers=[
        (
            "num",
            Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))]),
            num_cols,
        ),
        (
            "cat",
            Pipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="most_frequent")),
                    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
                ]
            ),
            cat_cols,
        ),
    ]
)

# Fit on the full structured frame and keep the transformed matrix.
X_struct_processed = preprocess_struct.fit_transform(X_struct)
X_struct_processed.shape


(7043, 45)

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence encoder used to turn each support note into a vector.
text_model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode every note in one call, requesting normalized NumPy output and a
# progress bar while encoding runs.
text_embeddings = text_model.encode(
    sentences=df["support_notes"].tolist(),
    normalize_embeddings=True,
    convert_to_numpy=True,
    show_progress_bar=True,
)

text_embeddings.shape


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
import os, joblib

# Place the structured features and the note embeddings side by side.
X_combined = np.hstack((X_struct_processed, text_embeddings))

# Make sure both output directories exist before anything is written.
for out_dir in ("../data/processed", "../models"):
    os.makedirs(out_dir, exist_ok=True)

# Persist the combined feature matrix and the label vector.
for npy_path, array in (
    ("../data/processed/X.npy", X_combined),
    ("../data/processed/y.npy", y.to_numpy()),
):
    np.save(npy_path, array)

# Persist the fitted preprocessor and the text encoder.
# NOTE(review): "../models" is created above but nothing is written into it;
# both objects land under "../data/processed" — confirm this is intended.
for joblib_path, fitted in (
    ("../data/processed/preprocess_struct.joblib", preprocess_struct),
    ("../data/processed/text_model.joblib", text_model),
):
    joblib.dump(fitted, joblib_path)

print("Saved X/y and preprocessors.")
print("X shape:", X_combined.shape)
