# Logistic Regression Tutorial
We train and evaluate a logistic regression classifier on the Kaggle Telecom Customer Churn dataset, covering data acquisition, preprocessing, model fitting, and evaluation.

## 1. Load Libraries
We gather tools for data handling, Kaggle downloads, preprocessing pipelines, and classification metrics.

In [1]:
# Import essential libraries
import numpy as np
import pandas as pd
from pathlib import Path
import kagglehub
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

## 2. Load Dataset
We download the Telecom Customer Churn dataset from Kaggle, focus on the churn label plus ten service-related attributes, clean the records, and split the data for training and testing.

In [None]:
# Fetch the Kaggle dataset and pick the CSV that contains the churn table
path = kagglehub.dataset_download("barun2104/telecom-churn")
csv_candidates = sorted(Path(path).glob("**/*.csv"))
if len(csv_candidates) == 0:
    raise FileNotFoundError("No CSV files were found in the downloaded Kaggle dataset.")

# Columns the rest of the notebook reads: the label plus ten features.
expected_columns = {
    "Churn",
    "AccountWeeks",
    "ContractRenewal",
    "DataPlan",
    "DataUsage",
    "CustServCalls",
    "DayMins",
    "DayCalls",
    "MonthlyCharge",
    "OverageFee",
    "RoamMins",
}


def _stripped_header(csv_file):
    """Return the whitespace-stripped column names found in the first rows of csv_file."""
    head = pd.read_csv(csv_file, nrows=5)
    return set(head.columns.str.strip())


# The generator is lazy, so files after the first match are never opened.
csv_path = next(
    (found for found in csv_candidates if expected_columns <= _stripped_header(found)),
    None,
)

if csv_path is None:
    # Report the raw header of every candidate to help diagnose the mismatch.
    available = set()
    for found in csv_candidates:
        available.add(tuple(pd.read_csv(found, nrows=1).columns))
    raise ValueError(f"Unable to locate a CSV with required columns. Columns seen: {available}")

print("Using dataset:", csv_path.name)
print("Path to dataset files:", path)

# Load the selected CSV; strip header whitespace so columns can be looked up by name.
churn = pd.read_csv(csv_path)
churn.columns = churn.columns.str.strip()

missing_columns = expected_columns - set(churn.columns)
if missing_columns:
    raise ValueError(f"Dataset is missing expected columns: {missing_columns}")

target_col = "Churn"
numeric_features = [
    "AccountWeeks",
    "ContractRenewal",
    "DataPlan",
    "DataUsage",
    "CustServCalls",
    "DayMins",
    "DayCalls",
    "MonthlyCharge",
    "OverageFee",
    "RoamMins",
]

# Select by explicit list: iterating the expected_columns set would give an
# arbitrary column order.
churn = churn[[target_col, *numeric_features]].copy()

# Unparseable feature values become NaN and are dropped below.
churn[numeric_features] = churn[numeric_features].apply(pd.to_numeric, errors="coerce")

# Map textual labels to 0/1. The check is "not numeric" rather than
# "dtype == object" so that string columns stored in pandas' dedicated string
# dtype are mapped too instead of being coerced straight to NaN.
if not pd.api.types.is_numeric_dtype(churn[target_col]):
    labels = churn[target_col].astype(str).str.strip().str.lower()
    churn[target_col] = labels.map({
        "yes": 1,
        "no": 0,
        "true": 1,
        "false": 0,
        "1": 1,
        "0": 0,
    })

# No-op for columns that are already numeric; unmapped labels are NaN by now.
churn[target_col] = pd.to_numeric(churn[target_col], errors="coerce")

churn = churn.dropna(subset=[target_col, *numeric_features])
if churn.empty:
    # Fail here with a clear message instead of inside train_test_split.
    raise ValueError("No usable rows remain after cleaning the churn dataset.")
churn[target_col] = churn[target_col].astype(int)

# Per-feature medians of the cleaned data, used as prompt defaults later on.
numeric_defaults = {col: float(churn[col].median()) for col in numeric_features}

X = churn[numeric_features].copy()
y = churn[target_col].copy()

# 75/25 split, stratified on the label, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)
X_train.shape, X_test.shape

ValueError: Unable to locate a CSV with required columns. Columns seen: {('Churn', 'AccountWeeks', 'ContractRenewal', 'DataPlan', 'DataUsage', 'CustServCalls', 'DayMins', 'DayCalls', 'MonthlyCharge', 'OverageFee', 'RoamMins')}

## 3. Build and Train Model
We assemble a pipeline that standardizes the numeric features and then fits a logistic regression classifier. All ten selected features are numeric, so no categorical encoding is needed.

In [None]:
# Chain feature standardization with logistic regression, then fit on the training split
scaling_step = ("scaler", StandardScaler())
model_step = ("classifier", LogisticRegression(max_iter=1000))
clf = Pipeline(steps=[scaling_step, model_step])
clf.fit(X_train, y_train)
clf

## 4. Evaluate Performance
We predict labels and summarize classification accuracy and per-class metrics.

In [None]:
# Evaluate predictions with accuracy and a per-class report
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=["No churn", "Churn"])
# Print rather than echo the (acc, report) tuple: the report is a multi-line
# string, and a tuple echo shows its repr with literal "\n" escapes.
print(f"Accuracy: {acc:.4f}")
print(report)

## 5. Try It Yourself
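Provide customer attributes to see whether the model predicts churn and inspect the churn probability. Press Enter at any prompt to accept the default shown in brackets, which is the median of that feature in the cleaned dataset.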

In [None]:
# Collect user-provided feature values and predict churn
def ask_numeric(name: str, default: float) -> float:
    """Prompt for a numeric feature value, falling back to *default*.

    Args:
        name: Label shown in the prompt.
        default: Value shown in brackets and returned when the reply is
            blank, cannot be parsed as a number, or is not finite.

    Returns:
        The parsed finite float, or ``default``.
    """
    import math  # local import keeps this cell runnable on its own

    user_input = input(f"{name} [{default}]: ").strip()
    if not user_input:
        return default
    try:
        value = float(user_input)
    except ValueError:
        print("Invalid number entered. Using default value.")
        return default
    # float() also parses "nan", "inf" and "-inf"; none of them is a usable
    # feature value, so treat them like any other invalid entry.
    if not math.isfinite(value):
        print("Invalid number entered. Using default value.")
        return default
    return value

# Ask for each feature in the order used for training; the per-feature default
# comes from numeric_defaults.
user_inputs = {}
for feature in numeric_features:
    user_inputs[feature] = ask_numeric(feature, numeric_defaults[feature])

# A single-row frame whose columns are the feature names.
user_df = pd.DataFrame([user_inputs])
churn_probability = clf.predict_proba(user_df)[0, 1]
pred_label = clf.predict(user_df)[0]
if pred_label == 1:
    label_text = "Churn"
else:
    label_text = "No churn"
print(f"Predicted outcome: {label_text} (churn probability: {churn_probability:.2%})")