In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
import xgboost as xgb
import joblib

# -----------------------------
# 1. Load dataset
# -----------------------------
# The whole CSV stays in `df`; modeling works on the reduced copy `df_model`.
df = pd.read_csv("/content/telecom_customers_synthetic_balanced_download.csv")

# -----------------------------
# 2. Drop irrelevant/leaky columns
# -----------------------------
drop_cols = [
    "name", "payment_link", "support_number",
    "due_date", "suspension_date", "churned"  # leakage / irrelevant
]
# customer_id survives this step (it is removed from X below), so `df` can
# still be used for lookups by ID.
df_model = df.drop(columns=drop_cols)

# -----------------------------
# 3. Separate X / y and encode y
# -----------------------------
y_text = df_model["label"].astype(str)
X = df_model.drop(columns=["label", "customer_id"])

# Label encode textual classes -> integers; le.classes_ holds the reverse mapping.
le = LabelEncoder()
y = le.fit_transform(y_text)

print("Label mapping:", dict(enumerate(le.classes_)))  # to see mapping

# -----------------------------
# 4. Preprocess features
# -----------------------------
# Numeric columns are passed through untouched; object columns are one-hot encoded.
# NOTE(review): ColumnTransformer defaults to remainder="drop", so any column
# whose dtype is neither int64/float64 nor object (e.g. bool) is silently
# excluded from the model -- confirm that is intended for this dataset.
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = X.select_dtypes(include=['object']).columns.tolist()

preprocessor = ColumnTransformer([
    ("num", "passthrough", num_cols),
    # handle_unknown="ignore": categories unseen during fit encode as all zeros
    ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols)
])


# -----------------------------
# 5. Train/test split (80/20)
# -----------------------------
# Stratified so both splits keep the class proportions of y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

# -----------------------------
# 6. XGBoost model
# -----------------------------
num_classes = len(np.unique(y))
# `use_label_encoder` is intentionally not passed: the installed XGBoost
# reports it as an unused parameter, and labels are already integer-encoded
# by `le` above.
model = xgb.XGBClassifier(
    objective="multi:softprob",
    eval_metric="mlogloss",
    num_class=num_classes,
    learning_rate=0.08,
    max_depth=5,
    n_estimators=300,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

# Pipeline = preprocessing + model, so the saved artifact accepts raw feature frames.
clf = Pipeline([
    ("preprocess", preprocessor),
    ("xgb", model)
])

# Train
clf.fit(X_train, y_train)

# -----------------------------
# 7. Evaluation
# -----------------------------
# NOTE(review): if every class scores a perfect 1.00 on the held-out split,
# suspect a remaining leaky feature or labels derived deterministically from
# the features in this synthetic dataset -- verify before trusting the model.
y_pred = clf.predict(X_test)
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred, target_names=le.classes_))
print("\nConfusion Matrix:\n")
print(confusion_matrix(y_test, y_pred))

# -----------------------------
# 8. Save model + label encoder
# -----------------------------
# Both artifacts are needed at inference time: the pipeline predicts integer
# classes and the encoder maps them back to text labels.
joblib.dump(clf, "xgboost_telecom_pipeline.joblib")
joblib.dump(le, "label_encoder.joblib")
print("Saved: xgboost_telecom_pipeline.joblib and label_encoder.joblib")


Label mapping: {0: 'defector', 1: 'normal', 2: 'premium'}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Classification Report:

              precision    recall  f1-score   support

    defector       1.00      1.00      1.00        80
      normal       1.00      1.00      1.00       200
     premium       1.00      1.00      1.00       120

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400


Confusion Matrix:

[[ 80   0   0]
 [  0 200   0]
 [  0   0 120]]
Saved: xgboost_telecom_pipeline.joblib and label_encoder.joblib


In [4]:
import pandas as pd
import joblib
import numpy as np

# Restore the fitted pipeline and its label encoder from disk (in that order),
# then re-read the source CSV so customers can be looked up by ID.
clf, le = (
    joblib.load(artifact_path)
    for artifact_path in ("xgboost_telecom_pipeline.joblib", "label_encoder.joblib")
)
df = pd.read_csv("/content/telecom_customers_synthetic_balanced_download.csv")

def predict_customer_class(customer_id, model=None, encoder=None, data=None):
    """
    Look up one customer by ID and predict their class.

    Parameters
    ----------
    customer_id
        Value compared with ``==`` against the ``customer_id`` column.
    model : optional
        Fitted estimator exposing ``predict_proba``. Defaults to the
        module-level ``clf``.
    encoder : optional
        Label encoder exposing ``classes_`` and ``inverse_transform``.
        Defaults to the module-level ``le``.
    data : optional
        DataFrame containing a ``customer_id`` column plus the feature
        columns. Defaults to the module-level ``df``.

    Returns
    -------
    dict or str
        When the ID is found, a dict with keys ``customer_id``,
        ``predicted_class`` (text label) and ``probabilities`` (mapping of
        each entry of ``encoder.classes_`` to a float). When no row matches,
        the string ``"Customer ID <id> not found."``.
    """
    # Fall back to the notebook-level objects only when nothing was passed in.
    model = clf if model is None else model
    encoder = le if encoder is None else encoder
    data = df if data is None else data

    # find the row
    row = data[data["customer_id"] == customer_id]
    if row.empty:
        return f"Customer ID {customer_id} not found."

    # Prepare model input: drop leakage columns and label/customer_id.
    # errors="ignore" skips any of these columns that are absent.
    non_feature_cols = ["name", "payment_link", "support_number",
                        "due_date", "suspension_date", "churned", "label",
                        "customer_id"]
    features = row.drop(columns=non_feature_cols, errors="ignore")

    # Run the model once; the predicted class is the arg-max of the class
    # probabilities, so the label and the probabilities cannot disagree.
    # If several rows share the ID, only the first one is reported.
    probs = model.predict_proba(features)[0]  # shape (num_classes,)
    numeric_pred = int(np.argmax(probs))      # e.g. 0,1,2
    text_pred = encoder.inverse_transform([numeric_pred])[0]
    prob_dict = {cls: float(probs[i]) for i, cls in enumerate(encoder.classes_)}

    return {"customer_id": customer_id, "predicted_class": text_pred, "probabilities": prob_dict}

# Smoke test: run the lookup for one customer ID and print what comes back.
example_prediction = predict_customer_class("CUST00010")
print(example_prediction)


{'customer_id': 'CUST00010', 'predicted_class': 'defector', 'probabilities': {'defector': 0.9970023036003113, 'normal': 0.0026157107204198837, 'premium': 0.00038198556285351515}}
