In [4]:
# --- 1. Import Necessary Libraries ---
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
import joblib

print("--- Libraries Imported ---")

# --- 2. Load and COMBINE Data ---
# The two CSV files (named "training" and "testing") are read one after the
# other and stacked row-wise into a single frame. ignore_index=True discards
# the per-file row labels and gives the combined frame a fresh 0..n-1 index.
# The actual train/test partition is made later from this combined frame.
train_df, test_df_original = (
    pd.read_csv(csv_name)
    for csv_name in (
        'customer_churn_dataset-training-master.csv',
        'customer_churn_dataset-testing-master.csv',
    )
)
df = pd.concat([train_df, test_df_original], ignore_index=True)


# --- 3. Prepare the Combined Data ---
# CustomerID is a row identifier rather than a feature. errors='ignore' turns
# the drop into a no-op when the column is not present.
df = df.drop(columns=['CustomerID'], errors='ignore')

# Rows whose target (Churn) is missing are removed before any encoding or
# splitting happens. The drop is done in place and only when needed.
churn_is_missing = df['Churn'].isna()
if churn_is_missing.any():
    df.dropna(subset=['Churn'], inplace=True)

print("\n--- All Data Loaded, Combined, and Cleaned ---")

# --- 4. FIT Encoders on Combined Data & Save ---
# Every object-dtype column gets its own LabelEncoder, fitted on the full
# combined frame. The column is overwritten with its integer codes and the
# fitted encoder is kept in `label_encoders`, keyed by column name.
categorical_cols = df.select_dtypes(include=['object']).columns
label_encoders = {}

print("\n--- Encoding Categorical Features ---")
for column in categorical_cols:
    # Register a fresh encoder for this column, then fit + transform in one go.
    encoder = label_encoders.setdefault(column, LabelEncoder())
    df[column] = encoder.fit_transform(df[column])
    print(f"'{column}' column has been encoded.")

# Persist the {column name -> fitted encoder} mapping for the app.
joblib.dump(label_encoders, 'label_encoders.pkl')
print("\nLabel Encoders saved to 'label_encoders.pkl'")

# --- 5. Define Features (X) and Target (y) & Save Column Order ---
# y is the Churn column cast to int; X is every remaining column.
y = df['Churn'].astype(int)
X = df.drop('Churn', axis=1)

# The feature order is written to disk so the app can rebuild its input
# frame with columns in the same order the model was trained on.
model_columns = list(X.columns)
joblib.dump(model_columns, 'model_columns.pkl')
print("Model columns saved to 'model_columns.pkl'")

# --- 6. Split the COMBINED Data into New Train and Test Sets ---
# 80/20 hold-out split. stratify=y keeps the class proportions of y in both
# partitions, and random_state=42 makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print(f"\nData successfully split into new training ({len(X_train)} rows) and testing ({len(X_test)} rows) sets.")


# --- 7. FIT Scaler on the NEW Training Data & Save ---
# The scaler's statistics are learned from the training partition only, so
# nothing from the held-out rows influences the scaling.
scaler = StandardScaler().fit(X_train)

# Both partitions are transformed with those training-set statistics.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler so the app can apply the same transformation.
joblib.dump(scaler, 'scaler.pkl')
print("\nScaler saved to 'scaler.pkl'")

# --- 8. Train the XGBoost Model with Class Imbalance Handling ---
# scale_pos_weight is the ratio (#class-0 rows / #class-1 rows) in the
# training partition. The class counts are computed once and looked up by
# label (.loc) rather than relying on how `[]` treats integer keys.
class_counts = y_train.value_counts()
scale_pos_weight = class_counts.loc[0] / class_counts.loc[1]
print(f"\n--- Handling Class Imbalance with scale_pos_weight: {scale_pos_weight:.2f} ---")

# NOTE: `use_label_encoder` is intentionally not passed. Current XGBoost
# releases ignore it and warn that the parameter is "not used"; y is already
# an integer 0/1 column, so no label encoding is needed.
xgb_clf = XGBClassifier(
    eval_metric='logloss',
    scale_pos_weight=scale_pos_weight,
    random_state=42,
)
xgb_clf.fit(X_train_scaled, y_train)
print("\n--- XGBoost Model Training Complete ---")


# --- 9. Evaluate the Model on the NEW Test Set ---
# Hard class predictions feed the classification report and accuracy; the
# predicted probability of class 1 (second column of predict_proba) feeds
# the ROC AUC.
y_pred = xgb_clf.predict(X_test_scaled)
class_probabilities = xgb_clf.predict_proba(X_test_scaled)
y_proba = class_probabilities[:, 1]

print("\n\n--- MODEL PERFORMANCE EVALUATION ---")
print("\nClassification Report:")
# Per-class precision / recall / F1 on the held-out rows.
print(classification_report(y_test, y_pred))

test_accuracy = accuracy_score(y_test, y_pred)
print(f"Overall Accuracy: {test_accuracy:.4f}")
test_roc_auc = roc_auc_score(y_test, y_proba)
print(f"ROC AUC Score: {test_roc_auc:.4f}")
print("\n--- Evaluation Complete ---")


# --- 10. Save the Trained Model ---
# The fitted classifier is the last artifact written, after the encoders,
# the column order and the scaler saved earlier in this script.
joblib.dump(value=xgb_clf, filename='churn_model.pkl')
print("\nTrained model saved to 'churn_model.pkl'")
print("\n✅ All necessary files have been created successfully!")


--- Libraries Imported ---

--- All Data Loaded, Combined, and Cleaned ---

--- Encoding Categorical Features ---
'Gender' column has been encoded.
'Subscription Type' column has been encoded.
'Contract Length' column has been encoded.

Label Encoders saved to 'label_encoders.pkl'
Model columns saved to 'model_columns.pkl'

Data successfully split into new training (404164 rows) and testing (101042 rows) sets.

Scaler saved to 'scaler.pkl'

--- Handling Class Imbalance with scale_pos_weight: 0.80 ---


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



--- XGBoost Model Training Complete ---


--- MODEL PERFORMANCE EVALUATION ---

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.86      0.92     44943
           1       0.90      0.99      0.94     56099

    accuracy                           0.93    101042
   macro avg       0.94      0.92      0.93    101042
weighted avg       0.94      0.93      0.93    101042

Overall Accuracy: 0.9320
ROC AUC Score: 0.9530

--- Evaluation Complete ---

Trained model saved to 'churn_model.pkl'

✅ All necessary files have been created successfully!
