In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from imblearn.combine import SMOTEENN
from catboost import CatBoostClassifier

# Load the dataset
original_df = pd.read_csv("C:/Users/abhir/Downloads/WA_Fn-UseC_-Telco-Customer-Churn.csv")

# Preprocess the data
# Convert 'TotalCharges' to numeric, forcing errors to NaN
original_df['TotalCharges'] = pd.to_numeric(original_df['TotalCharges'], errors='coerce')
original_df['TotalCharges'].fillna(original_df['TotalCharges'].median(), inplace=True)

# Encode the target 'Churn' (binary classification: Yes/No)
label_encoder = LabelEncoder()
original_df['Churn'] = label_encoder.fit_transform(original_df['Churn'])

# Separate features (X) and target (y)
X = original_df[['MonthlyCharges', 'tenure', 'TotalCharges']].copy()
y = original_df['Churn']

# Scale numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply SMOTEENN to handle class imbalance
smoteenn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smoteenn.fit_resample(X_scaled, y)

# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# ----------- CatBoost Model with Hyperparameter Tuning -----------
catboost_model = CatBoostClassifier(verbose=0, random_seed=42)

# Define the parameter grid for CatBoost
param_grid = {
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1],
    'iterations': [100, 200, 300],
    'l2_leaf_reg': [1, 3, 5, 7]
}

# Use RandomizedSearchCV to find the best parameters
catboost_search = RandomizedSearchCV(estimator=catboost_model, param_distributions=param_grid, 
                                     n_iter=10, cv=3, scoring='accuracy', random_state=42, n_jobs=-1)
catboost_search.fit(X_train, y_train)
best_catboost = catboost_search.best_estimator_

# Predict on the test set
y_pred = best_catboost.predict(X_test)

# Evaluate the performance of the CatBoost model
print("CatBoost Model Classification Report:")
print(classification_report(y_test, y_pred))

accuracy = accuracy_score(y_test, y_pred)
print(f"CatBoost Model Accuracy: {accuracy * 100:.2f}%")
import joblib

# Save the model
joblib.dump(best_catboost, 'catboost_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  original_df['TotalCharges'].fillna(original_df['TotalCharges'].median(), inplace=True)


CatBoost Model Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.94      0.96       580
           1       0.95      0.97      0.96       604

    accuracy                           0.96      1184
   macro avg       0.96      0.96      0.96      1184
weighted avg       0.96      0.96      0.96      1184

CatBoost Model Accuracy: 95.78%


['label_encoder.pkl']