In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('Customer-Churn.csv')

In [3]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [5]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'].fillna(0, inplace=True)

In [6]:
print(df['TotalCharges'].head())
print(df['TotalCharges'].isnull().sum())

0      29.85
1    1889.50
2     108.15
3    1840.75
4     151.65
Name: TotalCharges, dtype: float64
0


In [7]:
df['Churn'] = df['Churn'].map({'No': 0, 'Yes': 1})

In [8]:
df['Churn'].head()

0    0
1    0
2    1
3    0
4    1
Name: Churn, dtype: int64

In [9]:
df['Churn'].unique()

array([0, 1])

In [10]:
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [11]:
from sklearn.model_selection import train_test_split
X = df.drop(columns=['customerID','Churn'])
y = df.Churn
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(5634, 19)
(1409, 19)
(5634,)
(1409,)


In [12]:
categorical_col = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService','OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies','Contract', 'PaperlessBilling', 'PaymentMethod']
numerical_col = ['tenure', 'MonthlyCharges', 'TotalCharges']


In [13]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
X_train_cat = X_train[categorical_col]
X_train_num = X_train[numerical_col]

X_test_cat = X_test[categorical_col]
X_test_num = X_test[numerical_col]

#standardscaler
scaler = StandardScaler()
X_train_num_scaled = scaler.fit_transform(X_train_num)
X_test_num_scaled = scaler.transform(X_test_num)

X_train_num_scaled = pd.DataFrame(X_train_num_scaled, columns=numerical_col)
X_test_num_scaled = pd.DataFrame(X_test_num_scaled, columns=numerical_col)

#onehotencoder
encoder = OneHotEncoder(sparse_output=False)
X_train_cat_encoded = encoder.fit_transform(X_train_cat)
X_test_cat_encoded = encoder.transform(X_test_cat)

encoded_cat_columns = encoder.get_feature_names_out(categorical_col)

X_train_cat_encoded = pd.DataFrame(X_train_cat_encoded, columns=encoded_cat_columns)
X_test_cat_encoded = pd.DataFrame(X_test_cat_encoded, columns=encoded_cat_columns)

#concat numerical and categorical datasets
X_train_processed = pd.concat([X_train_num_scaled, X_train_cat_encoded], axis=1)
X_test_processed = pd.concat([X_test_num_scaled, X_test_cat_encoded], axis=1)

print(X_train_processed.shape, X_test_processed.shape)

(5634, 46) (1409, 46)


In [14]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import xgboost as xgb
import lightgbm as lgb

rf_model = RandomForestClassifier(random_state=1)
et_model = ExtraTreesClassifier(random_state=1)
xgb_model = xgb.XGBClassifier(random_state=1, use_label_encoder=False, eval_metric='logloss')
lgb_model = lgb.LGBMClassifier(random_state=1)

# Train Random Forest
rf_model.fit(X_train_processed, y_train)
rf_pred = rf_model.predict(X_test_processed)
rf_accuracy = accuracy_score(y_test, rf_pred)
rf_cm = confusion_matrix(y_test, rf_pred)
print("Random Forest Classifier:")
print(f"Accuracy: {rf_accuracy:.4f}")
print(classification_report(y_test, rf_pred))
print(f"rf_cm: {rf_cm}")

# Train Extra Trees model
et_model.fit(X_train_processed, y_train)
et_pred = et_model.predict(X_test_processed)
et_accuracy = accuracy_score(y_test, et_pred)
et_cm = confusion_matrix(y_test, et_pred)
print("Extra Trees Classifier:")
print(f"Accuracy: {et_accuracy:.4f}")
print(classification_report(y_test, et_pred))
print(f"et_cm: {et_cm}")

# Train XGBoost model
xgb_model.fit(X_train_processed, y_train)
xgb_pred = xgb_model.predict(X_test_processed)
xgb_accuracy = accuracy_score(y_test, xgb_pred)
xgb_cm = confusion_matrix(y_test, xgb_pred)
print("XGBoost Classifier:")
print(f"Accuracy: {xgb_accuracy:.4f}")
print(classification_report(y_test, xgb_pred))
print(f"xgb_cm: {xgb_cm}")

# Train LightGBM model
lgb_model.fit(X_train_processed, y_train)
lgb_pred = lgb_model.predict(X_test_processed)
lgb_accuracy = accuracy_score(y_test, lgb_pred)
lgb_cm = confusion_matrix(y_test, lgb_pred)
print("LightGBM Classifier:")
print(f"Accuracy: {lgb_accuracy:.4f}")
print(classification_report(y_test, lgb_pred))
print(f"lgb_cm: {lgb_cm}")

Random Forest Classifier:
Accuracy: 0.7913
              precision    recall  f1-score   support

           0       0.85      0.88      0.86      1061
           1       0.58      0.53      0.56       348

    accuracy                           0.79      1409
   macro avg       0.72      0.71      0.71      1409
weighted avg       0.79      0.79      0.79      1409

rf_cm: [[929 132]
 [162 186]]
Extra Trees Classifier:
Accuracy: 0.7672
              precision    recall  f1-score   support

           0       0.83      0.86      0.85      1061
           1       0.53      0.47      0.50       348

    accuracy                           0.77      1409
   macro avg       0.68      0.67      0.67      1409
weighted avg       0.76      0.77      0.76      1409

et_cm: [[916 145]
 [183 165]]
XGBoost Classifier:
Accuracy: 0.7935
              precision    recall  f1-score   support

           0       0.86      0.87      0.86      1061
           1       0.59      0.56      0.57       348

 

In [15]:
from sklearn.model_selection import RandomizedSearchCV
# Hyperparameter tuning using RandomizedSearchCV
param_dist = {
    'n_estimators' : [50, 100, 300, 500, 1000],
    'min_samples_split' : [2, 3, 5, 7, 9],
    'min_samples_leaf' : [1, 2, 4, 6, 8],
    'max_features' : ['auto', 'sqrt', 'log2', None] }

rs_et_model = ExtraTreesClassifier(random_state=1)
rs_et_search = RandomizedSearchCV(estimator=rs_et_model, param_distributions=param_dist, n_iter=10, 
                                  cv=5, verbose=1, random_state=1, n_jobs=-1, scoring='accuracy')
rs_et_search.fit(X_train_processed, y_train)

# Get the best parameters from the RandomizedSearchCV
best_params = rs_et_search.best_params_
print(f"Best Parameters: {best_params}")

# Train the Extra Trees model with the best parameters
optimal_et_model = ExtraTreesClassifier(**best_params, random_state=1)
optimal_et_model.fit(X_train_processed, y_train)
optimal_et_pred = optimal_et_model.predict(X_test_processed)
optimal_et_accuracy = accuracy_score(y_test, optimal_et_pred)
optimal_et_report = classification_report(y_test, optimal_et_pred)
optimal_et_cm = confusion_matrix(y_test, optimal_et_pred)

print("Optimal Extra Trees Classifier:")
print(f"Accuracy: {optimal_et_accuracy:.4f}")
print(optimal_et_report)

# Compare accuracies
print(f"Initial Extra Trees Accuracy: {et_accuracy:.4f}")
print(f"Optimal Extra Trees Accuracy: {optimal_et_accuracy:.4f}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits


20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/xylin/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/xylin/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/Users/xylin/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/Users/xylin/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/_param_validation.py", line 96, in validate_

Best Parameters: {'n_estimators': 1000, 'min_samples_split': 9, 'min_samples_leaf': 8, 'max_features': 'sqrt'}
Optimal Extra Trees Classifier:
Accuracy: 0.8041
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1061
           1       0.62      0.53      0.57       348

    accuracy                           0.80      1409
   macro avg       0.74      0.71      0.72      1409
weighted avg       0.80      0.80      0.80      1409

Initial Extra Trees Accuracy: 0.7672
Optimal Extra Trees Accuracy: 0.8041


In [16]:
# Get feature importances
feature_importances = optimal_et_model.feature_importances_

# Create a DataFrame for the feature importances
features = X_train_processed.columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})

# Sort the DataFrame by importance
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Get the two most important features
top_two_features = importance_df.head(2)

print("Top two most important features:")
print(top_two_features)

Top two most important features:
                    Feature  Importance
37  Contract_Month-to-month    0.152237
0                    tenure    0.092800
