In [None]:
%load_ext autoreload
%autoreload 2

In [18]:
%load_ext autoreload
%autoreload 2
import sys
import os

# Add the src directory to the Python path
sys.path.append(os.path.abspath(os.path.join('..', 'src')))


In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import lightgbm as lgb
import warnings 

from data.data_utils import DataLoader,split_data

In [5]:
warnings.filterwarnings("ignore")
np.random.seed(42)

# Data preprocessing

In [6]:
#load the data
data_loader = DataLoader("../data/raw/WA_Fn-UseC_-Telco-Customer-Churn.csv")
df = data_loader.load_data()
encd_df = data_loader.preprocess_data()

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   index             7043 non-null   int64  
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   object 
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [8]:
train_set,test_set,train_set_splitted,val_set = split_data(encd_df.dropna())
X_train , y_train , X_test , y_test = train_set.drop(columns=['Churn','index']) , train_set['Churn'] , test_set.drop(columns=['index','Churn']) , test_set['Churn']
X_train_splitted , y_train_splitted, X_val,y_val = train_set_splitted.drop('Churn',axis = 1 ) , train_set_splitted['Churn'] , val_set.drop('Churn',axis = 1) , val_set['Churn']
X_train_splitted.shape

(4781, 47)

In [9]:
# there is class imbalance that can affect results
# To experiment with balance dataset I am using SMOTE algorithm.
X_train_smoted,y_train_smoted = SMOTE().fit_resample(X_train_splitted,y_train_splitted)

In [10]:
smoted_df = X_train_smoted
smoted_df['Churn'] = y_train_smoted
smoted_df = smoted_df.drop(columns='index').reset_index()

## Feature Selection and Importance

### First we will train a lgb model. This model will act as base line model.

In [19]:
class GetFeatureImportance:
    def __init__(self, train_data, test_data, df=None, model_name=None):
        self.model_name = model_name  # Store the model name
        self.train_data = train_data  # Training data
        self.test_data = test_data  # Test data
        self.df = df  # Additional DataFrame if needed

    def train_lightgbm(self, params=None):
        # Get feature columns excluding 'Churn'
        columns = self.train_data.drop('Churn', axis=1).columns
        
        # Create LightGBM datasets for training and testing
        train_data = lgb.Dataset(self.train_data.drop('Churn', axis=1), label=self.train_data['Churn'])
        test_data = lgb.Dataset(self.test_data.drop('Churn', axis=1), label=self.test_data['Churn'], reference=train_data)
        
        # Define default parameters if none are provided
        if params is None: 
            params = {
                'objective': 'binary',
                'boosting_type': 'gbdt',
                'metric': 'auc',
                'num_leaves': 31,
                'learning_rate': 0.05,
                'feature_fraction': 0.9,
                'seed': 42,
                # 'verbose': -1,  # Uncomment if verbose output is needed
            }
        
        evals_result = {}  # Dictionary to store evaluation results
        callbacks = [lgb.early_stopping(stopping_rounds=200), lgb.record_evaluation(evals_result)]
        
        # Train the LightGBM model
        booster = lgb.train(params, train_data, num_boost_round=3000, valid_sets=[test_data], callbacks=callbacks)
        
        # Get the AUC score from the evaluation results
        auc_score = evals_result['valid_0']['auc'][-1]
        
        # Get feature importance
        feature_importance = booster.feature_importance(importance_type='gain')
        feature_importance_df = pd.DataFrame({
            'features': columns,
            'importance': feature_importance
        }).sort_values(by='importance', ascending=False)
        
        # Add the AUC score to the feature importance DataFrame
        feature_importance_df['auc'] = auc_score
        
        return feature_importance_df, booster

In [20]:
feature_importance_df ,_= GetFeatureImportance(train_set_splitted.drop(columns='index'),val_set.drop(columns='index')).train_lightgbm()

[LightGBM] [Info] Number of positive: 1271, number of negative: 3510
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001022 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 669
[LightGBM] [Info] Number of data points in the train set: 4781, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.265844 -> initscore=-1.015812
[LightGBM] [Info] Start training from score -1.015812
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[31]	valid_0's auc: 0.832604


In [21]:
feature_importance_df.head(20)

Unnamed: 0,features,importance,auc
37,Contract_Month-to-month,8269.075111,0.817638
0,tenure,2270.934093,0.817638
1,MonthlyCharges,1471.169732,0.817638
17,InternetService_Fiber optic,1219.658396,0.817638
2,TotalCharges,1137.229452,0.817638
19,OnlineSecurity_No,1030.521527,0.817638
28,TechSupport_No,682.957915,0.817638
44,PaymentMethod_Electronic check,345.315008,0.817638
16,InternetService_DSL,237.240579,0.817638
38,Contract_One year,217.544422,0.817638


In [22]:
unimportant_features = feature_importance_df[feature_importance_df['importance']==0]['features'].to_list()
unimportant_features.append('index')

In [23]:
feature_importance_df ,_= GetFeatureImportance(train_set_splitted.drop(columns=unimportant_features),val_set.drop(columns=unimportant_features)).train_lightgbm()

[LightGBM] [Info] Number of positive: 1271, number of negative: 3510
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001044 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 651
[LightGBM] [Info] Number of data points in the train set: 4781, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.265844 -> initscore=-1.015812
[LightGBM] [Info] Start training from score -1.015812
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[15]	valid_0's auc: 0.832388


In [24]:
feature_importance_df

Unnamed: 0,features,importance,auc
28,Contract_Month-to-month,6174.999069,0.819348
0,tenure,1821.627887,0.819348
16,InternetService_Fiber optic,1105.264971,0.819348
17,OnlineSecurity_No,965.017232,0.819348
2,TotalCharges,833.237064,0.819348
1,MonthlyCharges,768.368939,0.819348
23,TechSupport_No,335.327837,0.819348
15,InternetService_DSL,265.111167,0.819348
35,PaymentMethod_Electronic check,213.877851,0.819348
29,Contract_One year,152.41819,0.819348


In [25]:
# let's examine how the smoted dataset is peroforming
feature_importance_df,_ = GetFeatureImportance(smoted_df.drop(columns='index'),val_set.drop(columns='index')).train_lightgbm()

[LightGBM] [Info] Number of positive: 3510, number of negative: 3510
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000950 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 669
[LightGBM] [Info] Number of data points in the train set: 7020, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[33]	valid_0's auc: 0.835448


In [26]:
feature_importance_df

Unnamed: 0,features,importance,auc
37,Contract_Month-to-month,20111.944012,0.821965
19,OnlineSecurity_No,4726.000596,0.821965
17,InternetService_Fiber optic,4701.457024,0.821965
28,TechSupport_No,2420.10026,0.821965
0,tenure,2383.727351,0.821965
2,TotalCharges,1893.732107,0.821965
44,PaymentMethod_Electronic check,1490.101055,0.821965
16,InternetService_DSL,1271.666442,0.821965
1,MonthlyCharges,1225.417672,0.821965
38,Contract_One year,934.042466,0.821965
