In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os

# Add the src directory to the Python path
sys.path.append(os.path.abspath(os.path.join('..', 'src')))


In [3]:
import numpy as np
from sklearn.metrics import recall_score,accuracy_score,precision_score
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier 
import warnings
import os
from pathlib import Path
from dotenv import load_dotenv


from data.data_utils import DataLoader,split_data

In [4]:
warnings.filterwarnings("ignore")
np.random.seed(42)

In [5]:
env_path = Path('.env')
load_dotenv(env_path)

root_dir = Path(os.getenv('ROOT_DIRECTORY'))
data_path = root_dir/'data'/'raw'/'WA_Fn-UseC_-Telco-Customer-Churn.csv'

# Data preprocessing

The `DataLoader` class is responsible for loading data from a CSV file and performing basic preprocessing steps, including handling categorical variables and missing values. It also provides a method to preprocess data by creating dummy variables for categorical features. The `split_data` function then splits the preprocessed data into training, testing, and validation datasets, ensuring a balanced distribution of the target variable.


For more detailed information, you can refer to the [documentation](../docs/DataLoader.md) or check out the [source code](../src/data/data_utils.py).


In [6]:
#load the data
data_loader = DataLoader(data_path)
df = data_loader.load_data()
encd_df = data_loader.preprocess_data()

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   index             7043 non-null   int64  
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   object 
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


## Data Splitting

In [8]:
train_set,test_set,train_set_splitted,val_set = split_data(encd_df.dropna())
X_train , y_train , X_test , y_test = train_set.drop(columns=['Churn','index']) , train_set['Churn'] , test_set.drop(columns=['index','Churn']) , test_set['Churn']
X_train_splitted , y_train_splitted, X_val,y_val = train_set_splitted.drop('Churn',axis = 1 ) , train_set_splitted['Churn'] , val_set.drop('Churn',axis = 1) , val_set['Churn']
X_train_splitted.shape

(4788, 47)

In [9]:
# there is class imbalance that can affect results
# To experiment with balance dataset I am using SMOTE algorithm.
X_train_smoted,y_train_smoted = SMOTE().fit_resample(X_train_splitted,y_train_splitted)

In [10]:
smoted_df = X_train_smoted
smoted_df['Churn'] = y_train_smoted
smoted_df = smoted_df.drop(columns='index').reset_index()

In [11]:
# train_set.to_csv(root_dir/'data'/'interim'/'train_set.csv',index=False)
# encd_df.to_csv(root_dir/'data'/'interim'/'encd_df.csv',index=False)
# smoted_df.to_csv(root_dir/'data'/'interim'/'smoted_df.csv',index=False)
# val_set.to_csv(root_dir/'data'/'interim'/'val_set.csv',index=False)
# test_set.to_csv(root_dir/'data'/'interim'/'test_set.csv',index=False)
# train_set_splitted.to_csv(root_dir/'data'/'interim'/'train_set_splitted.csv',index=False)

### Let us define the base line score for our future work.

In [12]:
model = LGBMClassifier(verbose=-1).fit(X_train_splitted,y_train_splitted)
y_preds = model.predict(X_val)

I am going to use weighted recall (it's formula is ``0.65 * recall + 0.35 * precision``) as my primary evaluation metric. As recall is more important in this Project.

In [13]:
print("accuracy score is " ,accuracy_score(y_true=y_val,y_pred=y_preds))

print("precision score is " ,precision_score(y_true=y_val,y_pred=y_preds))

print("recall score is " ,recall_score(y_true=y_val,y_pred=y_preds))

print("weighted recall  score is " , 0.65 *recall_score(y_true=y_val,y_pred=y_preds) + 0.35 * precision_score(y_true=y_val,y_pred=y_preds))

accuracy score is  0.7836879432624113
precision score is  0.6171428571428571
recall score is  0.48214285714285715
weighted recall  score is  0.5293928571428571
