###  2a. Build a machine learning model to predict customer churn based on a given dataset. Train the model using appropriate algorithms and evaluate its performance.

In [1]:
#importing libraries
import pandas as pd
import numpy as np
import requests

In [2]:
# Load the raw churn dataset (CSV) directly from its GitHub raw URL into a DataFrame.
df_csv=pd.read_csv("https://raw.githubusercontent.com/d1b2/DSA_practice/main/assign_4/data_formats/data.csv")

In [3]:
def data_validation(dataframe):
    """Check that ``dataframe`` matches the expected churn-dataset schema.

    Three checks run in order and validation stops at the first failure:

    1. every expected input column is present;
    2. the target column ``Churn`` is present;
    3. the distinct values of ``Churn`` are exactly ``'No'`` and ``'Yes'``.

    A success or failure message is printed after each step.

    Args:
        dataframe: pandas DataFrame to validate.

    Returns:
        bool: ``True`` when all checks pass, ``False`` otherwise.
    """
    col_list = ['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
                'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
                'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
                'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
                'MonthlyCharges', 'TotalCharges']
    for col in col_list:
        if col not in dataframe.columns:
            # Name the missing column so the message is actionable.
            print(f'{col} Column not present in dataset!!! Data Validation failed.!')
            return False
    print('Input Columns Validation successful!')

    target_col = 'Churn'
    if target_col not in dataframe.columns:
        print('Target Column not present in dataset!!! Data Validation failed.!!')
        return False
    print('Target Columns Validation successful!!')

    # Compare as sets: sorting labels that mix strings with a missing value
    # (NaN/None) raises TypeError, whereas a set simply compares unequal.
    if set(dataframe[target_col].unique()) != {'No', 'Yes'}:
        print('Target Column do not have valid domain values!!! Data Validation failed.!!!')
        return False
    print('Data Validation succesful!!!')
    return True

In [4]:
def data_cleansing(dataframe):
    """Return a cleaned copy of the churn ``dataframe``.

    Steps, in order:

    1. blank (``' '``) ``TotalCharges`` entries become NaN and every row
       containing any NaN is dropped;
    2. ``TotalCharges`` is cast to float;
    3. the ``customerID`` column is dropped;
    4. duplicate rows are removed;
    5. the index is reset to 0..n-1 (always, so positional and label indexing
       agree for downstream code);
    6. a non-numeric ``Churn`` column is encoded as 1 for ``'Yes'`` and 0 for
       anything else.

    The input frame is not modified; all work happens on a copy.

    Args:
        dataframe: pandas DataFrame containing at least ``TotalCharges``,
            ``customerID`` and ``Churn``.

    Returns:
        pandas.DataFrame: the cleaned frame.

    Raises:
        KeyError: if ``TotalCharges``, ``customerID`` or ``Churn`` is missing.
        ValueError: if a ``TotalCharges`` entry cannot be converted to float.
    """
    # assign() works on a copy; avoids the chained `df[col].replace(..., inplace=True)`
    # pattern, which does not update the parent frame under pandas Copy-on-Write.
    cleaned = dataframe.assign(
        TotalCharges=dataframe['TotalCharges'].replace(' ', np.nan)
    )
    cleaned = cleaned.dropna(how='any', axis=0)
    print('Null values in TotalCharges column deleted.')
    cleaned = cleaned.astype({'TotalCharges': 'float'})

    # The identifier column carries no predictive information.
    cleaned = cleaned.drop(columns='customerID')
    print('customerID column deleted.')

    # Duplicates are counted after dropping the ID, since IDs make every row unique.
    num_duplicates = cleaned.duplicated().sum()
    if num_duplicates != 0:
        cleaned = cleaned.drop_duplicates()
        print(f'Number of duplicate rows deleted: {num_duplicates}.')
    else:
        print('No duplicate rows present.')
    # Reset unconditionally: dropna() above may already have left gaps in the index.
    cleaned = cleaned.reset_index(drop=True)

    target_col = 'Churn'
    if not pd.api.types.is_numeric_dtype(cleaned[target_col]):
        encoded = (cleaned[target_col] == 'Yes').astype('int64')
        cleaned = cleaned.assign(**{target_col: encoded})
        print("Target feature converted numeric type.")
    else:
        print("Target feature is numeric type.")

    print("Data Cleansing completed")

    return cleaned

In [5]:
# Validate the raw frame's columns and target values before cleansing; displays True on success.
data_validation(df_csv)

Input Columns Validation successful!
Target Columns Validation successful!!
Data Validation succesful!!!


True

In [6]:
# Clean the data and rebind df_csv to the result (the raw frame is no longer available after this cell).
df_csv = data_cleansing(df_csv)

Null values in TotalCharges column deleted.
customerID column deleted.
Number of duplicate rows deleted: 22.
Target feature converted numeric type.
Data Cleansing completed


In [7]:
# Preview the first five rows of the cleaned frame.
df_csv.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1


In [8]:
# Check column dtypes and non-null counts after cleansing.
df_csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7010 entries, 0 to 7009
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7010 non-null   object 
 1   SeniorCitizen     7010 non-null   int64  
 2   Partner           7010 non-null   object 
 3   Dependents        7010 non-null   object 
 4   tenure            7010 non-null   int64  
 5   PhoneService      7010 non-null   object 
 6   MultipleLines     7010 non-null   object 
 7   InternetService   7010 non-null   object 
 8   OnlineSecurity    7010 non-null   object 
 9   OnlineBackup      7010 non-null   object 
 10  DeviceProtection  7010 non-null   object 
 11  TechSupport       7010 non-null   object 
 12  StreamingTV       7010 non-null   object 
 13  StreamingMovies   7010 non-null   object 
 14  Contract          7010 non-null   object 
 15  PaperlessBilling  7010 non-null   object 
 16  PaymentMethod     7010 non-null   object 


### 3c Design a model validation strategy that incorporates stratified sampling to handle imbalanced datasets.

#### To handle the imbalanced dataset, a StratifiedShuffleSplit is performed on the data

In [9]:
from sklearn.model_selection import StratifiedShuffleSplit

In [10]:
def stratified_split(df,col,target):
    """Split ``df`` into train/test feature and target sets with a stratified shuffle split.

    The numeric column ``col`` is binned into five quantile-based categories
    (labels 1-5) and a single 80/20 ``StratifiedShuffleSplit``
    (``random_state=42``) is stratified on those bins. The binning column is
    created on a working copy only, so the caller's ``df`` is left unchanged.

    NOTE(review): stratification is on the binned ``col``, not on ``target``;
    the class proportions of ``target`` are not explicitly preserved.

    Args:
        df: pandas DataFrame holding both features and the target.
        col: name of the numeric column whose quantile bins define the strata.
        target: name of the target column to separate from the features.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)`` with indices reset to
        0..n-1. If ``col`` is missing or not numeric, an error message string
        is returned instead (kept for backward compatibility; callers that
        unpack four values will fail on it).
    """
    if col not in df:
        return f"{col} column not found in dataframe."
    if not np.issubdtype(df[col].dtype, np.number):
        return f"{col} is not numeric"
    print(f'Shape of original dataframe: {df.shape}')

    # Work on a copy so the helper bin column never leaks into the caller's frame.
    work = df.copy()
    cat_col = col + "_cat"
    work[cat_col] = pd.qcut(work[col], q=5, labels=[1, 2, 3, 4, 5])
    print(f"New {cat_col} created in dataframe for performing stratifiesd schuflle split")

    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    # n_splits=1, so the generator yields exactly one (train, test) pair.
    train_index, test_index = next(split.split(work, work[cat_col]))
    # split() yields *positional* indices, so use iloc; loc would select by
    # label and pick wrong rows (or raise) whenever the index is not 0..n-1.
    strat_train_set = work.iloc[train_index].drop(columns=[cat_col])
    strat_test_set = work.iloc[test_index].drop(columns=[cat_col])
    print(f"Stratifiesd schuflle split performed.")
    strat_train_set = strat_train_set.reset_index(drop=True)
    strat_test_set = strat_test_set.reset_index(drop=True)

    X_train, y_train = strat_train_set.drop(target, axis=1), strat_train_set[target]
    X_test, y_test = strat_test_set.drop(target, axis=1), strat_test_set[target]
    print(f'Shape of X_train: {X_train.shape} | Shape of X_test: {X_test.shape}')
    print(f'Shape of y_train: {y_train.shape} | Shape of y_test: {y_test.shape}')

    return X_train, X_test, y_train, y_test

In [11]:
# Split the cleaned data 80/20, stratified on quintile bins of MonthlyCharges; 'Churn' is the target.
X_train,X_test,y_train,y_test=stratified_split(df_csv,'MonthlyCharges','Churn')

Shape of original dataframe: (7010, 20)
New MonthlyCharges_cat created in dataframe for performing stratifiesd schuflle split
Stratifiesd schuflle split performed.
Shape of X_train: (5608, 19) | Shape of X_test: (1402, 19)
Shape of y_train: (5608,) | Shape of y_test: (1402,)


### 2b. Develop a model training pipeline that incorporates feature engineering techniques such as one-hot encoding, feature scaling, and dimensionality reduction.

In [12]:
def get_num_cat_columns(dataframe):
    """Partition the columns of ``dataframe`` into numerical and categorical names.

    A column is categorical when its dtype is object, string or pandas
    ``category``; it is numerical when its dtype is any non-boolean numeric
    type (any integer or float width, not only ``int64``/``float64``).
    Columns of other dtypes (e.g. bool, datetime) appear in neither list.
    Column order is preserved.

    Args:
        dataframe: pandas DataFrame to inspect.

    Returns:
        tuple[list, list]: ``(num_columns, cat_columns)``.
    """
    dtype_checks = pd.api.types
    num_columns, cat_columns = [], []
    for name in dataframe.columns:
        dtype = dataframe[name].dtype
        # Test for categorical-like dtypes first: a `category` column whose
        # categories are numbers or booleans must still be treated as categorical.
        if (
            isinstance(dtype, pd.CategoricalDtype)
            or dtype_checks.is_object_dtype(dtype)
            or dtype_checks.is_string_dtype(dtype)
        ):
            cat_columns.append(name)
        elif dtype_checks.is_bool_dtype(dtype):
            # is_numeric_dtype() reports bool as numeric; keep bools out of both lists.
            continue
        elif dtype_checks.is_numeric_dtype(dtype):
            num_columns.append(name)

    print("Lists of numerical and categorical features generated.")
    return num_columns, cat_columns

In [13]:
# Split the training features' column names into numerical and categorical lists.
num,cat=get_num_cat_columns(X_train)

Lists of numerical and categorical features generated.


In [14]:
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.decomposition import PCA

from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score

In [15]:
# NOTE(review): repeats the earlier get_num_cat_columns(X_train) call with the same argument;
# `num`/`cat` are not referenced by the later cells visible here — candidate for removal.
num,cat=get_num_cat_columns(X_train)

Lists of numerical and categorical features generated.


In [16]:
def get_column_transformer(dataframe):
    """Build the preprocessing ``ColumnTransformer`` for ``dataframe``.

    Column names are classified with ``get_num_cat_columns``. Categorical
    columns are one-hot encoded (first level dropped, dense output) and
    numerical columns are standardised; any remaining column is passed
    through unchanged.

    Args:
        dataframe: pandas DataFrame whose columns determine the transformer layout.

    Returns:
        ColumnTransformer: an unfitted transformer.
    """
    num_cols, cat_cols = get_num_cat_columns(dataframe)

    # Encoder goes first, so encoded columns precede the scaled ones in the output.
    steps = [
        ('enc', OneHotEncoder(drop='first', sparse_output=False), cat_cols),
        ('scaler', StandardScaler(), num_cols),
    ]
    preprocessor = ColumnTransformer(transformers=steps, remainder='passthrough')

    print("Column transformer object generated.")
    return preprocessor

In [17]:
# Random-forest pipeline: column preprocessing -> PCA down to 10 components -> classifier.
pipe1 = Pipeline(
    steps=[
        ('trf1', get_column_transformer(X_train)),
        ('trf2', PCA(n_components=10)),
        ('model', RandomForestClassifier(random_state=42)),
    ]
)

Lists of numerical and categorical features generated.
Column transformer object generated.


In [18]:
# Logistic-regression pipeline: column preprocessing -> PCA down to 2 components -> classifier.
pipe2 = Pipeline(
    steps=[
        ('trf1', get_column_transformer(X_train)),
        ('trf2', PCA(n_components=2)),
        ('model', LogisticRegression(random_state=42)),
    ]
)

Lists of numerical and categorical features generated.
Column transformer object generated.


### 3b. Perform model validation using different evaluation metrics such as accuracy, precision, recall, and F1 score for a binary classification problem.

In [19]:
# Fit the random-forest pipeline on the training split and predict the held-out test split.
pipe1.fit(X_train, y_train)
y_pred=pipe1.predict(X_test)

# Each metric is rounded to two decimals for display.
acc=round(accuracy_score(y_test, y_pred),2)
pr=round(precision_score(y_test, y_pred),2)
rec=round(recall_score(y_test, y_pred),2)
f1= round(f1_score(y_test, y_pred),2)

print(f"{' '*15}Score metrics of Random Forest Classifier pipeline \n{'='*80}")
# `rec` holds the recall score, so it is labelled "Recall" (it was printed under a second "Precision" label).
print(f"Accuracy : {acc}  ||  Precision : {pr}  ||  Recall : {rec}  ||  F1 Score : {f1}")

               Score metrics of Random Forest Classifier pipeline 
Accuracy : 0.78  ||  Precison : 0.6  ||  Precision : 0.42  ||  F1 Score : 0.5


In [20]:
# Fit the logistic-regression pipeline on the training split and predict the held-out test split.
pipe2.fit(X_train, y_train)
y_pred=pipe2.predict(X_test)
# Each metric is rounded to two decimals for display.
acc=round(accuracy_score(y_test, y_pred),2)
pr=round(precision_score(y_test, y_pred),2)
rec=round(recall_score(y_test, y_pred),2)
f1= round(f1_score(y_test, y_pred),2)

print(f"{' '*15}Score metrics of Logistic Regression Classifier pipeline \n{'='*80}")
# `rec` holds the recall score, so it is labelled "Recall" (it was printed under a second "Precision" label).
print(f"Accuracy : {acc}  ||  Precision : {pr}  ||  Recall : {rec}  ||  F1 Score : {f1}")

               Score metrics of Logistic Regression Classifier pipeline 
Accuracy : 0.78  ||  Precison : 0.61  ||  Precision : 0.43  ||  F1 Score : 0.5
