#### Module-3 Churn Classification

In [2]:
# Import library
import pandas as pd
import numpy as np

In [7]:
df = pd.read_csv('/workspaces/machine_learning_zoomcamp/bank-full.csv', sep=';')
# Display the first few rows to understand the structure of the dataset
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


#### Question-1

In [8]:
education_mode = df['education'].mode()[0]
education_mode

'secondary'

#### Question-2

In [11]:
correlation_matrix = df.select_dtypes(include='number').corr()

correlation_matrix_unstacked = correlation_matrix.unstack()
correlation_matrix_unstacked = correlation_matrix_unstacked[correlation_matrix_unstacked < 1.0]
highest_correlation_pair = correlation_matrix_unstacked.idxmax(), correlation_matrix_unstacked.max()
highest_correlation_pair

(('pdays', 'previous'), np.float64(0.4548196354805043))

#### Question-3

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder

df['y'] = df['y'].apply(lambda x: 1 if x == 'yes' else 0)

train_full, test = train_test_split(df, test_size=0.2, random_state=42, stratify=df['y'])
train, val = train_test_split(train_full, test_size=0.25, random_state=42, stratify=train_full['y'])

X_train = train.drop(columns=['y'])
y_train = train['y']

X_train_encoded = X_train.copy()
categorical_features = X_train_encoded.select_dtypes(include=['object']).columns

for column in categorical_features:
    le = LabelEncoder()
    X_train_encoded[column] = le.fit_transform(X_train_encoded[column].astype(str))

mi_scores = mutual_info_classif(X_train_encoded[categorical_features], y_train)
mi_scores_rounded = {cat: round(score, 2) for cat, score in zip(categorical_features, mi_scores)}

highest_mi_variable = max(mi_scores_rounded, key=mi_scores_rounded.get), mi_scores_rounded[max(mi_scores_rounded, key=mi_scores_rounded.get)]
highest_mi_variable

('poutcome', np.float64(0.03))

#### Question-4

In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer

categorical_cols = X_train.select_dtypes(include=['object']).columns
preprocessor = ColumnTransformer([('onehot', OneHotEncoder(drop='first'), categorical_cols)], remainder='passthrough')

model = make_pipeline(preprocessor, LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42))

model.fit(X_train, y_train)

X_val = val.drop(columns=['y'])
y_val = val['y']
y_pred = model.predict(X_val)

val_accuracy = round(accuracy_score(y_val, y_pred), 2)
val_accuracy

0.9

#### Question-5

In [37]:
accuracy_differences_individual = {}

model_all_features = make_pipeline(preprocessor, LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42))
model_all_features.fit(X_train, y_train)
val_accuracy_all_features = accuracy_score(y_val, model_all_features.predict(X_val))

for feature in X_train.columns:
    X_train_feature_excluded = X_train.drop(columns=[feature])
    X_val_feature_excluded = X_val.drop(columns=[feature])
    
    categorical_cols_excluded = X_train_feature_excluded.select_dtypes(include=['object']).columns
    preprocessor_excluded = ColumnTransformer([('onehot', OneHotEncoder(drop='first'), categorical_cols_excluded)], remainder='passthrough')
    
    model_feature_excluded = make_pipeline(
        preprocessor_excluded,
        LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    )
    model_feature_excluded.fit(X_train_feature_excluded, y_train)
    
    # Calculate accuracy without the feature
    feature_excluded_accuracy = accuracy_score(y_val, model_feature_excluded.predict(X_val_feature_excluded))
    
    accuracy_differences_individual[feature] = round(val_accuracy_all_features - feature_excluded_accuracy, 2)

accuracy_differences_individual

{'age': -0.0,
 'job': 0.0,
 'marital': -0.0,
 'education': 0.0,
 'default': -0.0,
 'balance': 0.0,
 'housing': 0.0,
 'loan': -0.0,
 'contact': -0.0,
 'day': -0.0,
 'month': 0.0,
 'duration': 0.01,
 'campaign': -0.0,
 'pdays': 0.0,
 'previous': -0.0,
 'poutcome': 0.01}

Based on the result it might be the age factor

#### Question-6

In [38]:
c_values = [0.01, 0.1, 1, 10, 100]
accuracies = {}

for c in c_values:
    model = make_pipeline(
        preprocessor,
        LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    )
    model.fit(X_train, y_train)
    
    # Calculate accuracy on the validation set and store it rounded to 3 decimal places
    val_accuracy = round(accuracy_score(y_val, model.predict(X_val)), 3)
    accuracies[c] = val_accuracy

# Finding the C value that leads to the best accuracy
best_c = max(accuracies, key=accuracies.get), accuracies[max(accuracies, key=accuracies.get)]
best_c

(1, 0.902)