### Data Preparation

In [1]:
import pandas as pd


In [2]:
# Load the raw dataset into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path — consider a path
# relative to the notebook so the load works on other machines.
data_path = r'C:\Users\Owner\Desktop\csv data\Bank_Marketing.csv'
df = pd.read_csv(data_path)

In [4]:
# Preview the first five rows to sanity-check the load
df.head(n=5)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [6]:
# Count the missing (NaN) entries in every column
null_mask = df.isna()
missing_values = null_mask.sum()
print("Missing values per column:\n", missing_values)

Missing values per column:
 lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64


In [13]:
# Replace the missing values: the four text columns get the placeholder
# label 'NA', and annual_income gets 0.0.
fill_values = {
    "lead_source": 'NA',
    "industry": 'NA',
    "annual_income": 0.0,
    "employment_status": 'NA',
    "location": 'NA',
}
df = df.fillna(value=fill_values)


In [14]:
# Preview the first five rows after filling the missing values
df.head(n=5)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


### Question 1

In [17]:
# mode() returns a Series holding the most frequent value(s); keep the first entry
industry_modes = df["industry"].mode()
mode_value = industry_modes.iloc[0]
print("Most frequent industry:", mode_value)


Most frequent industry: retail


### Question 2

In [23]:
# Keep only the numeric columns of the frame
numerical = df.select_dtypes(include='number')
numerical.head(n=5)

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
0,1,79450.0,4,0.94,1
1,1,46992.0,1,0.8,0
2,5,78796.0,3,0.69,1
3,2,83843.0,1,0.87,0
4,3,85012.0,3,0.62,1


In [24]:

# Pairwise correlation between the numeric columns (DataFrame.corr with its
# default settings). The numeric-only frame built in the previous cell is
# named `numerical`; the earlier reference to `numerical_df` was never defined
# and fails with NameError on a fresh kernel.
correlation_matrix = numerical.corr()

print("Correlation Matrix:\n", correlation_matrix)

Correlation Matrix:
                           number_of_courses_viewed  annual_income  \
number_of_courses_viewed                  1.000000       0.009770   
annual_income                             0.009770       1.000000   
interaction_count                        -0.023565       0.027036   
lead_score                               -0.004879       0.015610   
converted                                 0.435914       0.053131   

                          interaction_count  lead_score  converted  
number_of_courses_viewed          -0.023565   -0.004879   0.435914  
annual_income                      0.027036    0.015610   0.053131  
interaction_count                  1.000000    0.009888   0.374573  
lead_score                         0.009888    1.000000   0.193673  
converted                          0.374573    0.193673   1.000000  


### Split data

In [25]:
# Expected sizes for a 60% / 20% / 20% train / validation / test partition
n = len(df)

# Validation and test each take 20% of the rows (truncated to an int);
# training receives whatever remains.
n_val = n_test = int(0.2 * n)
n_train = n - n_val - n_test

In [28]:
# Show the computed (train, validation, test) sizes
(n_train, n_val, n_test)

(878, 292, 292)

In [29]:
from sklearn.model_selection import train_test_split

# Target column vs. every other column
y = df['converted']
X = df.drop('converted', axis=1)

# Stage 1: keep 60% of the rows for training, hold out the remaining 40%
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)

# Stage 2: halve the hold-out into validation and test (about 20% of the data each)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)


### Question 3

In [30]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder


categorical_cols = ['industry', 'location', 'lead_source', 'employment_status']

# Integer-encode the categorical features (the encoder is re-fitted on every column)
X_cat = X_train[categorical_cols].apply(LabelEncoder().fit_transform)

# Encode target variable
y_encoded = LabelEncoder().fit_transform(y_train)

# Mutual information between each categorical feature and the target
mi_scores = mutual_info_classif(X_cat, y_encoded, discrete_features=True)

# Rounded scores are for display only
mi_results = {col: round(score, 2) for col, score in zip(categorical_cols, mi_scores)}
print("Mutual Information Scores:\n", mi_results)

# Pick the winner from the unrounded scores: after rounding to two decimals,
# distinct scores can collapse to the same value, and max() would then simply
# return whichever tied key comes first in the dict.
raw_scores = dict(zip(categorical_cols, mi_scores))
max_feature = max(raw_scores, key=raw_scores.get)
print("Feature with highest mutual information score:", max_feature)


Mutual Information Scores:
 {'industry': np.float64(0.02), 'location': np.float64(0.0), 'lead_source': np.float64(0.03), 'employment_status': np.float64(0.02)}
Feature with highest mutual information score: lead_source


### Question 4

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np

# Object-dtype columns are the ones treated as categorical
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()

# One-hot encode the categorical columns; all remaining columns pass through unchanged
one_hot = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[('cat', one_hot, categorical_cols)],
    remainder='passthrough',
)

# Chain the preprocessing with the logistic regression classifier
classifier = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', classifier)])

# Train on the training split, then score on the validation split
model.fit(X_train, y_train)
y_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print("Validation Accuracy:", round(accuracy, 2))


Validation Accuracy: 0.74


### Question 5

In [32]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

def _build_pipeline(cat_cols):
    """Return an unfitted pipeline: one-hot encode `cat_cols`, pass the other
    columns through, then fit a liblinear logistic regression (C=1.0)."""
    encoder = ColumnTransformer(
        transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)],
        remainder='passthrough'
    )
    return Pipeline(steps=[
        ('preprocessor', encoder),
        ('classifier', LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42))
    ])


# Step 1: baseline accuracy with every feature present
categorical_cols = X_train.select_dtypes(include='object').columns.tolist()

baseline_model = _build_pipeline(categorical_cols)
baseline_model.fit(X_train, y_train)
baseline_accuracy = accuracy_score(y_val, baseline_model.predict(X_val))

# Step 2: drop one feature at a time and record how much accuracy changes
features_to_test = ['industry', 'employment_status', 'lead_score']
accuracy_diffs = {}

for feature in features_to_test:
    # The dropped feature must also leave the list of columns to one-hot encode
    remaining_cats = [col for col in categorical_cols if col != feature]

    candidate = _build_pipeline(remaining_cats)
    candidate.fit(X_train.drop(columns=[feature]), y_train)
    candidate_accuracy = accuracy_score(
        y_val, candidate.predict(X_val.drop(columns=[feature]))
    )

    # Positive value: accuracy fell without the feature; negative: it rose
    accuracy_diffs[feature] = baseline_accuracy - candidate_accuracy

# Step 3: min() selects the smallest signed difference, i.e. the most negative one
least_impact_feature = min(accuracy_diffs, key=accuracy_diffs.get)
print("Accuracy differences:", accuracy_diffs)
print("Least useful feature:", least_impact_feature)


Accuracy differences: {'industry': 0.0, 'employment_status': -0.003424657534246589, 'lead_score': 0.0}
Least useful feature: employment_status


In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score

# Compare validation accuracy across several regularisation strengths C.
# NOTE(review): this cell sits under the "Question 5" heading but tunes C —
# it probably deserves its own heading.
C_values = [0.01, 0.1, 1, 10, 100]
accuracies = {}

# Object-dtype columns are the ones that get one-hot encoded
categorical_cols = X_train.select_dtypes(include='object').columns.tolist()

for C in C_values:
    # Fresh preprocessing + classifier for this value of C
    preprocessor = ColumnTransformer(
        transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)],
        remainder='passthrough'
    )
    classifier = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', classifier)])

    # Fit on train, score on validation, keep the accuracy rounded to 3 decimals
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    accuracies[C] = round(acc, 3)

# Best C = smallest C among those reaching the top (rounded) accuracy
top_accuracy = max(accuracies.values())
best_C = min(c for c, score in accuracies.items() if score == top_accuracy)

print("Validation Accuracies:", accuracies)
print("Best C value:", best_C)


Validation Accuracies: {0.01: 0.743, 0.1: 0.743, 1: 0.743, 10: 0.743, 100: 0.743}
Best C value: 0.01
