# ML Zoomcamp Homework 3: Classification

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mutual_info_score, accuracy_score

## Data Preparation

Download the dataset from:
https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv

In [2]:
df = pd.read_csv('data/course_lead_scoring.csv')
df.columns = df.columns.str.lower().str.replace(' ', '_')

categorical_cols = list(df.dtypes[df.dtypes == 'object'].index)
numerical_cols = list(df.dtypes[df.dtypes != 'object'].index)

for col in categorical_cols:
    df[col] = df[col].fillna('NA')

for col in numerical_cols:
    df[col] = df[col].fillna(0.0)

## Question 1

What is the most frequent value in the `industry` column after handling missing data?

In [3]:
df['industry'].mode()[0]

'retail'

## Question 2

Build a correlation matrix for numerical features. Which pair shows the strongest correlation?

In [27]:
# Get numerical columns (excluding the target 'converted')
numerical_features = [col for col in numerical_cols if col != 'converted']
correlation_matrix = df[numerical_features].corr()
print(correlation_matrix)

                          number_of_courses_viewed  annual_income  \
number_of_courses_viewed                  1.000000       0.009770   
annual_income                             0.009770       1.000000   
interaction_count                        -0.023565       0.027036   
lead_score                               -0.004879       0.015610   

                          interaction_count  lead_score  
number_of_courses_viewed          -0.023565   -0.004879  
annual_income                      0.027036    0.015610  
interaction_count                  1.000000    0.009888  
lead_score                         0.009888    1.000000  


In [28]:
# Check specific pairs mentioned in the question
print(f"interaction_count & lead_score: {correlation_matrix.loc['interaction_count', 'lead_score']:.3f}")
print(f"number_of_courses_viewed & lead_score: {correlation_matrix.loc['number_of_courses_viewed', 'lead_score']:.3f}")
print(f"number_of_courses_viewed & interaction_count: {correlation_matrix.loc['number_of_courses_viewed', 'interaction_count']:.3f}")
print(f"annual_income & interaction_count: {correlation_matrix.loc['annual_income', 'interaction_count']:.3f}")

interaction_count & lead_score: 0.010
number_of_courses_viewed & lead_score: -0.005
number_of_courses_viewed & interaction_count: -0.024
annual_income & interaction_count: 0.027


## Question 3

Calculate mutual information scores between `converted` and categorical variables. Which categorical variable has the highest mutual information score?

In [29]:
# Split the data first (60/20/20)
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)  # 0.25 * 0.8 = 0.2

print(f"Train set size: {len(df_train)}")
print(f"Validation set size: {len(df_val)}")
print(f"Test set size: {len(df_test)}")

Train set size: 876
Validation set size: 293
Test set size: 293


In [30]:
# Calculate mutual information for categorical variables using training data only
categorical_features = ['industry', 'location', 'lead_source', 'employment_status']

y_train = df_train['converted']

mi_scores = {}
for col in categorical_features:
    mi = mutual_info_score(df_train[col], y_train)
    mi_scores[col] = round(mi, 2)
    print(f"{col}: {mi_scores[col]}")

# Find the highest scoring variable
best_feature = max(mi_scores, key=mi_scores.get)
print(f"\nHighest mutual information: {best_feature} with score {mi_scores[best_feature]}")

industry: 0.01
location: 0.0
lead_source: 0.04
employment_status: 0.01

Highest mutual information: lead_source with score 0.04


## Question 4

Train logistic regression with one-hot encoding. What is the validation accuracy?

In [31]:
# Prepare features for training
features = categorical_features + numerical_features
features = [f for f in features if f != 'converted']  # Remove target if present

# Convert to dictionaries for DictVectorizer
train_dicts = df_train[features].to_dict(orient='records')
val_dicts = df_val[features].to_dict(orient='records')

# One-hot encoding
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

# Prepare target variables
y_train = df_train['converted'].values
y_val = df_val['converted'].values

# Train logistic regression
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Calculate validation accuracy
y_pred_val = model.predict(X_val)
accuracy_val = accuracy_score(y_val, y_pred_val)
print(f"Validation accuracy: {round(accuracy_val, 2)}")

Validation accuracy: 0.7


## Question 5

Perform feature elimination testing. Which feature has the smallest accuracy difference when removed?

In [32]:
# Baseline accuracy (from Question 4)
baseline_accuracy = accuracy_val
print(f"Baseline accuracy: {round(baseline_accuracy, 2)}")

# Test features to eliminate
features_to_test = ['industry', 'employment_status', 'lead_score']

accuracy_differences = {}

for feature_to_remove in features_to_test:
    # Create feature list without the current feature
    features_subset = [f for f in features if f != feature_to_remove]
    
    # Convert to dictionaries
    train_dicts_subset = df_train[features_subset].to_dict(orient='records')
    val_dicts_subset = df_val[features_subset].to_dict(orient='records')
    
    # One-hot encoding
    dv_subset = DictVectorizer(sparse=False)
    X_train_subset = dv_subset.fit_transform(train_dicts_subset)
    X_val_subset = dv_subset.transform(val_dicts_subset)
    
    # Train model
    model_subset = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_subset.fit(X_train_subset, y_train)
    
    # Calculate accuracy
    y_pred_subset = model_subset.predict(X_val_subset)
    accuracy_subset = accuracy_score(y_val, y_pred_subset)
    
    # Calculate difference
    diff = baseline_accuracy - accuracy_subset
    accuracy_differences[feature_to_remove] = diff
    
    print(f"Without {feature_to_remove}: accuracy = {round(accuracy_subset, 2)}, difference = {round(diff, 3)}")

# Find feature with smallest difference
smallest_diff_feature = min(accuracy_differences, key=accuracy_differences.get)
print(f"\nFeature with smallest difference: {smallest_diff_feature} ({round(accuracy_differences[smallest_diff_feature], 3)})")

Baseline accuracy: 0.7
Without industry: accuracy = 0.7, difference = 0.0
Without employment_status: accuracy = 0.7, difference = 0.003
Without lead_score: accuracy = 0.71, difference = -0.007

Feature with smallest difference: lead_score (-0.007)


## Question 6

Test regularization by training models with different C values. Which C produces the best validation accuracy?

In [33]:
# Test different C values
c_values = [0.01, 0.1, 1, 10, 100]

results = {}

for c in c_values:
    # Train model with current C value
    model_c = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model_c.fit(X_train, y_train)
    
    # Calculate validation accuracy
    y_pred_c = model_c.predict(X_val)
    accuracy_c = accuracy_score(y_val, y_pred_c)
    
    results[c] = accuracy_c
    print(f"C = {c}: accuracy = {round(accuracy_c, 3)}")

# Find best C value
best_c = max(results, key=results.get)
best_accuracy = results[best_c]

# Check for ties and select smallest C if tied
best_cs = [c for c, acc in results.items() if acc == best_accuracy]
final_c = min(best_cs)

print(f"\nBest C value: {final_c} with accuracy {round(best_accuracy, 3)}")

C = 0.01: accuracy = 0.7
C = 0.1: accuracy = 0.7
C = 1: accuracy = 0.7
C = 10: accuracy = 0.7
C = 100: accuracy = 0.7

Best C value: 0.01 with accuracy 0.7
