In [40]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv

--2025-10-14 20:55:41--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv’


2025-10-14 20:55:41 (40.5 MB/s) - ‘course_lead_scoring.csv’ saved [80876/80876]



# Data preparation
Check if the missing values are presented in the features.
- If there are missing values:
- For categorical features, replace them with 'NA'
- For numerical features, replace with with 0.0

In [41]:
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv('../03-classification/course_lead_scoring.csv')

In [42]:
print(df.columns)

Index(['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
       'employment_status', 'location', 'interaction_count', 'lead_score',
       'converted'],
      dtype='object')


In [43]:
print("Missing values per column before filling:")
print(df.isnull().sum())

Missing values per column before filling:
lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64


In [49]:
# Fill missing values
cat_cols = df.select_dtypes(include=['object']).columns
num_cols = df.select_dtypes(include=['number']).columns

print('Categorical columns:', cat_cols)
print('Numerical of Cloumns:', num_cols)



Categorical columns: Index(['lead_source', 'industry', 'employment_status', 'location'], dtype='object')
Numerical of Cloumns: Index(['number_of_courses_viewed', 'annual_income', 'interaction_count',
       'lead_score', 'converted'],
      dtype='object')


In [50]:
df[cat_cols] = df[cat_cols].fillna('NA')
df[num_cols] = df[num_cols].fillna(0.0)

print("\nMissing values per column after filling:")
print(df.isnull().sum())


Missing values per column after filling:
lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64


In [51]:
print(df.isnull().sum())

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64


# Question 1
What is the most frequent observation (mode) for the column industry?

In [52]:
mode_industry = df['industry'].mode()[0]
print("Mode of 'industry':", mode_industry)


Mode of 'industry': retail


# Question 2
Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.

What are the two features that have the biggest correlation?


In [54]:
numeric_cols = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']
corr_matrix = df[numeric_cols].corr()
print("Correlation matrix:\n", corr_matrix)

pairs = {
    'interaction_count and lead_score': corr_matrix.loc['interaction_count', 'lead_score'],
    'number_of_courses_viewed and lead_score': corr_matrix.loc['number_of_courses_viewed', 'lead_score'],
    'number_of_courses_viewed and interaction_count': corr_matrix.loc['number_of_courses_viewed', 'interaction_count'],
    'annual_income and interaction_count': corr_matrix.loc['annual_income', 'interaction_count']
}

strongest_pair = max(pairs, key=lambda k: abs(pairs[k]))
print("Strongest correlation pair:", strongest_pair, pairs[strongest_pair])

Correlation matrix:
                           number_of_courses_viewed  annual_income  \
number_of_courses_viewed                  1.000000       0.009770   
annual_income                             0.009770       1.000000   
interaction_count                        -0.023565       0.027036   
lead_score                               -0.004879       0.015610   

                          interaction_count  lead_score  
number_of_courses_viewed          -0.023565   -0.004879  
annual_income                      0.027036    0.015610  
interaction_count                  1.000000    0.009888  
lead_score                         0.009888    1.000000  
Strongest correlation pair: annual_income and interaction_count 0.02703647240481443


# Split the data
- Split your data in train/val/test sets with 60%/20%/20% distribution.
- Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
- Make sure that the target value converted is not in your dataframe.

In [56]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=['converted'])
y = df['converted']

# 60% train, 20% val, 20% test split
X_train_full, X_temp, y_train_full, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print(f"Train size: {len(X_train_full)}, Validation size: {len(X_val)}, Test size: {len(X_test)}")

Train size: 877, Validation size: 292, Test size: 293


# Question 3
- Calculate the mutual information score between converted and other categorical variables in the dataset. Use the training set only.
- Round the scores to 2 decimals using round(score, 2).

Which of these variables has the biggest mutual information score?

In [58]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_classif
import numpy as np

cat_features = ['industry', 'location', 'lead_source', 'employment_status']
X_train_enc = X_train_full.copy()
for col in cat_features:
    le = LabelEncoder()
    X_train_enc[col] = le.fit_transform(X_train_enc[col].astype(str))

mi_scores = mutual_info_classif(X_train_enc[cat_features], y_train_full)
mi_dict = dict(zip(cat_features, [round(score, 2) for score in mi_scores]))
print("Mutual information scores:")
print(mi_dict)

max_mi_feature = max(mi_dict, key=mi_dict.get)
print("Highest mutual information feature:", max_mi_feature)

Mutual information scores:
{'industry': np.float64(0.0), 'location': np.float64(0.01), 'lead_source': np.float64(0.0), 'employment_status': np.float64(0.01)}
Highest mutual information feature: location


# Question 4
- Now let's train a logistic regression.
- Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
- Fit the model on the training dataset.
- To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
- model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
- Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

In [64]:
import sklearn
print(sklearn.__version__)

1.7.0


In [65]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
train_cat = encoder.fit_transform(X_train_full[cat_features].fillna('NA'))
val_cat = encoder.transform(X_val[cat_features].fillna('NA'))

num_features = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']
X_train_num = X_train_full[num_features].fillna(0.0).values
X_val_num = X_val[num_features].fillna(0.0).values

X_train_final = np.concatenate([train_cat, X_train_num], axis=1)
X_val_final = np.concatenate([val_cat, X_val_num], axis=1)

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_final, y_train_full)
y_val_pred = model.predict(X_val_final)
val_accuracy = round(accuracy_score(y_val, y_val_pred), 2)
print("Validation accuracy:", val_accuracy)


Validation accuracy: 0.74


# Question 5
- Let's find the least useful feature using the feature elimination technique.
- Train a model using the same features and parameters as in Q4 (without rounding).
- Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
- For each feature, calculate the difference between the original accuracy and the accuracy without the feature.

  Which of following feature has the smallest difference?

In [66]:
features_all = cat_features + num_features
accuracy_diffs = {}

for feature in features_all:
    drop_cat = [f for f in cat_features if f != feature]
    drop_num = [f for f in num_features if f != feature]

    train_cat_drop = encoder.fit_transform(X_train_full[drop_cat].fillna('NA')) if drop_cat else np.empty((len(X_train_full), 0))
    val_cat_drop = encoder.transform(X_val[drop_cat].fillna('NA')) if drop_cat else np.empty((len(X_val), 0))

    train_num_drop = X_train_full[drop_num].fillna(0.0).values if drop_num else np.empty((len(X_train_full), 0))
    val_num_drop = X_val[drop_num].fillna(0.0).values if drop_num else np.empty((len(X_val), 0))

    X_train_drop = np.concatenate([train_cat_drop, train_num_drop], axis=1)
    X_val_drop = np.concatenate([val_cat_drop, val_num_drop], axis=1)

    model_drop = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_drop.fit(X_train_drop, y_train_full)
    y_pred_drop = model_drop.predict(X_val_drop)
    accuracy_drop = accuracy_score(y_val, y_pred_drop)

    accuracy_diffs[feature] = val_accuracy - accuracy_drop

print("Accuracy differences when excluding each feature:")
print(accuracy_diffs)

min_diff_feature = min(accuracy_diffs, key=lambda k: abs(accuracy_diffs[k]))
print("Feature with smallest difference:", min_diff_feature)

Accuracy differences when excluding each feature:
{'industry': -0.003150684931506831, 'location': -0.003150684931506831, 'lead_source': 0.010547945205479414, 'employment_status': -0.00657534246575342, 'number_of_courses_viewed': 0.06191780821917803, 'annual_income': -0.11616438356164382, 'interaction_count': 0.06534246575342462, 'lead_score': -0.003150684931506831}
Feature with smallest difference: industry


# Question 6
- Now let's train a regularized logistic regression.
- Let's try the following values of the parameter C: [0.01, 0.1, 1, 10, 100].
- Train models using all the features as in Q4.
- Calculate the accuracy on the validation dataset and round it to 3 decimal digits.

Which of these C leads to the best accuracy on the validation set?

In [67]:
C_values = [0.01, 0.1, 1, 10, 100]
accuracy_per_C = {}

for C in C_values:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train_final, y_train_full)
    y_val_pred = model.predict(X_val_final)
    accuracy_per_C[C] = round(accuracy_score(y_val, y_val_pred), 3)

print("Validation accuracies for different C values:")
print(accuracy_per_C)

best_C = max(accuracy_per_C, key=accuracy_per_C.get)
print("Best C value:", best_C, "with accuracy:", accuracy_per_C[best_C])


Validation accuracies for different C values:
{0.01: 0.743, 0.1: 0.743, 1: 0.743, 10: 0.743, 100: 0.743}
Best C value: 0.01 with accuracy: 0.743
