In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score


In [1]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv

--2025-10-21 13:22:13--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv’


2025-10-21 13:22:13 (34.0 MB/s) - ‘course_lead_scoring.csv’ saved [80876/80876]



# Data preparation
Check whether the features contain missing values.
- If there are missing values:
  - For categorical features, replace them with 'NA'
  - For numerical features, replace them with 0.0

In [2]:
import pandas as pd
import numpy as np

# Load the dataset from the working directory, which is where the wget cell
# saves it. The previous path ('../03-classification/...') is not where the
# download lands, so the notebook could not be re-run from a fresh checkout.
df = pd.read_csv('course_lead_scoring.csv')

In [5]:
# Peek at the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [6]:
# Per-column count of NaN entries before any imputation is applied.
print("Missing values per column before filling:", df.isnull().sum(), sep="\n")

Missing values per column before filling:
lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64


In [7]:
# Split the columns by dtype so each group can be imputed with its own fill
# value in the next cell (this cell only identifies the groups).
cat_cols = df.select_dtypes(include=['object']).columns
num_cols = df.select_dtypes(include=['number']).columns

# NOTE: num_cols also contains the target column 'converted' because it is
# numeric; it has no missing values here, so imputing it is a no-op.
print('Categorical columns:', cat_cols)
print('Numerical columns:', num_cols)



Categorical columns: Index(['lead_source', 'industry', 'employment_status', 'location'], dtype='object')
Numerical of Cloumns: Index(['number_of_courses_viewed', 'annual_income', 'interaction_count',
       'lead_score', 'converted'],
      dtype='object')


In [8]:
# Impute in place: categorical gaps become the literal 'NA', numeric gaps 0.0.
for cols, fill_value in ((cat_cols, 'NA'), (num_cols, 0.0)):
    df[cols] = df[cols].fillna(fill_value)

# Confirm that no column has NaN entries left.
remaining_missing = df.isnull().sum()
print("\nMissing values per column after filling:")
print(remaining_missing)


Missing values per column after filling:
lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64


In [13]:
# Display the frame after imputation (Jupyter renders a truncated head/tail view).
df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,0.0,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


In [16]:
# List the column names available for feature selection.
print(df.columns)

Index(['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
       'employment_status', 'location', 'interaction_count', 'lead_score',
       'converted'],
      dtype='object')


In [17]:
from sklearn.model_selection import train_test_split

# Numeric predictors used by the model, and the binary label column.
features = [
    'lead_score',
    'number_of_courses_viewed',
    'interaction_count',
    'annual_income',
]
target = 'converted'

X, y = df[features], df[target]

# Two-stage split into train (60%), val (20%), test (20%):
# first carve off 20% as the test set, then take 25% of the remaining 80%
# (i.e. 20% of the total) as the validation set.
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=1)

print(f'Train: {X_train.shape} Validation: {X_val.shape} Test: {X_test.shape}')


Train: (876, 4) Validation: (293, 4) Test: (293, 4)


In [18]:
from sklearn.metrics import roc_auc_score

features = ['lead_score', 'number_of_courses_viewed', 'interaction_count', 'annual_income']

# Treat each raw numeric column as if it were a classifier score and measure
# how well it alone ranks the validation labels.
for feature, values in X_val[features].items():
    auc = roc_auc_score(y_val, values)
    print(f'ROC AUC for {feature}: {auc}')


ROC AUC for lead_score: 0.6231904898859169
ROC AUC for number_of_courses_viewed: 0.7422107180519606
ROC AUC for interaction_count: 0.6972006519029814
ROC AUC for annual_income: 0.5543572044866264


In [19]:
from sklearn.linear_model import LogisticRegression

# Fit logistic regression on the training split (fit() returns the estimator),
# then score the validation split by the ROC AUC of the class-1 probability.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
val_probabilities = model.predict_proba(X_val)
y_pred_proba = val_probabilities[:, 1]
auc = roc_auc_score(y_val, y_pred_proba)
print(f'Model AUC: {auc}')


Model AUC: 0.8835682101428435


In [20]:
from sklearn.metrics import precision_score, recall_score

# Hard 0/1 predictions at the 0.5 probability cut-off.
y_pred = (y_pred_proba >= 0.5).astype(int)

precision, recall = precision_score(y_val, y_pred), recall_score(y_val, y_pred)
print(f'Precision: {precision}\nRecall: {recall}')


Precision: 0.8097826086956522
Recall: 0.8713450292397661


In [21]:
import numpy as np
from sklearn.metrics import precision_score, recall_score

# Sweep decision thresholds 0.00, 0.01, ..., 1.00 and record precision/recall
# of the thresholded validation predictions at each one.
thresholds = np.arange(0.0, 1.01, 0.01)
precisions = []
recalls = []
has_positive_pred = []  # False where the threshold yields no positive predictions

for t in thresholds:
    y_pred = (y_pred_proba >= t).astype(int)
    # zero_division=0 keeps the same numeric value scikit-learn already used
    # (0.0) when nothing is predicted positive, but without the
    # UndefinedMetricWarning that previously leaked into the cell output.
    precisions.append(precision_score(y_val, y_pred, zero_division=0))
    recalls.append(recall_score(y_val, y_pred, zero_division=0))
    has_positive_pred.append(bool(y_pred.any()))

# Find threshold where precision and recall are closest.
# Thresholds with no positive predictions have precision == recall == 0, which
# is a degenerate "intersection" (difference exactly 0) rather than a real
# crossing of the curves, so they are excluded from the search.
diffs = np.abs(np.array(precisions) - np.array(recalls))
diffs = np.where(has_positive_pred, diffs, np.inf)
intersection_index = diffs.argmin()
intersection_threshold = thresholds[intersection_index]

# Two decimals hide np.arange float noise (e.g. 0.5700000000000001).
print(f"Precision and recall curves intersect at threshold: {intersection_threshold:.2f}")


Precision and recall curves intersect at threshold: 0.5700000000000001


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


# Question 4: F1 score
Precision and recall are conflicting: when one grows, the other tends to go down. That's why they are often combined into the F1 score, a metric that takes both into account.

This is the formula for computing F1:

$$F_1 = 2 \cdot \frac{P \cdot R}{P + R}$$

where $P$ is precision and $R$ is recall.

Let's compute F1 for all thresholds from 0.0 to 1.0 with an increment of 0.01.
At which threshold is F1 maximal?


In [23]:
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

thresholds = np.arange(0, 1.01, 0.01)

# F1 of the thresholded validation predictions, one entry per threshold.
f1_scores = []
for t in thresholds:
    y_pred = (y_pred_proba >= t).astype(int)
    f1_scores.append(f1_score(y_val, y_pred))

# argmax returns the first maximum, i.e. the lowest threshold among ties.
best_idx = int(np.argmax(f1_scores))
max_f1 = f1_scores[best_idx]
best_threshold = thresholds[best_idx]

print(f"Max F1 score: {max_f1:.3f} at threshold: {best_threshold:.2f}")


Max F1 score: 0.849 at threshold: 0.44


# Question 5: 5-Fold CV

In [24]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

# 5-fold cross-validation of logistic regression (liblinear, C=1.0), reporting
# the spread of the per-fold validation ROC AUC.
kf = KFold(n_splits=5, shuffle=True, random_state=1)
auc_scores = []

# NOTE(review): the folds are drawn from the whole `df`, so they include the
# rows that the earlier split set aside as X_test -- confirm this is intended.
X_full = df[features].to_numpy()
y_full = df[target].to_numpy()

for train_index, val_index in kf.split(X_full):
    # Fold-local names: the loop must not overwrite X_train / X_val / y_train /
    # y_val / model / y_pred_proba from the hold-out cells above, otherwise
    # re-running those cells would silently operate on the last CV fold.
    X_fold_train, X_fold_val = X_full[train_index], X_full[val_index]
    y_fold_train, y_fold_val = y_full[train_index], y_full[val_index]

    fold_model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
    fold_model.fit(X_fold_train, y_fold_train)
    fold_pred_proba = fold_model.predict_proba(X_fold_val)[:, 1]

    auc_scores.append(roc_auc_score(y_fold_val, fold_pred_proba))

std_dev = np.std(auc_scores)
print(f'Standard Deviation of AUC Scores: {std_dev}')


Standard Deviation of AUC Scores: 0.027659854572335166


In [25]:

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

# Tune the regularisation strength C with 5-fold cross-validation: for each
# candidate C, collect the per-fold validation ROC AUC and summarise it by its
# mean and standard deviation (both rounded to 3 decimals).
kf = KFold(n_splits=5, shuffle=True, random_state=1)
C_values = [0.000001, 0.001, 1]

# NOTE(review): the folds are drawn from the whole `df`, so they include the
# rows that the earlier split set aside as X_test -- confirm this is intended.
X_full = df[features].to_numpy()
y_full = df[target].to_numpy()

results = []

for C in C_values:
    auc_scores = []
    for train_idx, val_idx in kf.split(X_full):
        # Fold-local names: do not overwrite X_train / X_val / y_train / y_val /
        # model / y_pred_proba from the hold-out cells above, otherwise
        # re-running those cells would silently operate on the last CV fold.
        X_fold_train, X_fold_val = X_full[train_idx], X_full[val_idx]
        y_fold_train, y_fold_val = y_full[train_idx], y_full[val_idx]

        fold_model = LogisticRegression(solver='liblinear', C=C, max_iter=1000)
        fold_model.fit(X_fold_train, y_fold_train)
        fold_pred_proba = fold_model.predict_proba(X_fold_val)[:, 1]
        auc_scores.append(roc_auc_score(y_fold_val, fold_pred_proba))

    mean_auc = np.round(np.mean(auc_scores), 3)
    std_auc = np.round(np.std(auc_scores), 3)
    results.append((C, mean_auc, std_auc))

for res in results:
    print(f'C={res[0]} Mean AUC={res[1]} STD={res[2]}')

# max() keeps the first maximum, so ties on the rounded mean AUC resolve to the
# smallest C (C_values is listed in ascending order).
best_C = max(results, key=lambda x: x[1])[0]
print(f'Best C: {best_C}')


C=1e-06 Mean AUC=0.549 STD=0.031
C=0.001 Mean AUC=0.853 STD=0.025
C=1 Mean AUC=0.807 STD=0.028
Best C: 0.001
