In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv')

In [3]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [4]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [5]:
categorical = ['location', 'employment_status', 'lead_source', 'industry']
numerical = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

In [6]:
# Fill missing values
df[categorical] = df[categorical].fillna('NA')   # categorical → 'NA'
df[numerical]   = df[numerical].fillna(0.0)      # numerical → 0.0


In [7]:
df.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
df_train_full, df_test = train_test_split(df, test_size=0.25, random_state= 42)
df_train, df_val = train_test_split(df_train_full, test_size=0.33, random_state=42)

y_train = df_train.converted.values
y_val = df_val.converted.values

In [10]:
del df_train['converted']
del df_val['converted']

In [11]:
global_conversion = df_train_full.converted.mean()
global_conversion

0.6049270072992701

In [12]:
from sklearn.metrics import mutual_info_score

In [13]:
df_train_full[categorical] = df_train_full[categorical].fillna('NaN')
def calculate_mi(series):
    return mutual_info_score(series, df_train_full.converted)      
df_mi = df_train_full[categorical].apply(calculate_mi)   
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')   
df_mi

Unnamed: 0,MI
lead_source,0.026506
employment_status,0.01423
industry,0.011724
location,0.00352


In [14]:
train_dict = df_train[categorical + numerical].to_dict(orient='records')
val_dict = df_val[categorical + numerical].to_dict(orient='records')

In [15]:
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

X_train = dv.transform(train_dict)
X_val= dv.transform(val_dict)

In [16]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [17]:
model.predict_proba(X_val)

array([[0.10673967, 0.89326033],
       [0.37619142, 0.62380858],
       [0.53489082, 0.46510918],
       [0.49218551, 0.50781449],
       [0.21997001, 0.78002999],
       [0.290135  , 0.709865  ],
       [0.19635677, 0.80364323],
       [0.16519765, 0.83480235],
       [0.48534246, 0.51465754],
       [0.52484825, 0.47515175],
       [0.04899321, 0.95100679],
       [0.29202594, 0.70797406],
       [0.55179413, 0.44820587],
       [0.63637282, 0.36362718],
       [0.34075268, 0.65924732],
       [0.18940397, 0.81059603],
       [0.73510634, 0.26489366],
       [0.14808097, 0.85191903],
       [0.24228811, 0.75771189],
       [0.18570711, 0.81429289],
       [0.67187972, 0.32812028],
       [0.29168334, 0.70831666],
       [0.2559343 , 0.7440657 ],
       [0.52519066, 0.47480934],
       [0.10948868, 0.89051132],
       [0.6883047 , 0.3116953 ],
       [0.32973638, 0.67026362],
       [0.48194484, 0.51805516],
       [0.48828909, 0.51171091],
       [0.49444977, 0.50555023],
       [0.

In [33]:
y_pred = model.predict_proba(X_val)[:, 1]

In [34]:
converted = y_pred > 0.5 
(y_val == converted).mean()

0.7348066298342542

In [37]:
from sklearn.metrics import roc_auc_score, roc_curve

In [38]:
auc_val = roc_auc_score(y_val, y_pred)
print("Validation AUC:", auc_val)

Validation AUC: 0.8250937391687935


In [46]:
thresholds = np.arange(0.0, 1.01, 0.01)  # 0.00 to 1.00 inclusive
results = []

for t in thresholds:
    y_pred = (y_pred >= t).astype(int)  # apply threshold
    precision = precision_score(y_val, y_pred, zero_division=0)
    recall = recall_score(y_val, y_pred, zero_division=0)
    auc = roc_auc_score(y_val, y_pred)  # AUC of the hard classification at this threshold
    results.append((t, precision, recall, auc))

# Display the results
for t, p, r, a in results:
    print(f"Threshold={t:.2f} | Precision={p:.4f} | Recall={r:.4f} | AUC={a:.4f}")

Threshold=0.00 | Precision=0.5884 | Recall=1.0000 | AUC=0.5000
Threshold=0.01 | Precision=0.5884 | Recall=1.0000 | AUC=0.5000
Threshold=0.02 | Precision=0.5884 | Recall=1.0000 | AUC=0.5000
Threshold=0.03 | Precision=0.5884 | Recall=1.0000 | AUC=0.5000
Threshold=0.04 | Precision=0.5884 | Recall=1.0000 | AUC=0.5000
Threshold=0.05 | Precision=0.5884 | Recall=1.0000 | AUC=0.5000
Threshold=0.06 | Precision=0.5884 | Recall=1.0000 | AUC=0.5000
Threshold=0.07 | Precision=0.5884 | Recall=1.0000 | AUC=0.5000
Threshold=0.08 | Precision=0.5884 | Recall=1.0000 | AUC=0.5000
Threshold=0.09 | Precision=0.5884 | Recall=1.0000 | AUC=0.5000
Threshold=0.10 | Precision=0.5884 | Recall=1.0000 | AUC=0.5000
Threshold=0.11 | Precision=0.5884 | Recall=1.0000 | AUC=0.5000
Threshold=0.12 | Precision=0.5884 | Recall=1.0000 | AUC=0.5000
Threshold=0.13 | Precision=0.5884 | Recall=1.0000 | AUC=0.5000
Threshold=0.14 | Precision=0.5884 | Recall=1.0000 | AUC=0.5000
Threshold=0.15 | Precision=0.5884 | Recall=1.0000 | AUC

In [41]:
precision

0.7052631578947368

In [47]:
from sklearn.metrics import precision_score, recall_score, f1_score

thresholds = [i/100 for i in range(0, 101)]

for t in thresholds:
    y_pred = (y_pred >= t).astype(int)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    print(f"Threshold={t:.2f} | Precision={precision:.4f} | Recall={recall:.4f} | F1={f1:.4f}")


Threshold=0.00 | Precision=0.5884 | Recall=1.0000 | F1=0.7409
Threshold=0.01 | Precision=0.5884 | Recall=1.0000 | F1=0.7409
Threshold=0.02 | Precision=0.5884 | Recall=1.0000 | F1=0.7409
Threshold=0.03 | Precision=0.5884 | Recall=1.0000 | F1=0.7409
Threshold=0.04 | Precision=0.5884 | Recall=1.0000 | F1=0.7409
Threshold=0.05 | Precision=0.5884 | Recall=1.0000 | F1=0.7409
Threshold=0.06 | Precision=0.5884 | Recall=1.0000 | F1=0.7409
Threshold=0.07 | Precision=0.5884 | Recall=1.0000 | F1=0.7409
Threshold=0.08 | Precision=0.5884 | Recall=1.0000 | F1=0.7409
Threshold=0.09 | Precision=0.5884 | Recall=1.0000 | F1=0.7409
Threshold=0.10 | Precision=0.5884 | Recall=1.0000 | F1=0.7409
Threshold=0.11 | Precision=0.5884 | Recall=1.0000 | F1=0.7409
Threshold=0.12 | Precision=0.5884 | Recall=1.0000 | F1=0.7409
Threshold=0.13 | Precision=0.5884 | Recall=1.0000 | F1=0.7409
Threshold=0.14 | Precision=0.5884 | Recall=1.0000 | F1=0.7409
Threshold=0.15 | Precision=0.5884 | Recall=1.0000 | F1=0.7409
Threshol

In [50]:
kf = KFold(n_splits=5, shuffle=True, random_state=1)

scores = []

for train_index, val_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]  # ✅ no iloc

    model = LogisticRegression()
    model.fit(X_train_fold, y_train_fold)

    y_pred = model.predict_proba(X_val_fold)[:, 1]
    auc = roc_auc_score(y_val_fold, y_pred)
    scores.append(auc)

print("Scores:", scores)
print("Mean AUC:", np.mean(scores))
print("Standard Deviation:", np.std(scores))


Scores: [0.8871647509578543, 0.9159311562224184, 0.8990253411306043, 0.898497688751926, 0.893483231707317]
Mean AUC: 0.8988204337540239
Standard Deviation: 0.009563091988390284


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to sca

In [52]:
from sklearn.model_selection import KFold, cross_val_score

C_values = [0.000001, 0.001, 1]

kf = KFold(n_splits=5, shuffle=True, random_state=42)

for C in C_values:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000)
    scores = cross_val_score(model, X_train, y_train, cv=kf)
    print(f"C={C}  Mean={scores.mean():.3f}  Std={scores.std():.3f}")


C=1e-06  Mean=0.613  Std=0.051
C=0.001  Mean=0.621  Std=0.055
C=1  Mean=0.745  Std=0.032
