In [24]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score

In [2]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv

--2025-10-11 20:24:37--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv’


2025-10-11 20:24:37 (70.6 MB/s) - ‘course_lead_scoring.csv’ saved [80876/80876]



### Missing values and most frequent observation

In [2]:
# Load the downloaded CSV from the working directory into the main frame.
df = pd.read_csv(filepath_or_buffer='course_lead_scoring.csv')

In [3]:
# Show the frame transposed so each column reads as a row.
df.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1452,1453,1454,1455,1456,1457,1458,1459,1460,1461
lead_source,paid_ads,social_media,events,paid_ads,referral,events,social_media,social_media,referral,paid_ads,...,organic_search,paid_ads,referral,referral,social_media,referral,referral,paid_ads,referral,organic_search
industry,,retail,healthcare,retail,education,manufacturing,technology,,healthcare,other,...,retail,education,finance,finance,healthcare,manufacturing,technology,technology,,finance
number_of_courses_viewed,1,1,5,2,3,1,0,5,4,3,...,1,2,0,2,1,1,3,1,5,3
annual_income,79450.0,46992.0,78796.0,83843.0,85012.0,59904.0,51283.0,62975.0,38648.0,59866.0,...,49154.0,65742.0,75546.0,,45253.0,,65259.0,45688.0,71016.0,92855.0
employment_status,unemployed,employed,unemployed,,self_employed,,,student,unemployed,student,...,student,employed,,,,self_employed,student,student,self_employed,student
location,south_america,south_america,australia,australia,europe,africa,middle_east,europe,south_america,australia,...,africa,australia,asia,south_america,australia,north_america,europe,north_america,north_america,north_america
interaction_count,4,1,3,1,3,6,2,4,2,3,...,4,0,0,3,6,4,2,3,0,3
lead_score,0.94,0.8,0.69,0.87,0.62,0.83,0.57,0.62,0.86,0.43,...,0.37,0.57,0.1,0.47,0.24,0.53,0.24,0.02,0.25,0.41
converted,1,0,1,0,1,1,0,1,1,1,...,1,0,0,1,1,1,1,1,1,1


In [6]:
# Count missing values per column.
# NOTE(review): the execution counts suggest this cell was last run after the
# fill cell below it, which would explain all-zero counts in the saved output;
# run it before the fill to see the real gaps — confirm on a fresh kernel.
df.isna().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [5]:
# Missing categorical values become the literal category 'NA';
# a missing annual income becomes 0.0.
for col in ['lead_source', 'industry', 'employment_status', 'location']:
    df[col] = df[col].fillna('NA')
df['annual_income'] = df['annual_income'].fillna(0.0)

In [7]:
# Frequency of each industry value, most common first.
df['industry'].value_counts()

industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

In [8]:
# Inspect the dtype of every column; object columns are treated as categorical
# later in the notebook.
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [11]:
# Names of the numeric feature columns used for correlation and modelling.
numerical = [
    'number_of_courses_viewed',
    'annual_income',
    'interaction_count',
    'lead_score',
]

In [13]:
# Correlation matrix for interaction_count vs lead_score.
df.loc[:, ['interaction_count', 'lead_score']].corr()

Unnamed: 0,interaction_count,lead_score
interaction_count,1.0,0.009888
lead_score,0.009888,1.0


In [14]:
# Correlation matrix for number_of_courses_viewed vs lead_score.
df.loc[:, ['number_of_courses_viewed', 'lead_score']].corr()

Unnamed: 0,number_of_courses_viewed,lead_score
number_of_courses_viewed,1.0,-0.004879
lead_score,-0.004879,1.0


In [15]:
# Correlation matrix for number_of_courses_viewed vs interaction_count.
df.loc[:, ['number_of_courses_viewed', 'interaction_count']].corr()

Unnamed: 0,number_of_courses_viewed,interaction_count
number_of_courses_viewed,1.0,-0.023565
interaction_count,-0.023565,1.0


In [16]:
# Correlation matrix for annual_income vs interaction_count.
df.loc[:, ['annual_income', 'interaction_count']].corr()

Unnamed: 0,annual_income,interaction_count
annual_income,1.0,0.027036
interaction_count,0.027036,1.0


In [18]:
# Two-stage split: hold out 20% for test, then take 25% of the remainder for
# validation (0.25 * 0.8 = 0.2 of the full data), both with a fixed seed.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# Give each partition a fresh 0..n-1 index.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

# Pull the target out of each partition: `pop` returns the column and removes
# it from the frame, so the feature frames no longer contain `converted`.
y_train = df_train.pop('converted').values
y_val = df_val.pop('converted').values
y_test = df_test.pop('converted').values

In [19]:
# Row counts of the train / validation / test partitions, in that order.
tuple(len(part) for part in (df_train, df_val, df_test))

(876, 293, 293)

In [25]:
# Categorical features are the training columns stored with object dtype.
categorical = [col for col, dtype in df_train.dtypes.items() if dtype == 'object']

In [27]:
# Mutual information between each categorical feature and the target on the
# training split; scores are computed first, then printed rounded to 2 decimals.
mi_scores = {c: mutual_info_score(df_train[c], y_train) for c in categorical}
for c, val in mi_scores.items():
    print(f'{c} - {round(val,2)}')

lead_source - 0.04
industry - 0.01
employment_status - 0.01
location - 0.0


### Question 4

In [50]:
from sklearn.feature_extraction import DictVectorizer

# Turn each split into a list of per-row dicts over all features.
train_dict, val_dict = (
    frame[categorical + numerical].to_dict(orient='records')
    for frame in (df_train, df_val)
)

# Fit the vectorizer on the training records only, then reuse the fitted
# mapping to encode the validation records.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

In [51]:
from sklearn.linear_model import LogisticRegression

# Baseline model on all features; `fit` returns the estimator itself, so the
# fitted model is bound to `model` and shown as the cell output.
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)
model

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [52]:
# Baseline accuracy of the all-features model, computed on the validation
# split (X_val / y_val): a lead is predicted as converted when its predicted
# probability of the positive class is at least 0.5.
y_pred = model.predict_proba(X_val)[:, 1]
converted_decision = y_pred >= 0.5
acc_full = np.mean(converted_decision == y_val)
acc_full

np.float64(0.6996587030716723)

In [48]:
# Leave-one-feature-out check: retrain without each feature in turn and print
# the baseline validation accuracy (`acc_full`) minus the accuracy of the
# reduced model. A positive diff means accuracy dropped without that feature.
#
# Every iteration builds its own vectorizer, matrices and model. Refitting the
# notebook-level `dv` / `model` and overwriting `X_train` / `X_val` here would
# leave them describing the last reduced feature set after the loop, so
# re-running the baseline accuracy cell would silently score the wrong model.
feats = categorical + numerical
for f in feats:
    feat_list = [x for x in feats if x != f]

    dv_subset = DictVectorizer(sparse=False)
    X_train_subset = dv_subset.fit_transform(df_train[feat_list].to_dict(orient='records'))
    X_val_subset = dv_subset.transform(df_val[feat_list].to_dict(orient='records'))

    # Same hyperparameters as the baseline model so only the feature set differs.
    model_subset = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_subset.fit(X_train_subset, y_train)

    proba_subset = model_subset.predict_proba(X_val_subset)[:, 1]
    acc = (y_val == (proba_subset >= 0.5)).mean()
    print(f'Without {f} - the diff is {round(acc_full-acc,6)}')

Without lead_source - the diff is -0.003413
Without industry - the diff is 0.0
Without employment_status - the diff is 0.003413
Without location - the diff is -0.010239
Without number_of_courses_viewed - the diff is 0.143345
Without annual_income - the diff is -0.153584
Without interaction_count - the diff is 0.143345
Without lead_score - the diff is -0.006826


### Question 6

In [55]:
# Regularisation sweep: fit one model per value of C on all features and print
# its validation accuracy (0.5 probability threshold), rounded to 9 decimals.
C_values = [0.01, 0.1, 1, 10, 100]

# The feature matrices do not depend on C, so build them once up front instead
# of re-vectorising both splits on every iteration.
train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

for c_val in C_values:
    model = LogisticRegression(solver='liblinear', C=c_val, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict_proba(X_val)[:, 1]
    converted_decision = (y_pred >= 0.5)
    acc = (y_val == converted_decision).mean()
    print(f'{c_val} - {round(acc,9)}')

0.01 - 0.699658703
0.1 - 0.699658703
1 - 0.699658703
10 - 0.699658703
100 - 0.699658703
