In [2]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv

--2025-10-13 17:39:38--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv’


2025-10-13 17:39:38 (7.33 MB/s) - ‘course_lead_scoring.csv’ saved [80876/80876]



In [3]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

df = pd.read_csv("course_lead_scoring.csv")
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [4]:
df.isnull().sum()

Unnamed: 0,0
lead_source,128
industry,134
number_of_courses_viewed,0
annual_income,181
employment_status,100
location,63
interaction_count,0
lead_score,0
converted,0


Prepare Data

In [16]:
null_object = ['lead_source', 'industry', 'employment_status', 'location']
null_float = ['annual_income']
df[null_object] = df[null_object].fillna('NA')
df[null_float] = df[null_float].fillna(0.0)

Q1

In [21]:
df['industry'].mode()[0]

'retail'

Q2

In [26]:
num_cols = ['interaction_count', 'lead_score', 'number_of_courses_viewed', 'annual_income']
corr_matrix = df[num_cols].corr()
print(corr_matrix)

                          interaction_count  lead_score  \
interaction_count                  1.000000    0.009888   
lead_score                         0.009888    1.000000   
number_of_courses_viewed          -0.023565   -0.004879   
annual_income                      0.027036    0.015610   

                          number_of_courses_viewed  annual_income  
interaction_count                        -0.023565       0.027036  
lead_score                               -0.004879       0.015610  
number_of_courses_viewed                  1.000000       0.009770  
annual_income                             0.009770       1.000000  


Split the data

In [27]:
from sklearn.model_selection import train_test_split


X = df.drop('converted', axis=1)
y = df['converted']


X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

0.5998632010943913 0.1997264021887825 0.20041039671682626


Q3

In [29]:
from sklearn.metrics import mutual_info_score

cat_features = ['industry', 'location', 'lead_source', 'employment_status']

for feature in cat_features:
    score = mutual_info_score(X_train[feature], y_train)
    print(f"{feature}: {round(score, 2)}")


industry: 0.02
location: 0.0
lead_source: 0.03
employment_status: 0.02


Q4

In [32]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

categorical = ['industry', 'location', 'lead_source', 'employment_status']
numerical = ['interaction_count', 'number_of_courses_viewed', 'annual_income', 'lead_score']

dv = DictVectorizer(sparse=False)

train_dict = X_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = X_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [33]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [36]:
from sklearn.metrics import accuracy_score

y_pred_val = model.predict(X_val)

accuracy = round(accuracy_score(y_val, y_pred_val), 2)

print("Validation Accuracy:", accuracy)

Validation Accuracy: 0.74


Q5

In [45]:
all_features = categorical + numerical

original_acc = accuracy

X_train_df = df.loc[y_train.index, all_features]
X_val_df   = df.loc[y_val.index, all_features]

results = {}

for feature in all_features:
    remaining = [f for f in all_features if f != feature]
    dv_temp = DictVectorizer(sparse=False)

    X_train_sub = dv_temp.fit_transform(X_train_df[remaining].to_dict(orient='records'))
    X_val_sub   = dv_temp.transform(X_val_df[remaining].to_dict(orient='records'))

    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train_sub, y_train)

    y_pred_val = model.predict(X_val_sub)
    acc = accuracy_score(y_val, y_pred_val)
    results[feature] = round(original_acc - acc, 4)

In [46]:
keys = ['industry', 'lead_source', 'employment_status']
sub = {k: results[k] for k in keys if k in results}

for feature, diff in sub.items():
    print(f"{feature}: difference = {diff}")

industry: difference = -0.0032
lead_source: difference = 0.0105
employment_status: difference = -0.0066


Q6

In [54]:
X_train_enc = dv.fit_transform(X_train_df.to_dict(orient='records'))
X_val_enc   = dv.transform(X_val_df.to_dict(orient='records'))

Cs = [0.01, 0.1, 1, 10, 100]
accs = {}

for C in Cs:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train_enc, y_train)
    y_pred_val = model.predict(X_val_enc)
    acc = accuracy_score(y_val, y_pred_val)
    accs[C] = round(acc, 4)

for C, a in accs.items():
    print(f"C={C}: val accuracy = {a}")

C=0.01: val accuracy = 0.7432
C=0.1: val accuracy = 0.7432
C=1: val accuracy = 0.7432
C=10: val accuracy = 0.7432
C=100: val accuracy = 0.7432
