In [1]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv


--2025-10-12 20:36:24--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv’


2025-10-12 20:36:24 (3.23 MB/s) - ‘course_lead_scoring.csv’ saved [80876/80876]



In [29]:
import pandas as pd

In [30]:
df = pd.read_csv('course_lead_scoring.csv')
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [31]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [33]:
for col in df.columns:
     if df[col].dtype == 'object':
         df[col] = df[col].fillna('NA')
     else:
         df[col] = df[col].fillna(0.0)

In [34]:
# Question 1
industry_mode = df.industry.mode()
industry_mode
# df.industry.value_counts()

0    retail
Name: industry, dtype: object

In [35]:
# Question 2
# df.dtypes
df_numerical_features = df.select_dtypes(include=['float64', 'int64'])
df_numerical_features.head()

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
0,1,79450.0,4,0.94,1
1,1,46992.0,1,0.8,0
2,5,78796.0,3,0.69,1
3,2,83843.0,1,0.87,0
4,3,85012.0,3,0.62,1


In [36]:
correlation_matrix = df_numerical_features.corr()
correlation_matrix

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879,0.435914
annual_income,0.00977,1.0,0.027036,0.01561,0.053131
interaction_count,-0.023565,0.027036,1.0,0.009888,0.374573
lead_score,-0.004879,0.01561,0.009888,1.0,0.193673
converted,0.435914,0.053131,0.374573,0.193673,1.0


In [37]:
# annual_income and interaction_count have the biggest correlation which is: 0.027036

In [38]:
from sklearn.model_selection import train_test_split


In [39]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)
len(df_train), len(df_val), len(df_test)


(876, 293, 293)

In [40]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [41]:
y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

del df_train['converted']
del df_val['converted']
del df_test['converted']

In [42]:
# Question 3
from sklearn.metrics import mutual_info_score


In [43]:
list(df_full_train.columns)

['lead_source',
 'industry',
 'number_of_courses_viewed',
 'annual_income',
 'employment_status',
 'location',
 'interaction_count',
 'lead_score',
 'converted']

In [44]:
cat_variables = ['industry', 'location', 'lead_source', 'employment_status']

mi_scores_dict = {}
for category in cat_variables:
    mi = mutual_info_score(df_full_train.converted, df_full_train[category])
    mi_scores_dict[category] = round(mi, 2)

mi_scores_dict

{'industry': 0.01,
 'location': 0.0,
 'lead_source': 0.03,
 'employment_status': 0.01}

In [45]:
# Question 4
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer


In [46]:
categorical = []
numerical = []

for col in df_full_train.columns:
    if col == 'converted':
        continue
    elif df_full_train[col].dtypes == 'object':
        categorical.append(col)
    else:
        numerical.append(col)
        
print(categorical)
print()
print()
print(numerical)

df_full_train[categorical].nunique()

['lead_source', 'industry', 'employment_status', 'location']


['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']


lead_source          6
industry             8
employment_status    5
location             8
dtype: int64

In [47]:
# One-hot encoding
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [48]:
# Train logistic regression model
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [49]:
model.intercept_[0]


np.float64(-0.06914728027834098)

In [50]:
from sklearn.metrics import accuracy_score

In [51]:
y_val_pred = model.predict(X_val)

val_accuracy = round(accuracy_score(y_val, y_val_pred), 2)
val_accuracy

0.7

In [52]:
# Question 5
import numpy as np

In [53]:
dv_2 = DictVectorizer(sparse=False)

train_dict_2 = df_train[categorical + numerical].to_dict(orient='records')
X_train_2 = dv_2.fit_transform(train_dict_2)

val_dict_2 = df_val[categorical + numerical].to_dict(orient='records')
X_val_2 = dv_2.transform(val_dict_2)

In [54]:
feature_names = dv_2.get_feature_names_out()
feature_names

array(['annual_income', 'employment_status=NA',
       'employment_status=employed', 'employment_status=self_employed',
       'employment_status=student', 'employment_status=unemployed',
       'industry=NA', 'industry=education', 'industry=finance',
       'industry=healthcare', 'industry=manufacturing', 'industry=other',
       'industry=retail', 'industry=technology', 'interaction_count',
       'lead_score', 'lead_source=NA', 'lead_source=events',
       'lead_source=organic_search', 'lead_source=paid_ads',
       'lead_source=referral', 'lead_source=social_media', 'location=NA',
       'location=africa', 'location=asia', 'location=australia',
       'location=europe', 'location=middle_east',
       'location=north_america', 'location=south_america',
       'number_of_courses_viewed'], dtype=object)

In [55]:
# Train logistic regression model
model_2 = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model_2.fit(X_train_2, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [57]:
y_val__full_pred = model_2.predict(X_val_2)
val_full_accuracy = accuracy_score(y_val, y_val__full_pred)
val_full_accuracy

0.6996587030716723

In [59]:
exclude_categories = ['industry', 'employment_status', 'lead_score']

results = {}

for feature in exclude_categories:
    # Find columns related to the current feature
    cols_to_exclude = [i for i, fname in enumerate(feature_names) if fname.startswith(feature + '=') or fname == feature]

    # Drop those columns from train and val
    X_train_excl = np.delete(X_train_2, cols_to_exclude, axis=1)
    X_val_excl = np.delete(X_val_2, cols_to_exclude, axis=1)

    # Train the model without the feature
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train_excl, y_train)

    # Predict and calculate accuracy
    y_val_pred_excl = model.predict(X_val_excl)
    acc = accuracy_score(y_val, y_val_pred_excl)

    # Store the difference
    results[feature] = val_full_accuracy - acc


In [60]:
# Which feature has the smallest difference?
least_useful_feature = min(results, key=results.get)
print("Least useful feature:", least_useful_feature)
print("Accuracy differences:", results)

Least useful feature: lead_score
Accuracy differences: {'industry': 0.0, 'employment_status': 0.0034129692832763903, 'lead_score': -0.0068259385665528916}


In [66]:
# Question 6
dv_3 = DictVectorizer(sparse=False)

train_dict_3 = df_train[categorical + numerical].to_dict(orient='records')
X_train_3 = dv_3.fit_transform(train_dict_3)

val_dict_3 = df_val[categorical + numerical].to_dict(orient='records')
X_val_3 = dv_3.transform(val_dict_3)

In [67]:
C_values = [0.01, 0.1, 1, 10, 100]

In [71]:
results = {}
for C in C_values:
    model_3 = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model_3.fit(X_train_3, y_train)
    y_val_pred_3 = model.predict(X_val_3)
    acc = accuracy_score(y_val, y_val_pred_3)
    acc_rounded = round(acc, 3)
    results[C] = acc_rounded
    print(f"C={C}: Validation Accuracy = {acc_rounded}")

C=0.01: Validation Accuracy = 0.7
C=0.1: Validation Accuracy = 0.7
C=1: Validation Accuracy = 0.7
C=10: Validation Accuracy = 0.7
C=100: Validation Accuracy = 0.7
