In [1]:
import pandas as pd
import numpy as np

In [2]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'

In [3]:
!wget $data

--2025-10-13 19:56:34--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv.2’


2025-10-13 19:56:34 (49.6 MB/s) - ‘course_lead_scoring.csv.2’ saved [80876/80876]



In [4]:
df = pd.read_csv(data)

In [5]:
df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


In [6]:
df.isnull()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,False,True,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False
3,False,False,False,False,True,False,False,False,False
4,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
1457,False,False,False,True,False,False,False,False,False
1458,False,False,False,False,False,False,False,False,False
1459,False,False,False,False,False,False,False,False,False
1460,False,True,False,False,False,False,False,False,False


In [7]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [8]:
categorical_features = ['lead_source', 'industry', 'employment_status', 'location']
numerical_features = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

In [9]:
# Replace missing values for categorical features with 'NA'
for feature in categorical_features:
    df[feature] = df[feature].fillna('NA')
    

In [10]:
# Replace missing values for numerical features with 0.0
for feature in numerical_features:
    df[feature] = df[feature].fillna(0.0)

In [11]:
df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,0.0,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


In [12]:
df.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [13]:
df.shape

(1462, 9)

In [14]:
df['industry'].mode()[0]

'retail'

In [15]:
correlation_matrix = df[numerical_features].corr()
correlation_matrix

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879
annual_income,0.00977,1.0,0.027036,0.01561
interaction_count,-0.023565,0.027036,1.0,0.009888
lead_score,-0.004879,0.01561,0.009888,1.0


In [16]:
pairs = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count')
]

In [17]:
correlations = {}
for feat1, feat2 in pairs:
    corr_value = correlation_matrix.loc[feat1, feat2]
    correlations[f"{feat1} & {feat2}"] = corr_value

In [18]:
correlations

{'interaction_count & lead_score': np.float64(0.009888182496913131),
 'number_of_courses_viewed & lead_score': np.float64(-0.004878998354681276),
 'number_of_courses_viewed & interaction_count': np.float64(-0.023565222882888037),
 'annual_income & interaction_count': np.float64(0.02703647240481443)}

In [19]:
max_pair = max(correlations, key=correlations.get)
max_corr = correlations[max_pair]
max_corr

np.float64(0.02703647240481443)

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [22]:
df_full_train.converted

1066    0
638     1
799     1
380     0
303     1
       ..
1130    0
1294    0
860     0
1459    1
1126    0
Name: converted, Length: 1169, dtype: int64

In [23]:
from sklearn.metrics import mutual_info_score

In [24]:
# mi_scores = {}
# for feature in categorical_features:
    # Calculate MI score on training set
 #   mi_score = mutual_info_score(df_full_train.converted, df_full_train[feature])
    #mi_scores[feature] = round(mi_score, 2)
    # print(f"{feature}: {mi_scores[feature]}")
mutual_info_score(df_full_train.converted, df_full_train.lead_score)



0.06889924779118137

In [25]:
mutual_info_score(df_full_train.converted, df_full_train.industry)

0.011684562750165564

In [26]:
mutual_info_score(df_full_train.converted, df_full_train.employment_status)

0.013258496589914293

In [27]:
mutual_info_score(df_full_train.converted, df_full_train.location)

0.0022530354195563346

In [28]:
def mutual_info_converted_score(series):
    return mutual_info_score(series, df_full_train.converted)

In [29]:
df_full_train[categorical_features].apply(mutual_info_converted_score)

lead_source          0.025665
industry             0.011685
employment_status    0.013258
location             0.002253
dtype: float64