In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the lead-scoring dataset from a CSV file given by a relative path.
df = pd.read_csv('course_lead_scoring.csv')

In [3]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [4]:
# Most frequent value(s) of the `industry` column.
df.industry.mode()

0    retail
Name: industry, dtype: object

In [5]:
# Normalise column names: lower-case first, then spaces -> underscores.
lowercased = df.columns.str.lower()
df.columns = lowercased.str.replace(' ', '_')

In [6]:
# Columns whose dtype is `object` are treated as categorical features.
categorical = [name for name, dtype in df.dtypes.items() if dtype == 'object']
categorical

['lead_source', 'industry', 'employment_status', 'location']

In [7]:
# Numerical feature columns (the target `converted` is not part of this list).
numerical = [
    'number_of_courses_viewed',
    'annual_income',
    'interaction_count',
    'lead_score',
]

In [8]:
# Show the numerical feature columns.
df.loc[:, numerical]

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
0,1,79450.0,4,0.94
1,1,46992.0,1,0.80
2,5,78796.0,3,0.69
3,2,83843.0,1,0.87
4,3,85012.0,3,0.62
...,...,...,...,...
1457,1,,4,0.53
1458,3,65259.0,2,0.24
1459,1,45688.0,3,0.02
1460,5,71016.0,0,0.25


In [9]:
# Normalise categorical values: lower-case, then spaces -> underscores.
for column in categorical:
    lowered = df[column].str.lower()
    df[column] = lowered.str.replace(' ', '_')

In [10]:
# Transposed preview: one column per row of the first five records.
df.head(n=5).transpose()

Unnamed: 0,0,1,2,3,4
lead_source,paid_ads,social_media,events,paid_ads,referral
industry,,retail,healthcare,retail,education
number_of_courses_viewed,1,1,5,2,3
annual_income,79450.0,46992.0,78796.0,83843.0,85012.0
employment_status,unemployed,employed,unemployed,,self_employed
location,south_america,south_america,australia,australia,europe
interaction_count,4,1,3,1,3
lead_score,0.94,0.8,0.69,0.87,0.62
converted,1,0,1,0,1


In [11]:
# Missing numerical values are replaced with 0.
df[numerical] = df[numerical].fillna(value=0)

In [12]:
# Missing categorical values are replaced with the literal label 'No Value'.
df[categorical] = df[categorical].fillna(value='No Value')

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Hold out 20% of the rows for testing, then take 25% of the remainder
# for validation; both splits use a fixed seed.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
)
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=1,
)

In [15]:
# Row counts of the train / validation / test splits.
tuple(len(split) for split in (df_train, df_val, df_test))

(876, 293, 293)

In [16]:
# Replace the shuffled index of each split with a fresh 0..n-1 index.
df_train, df_val, df_test = (
    split.reset_index(drop=True) for split in (df_train, df_val, df_test)
)

In [17]:
# Target arrays: the `converted` column of each split.
y_train = df_train['converted'].to_numpy()
y_val = df_val['converted'].to_numpy()
y_test = df_test['converted'].to_numpy()

In [18]:
# `lead_score` is one of the model features (it is listed in `numerical`), so it is
# kept in all three splits rather than deleted here.
# The target column `converted` also stays in the frames: the mutual-information
# cell reads `df_train.converted`, and the modelling cells choose their inputs
# through explicit column lists. Guard those lists so the target can never be
# selected as a feature.
assert 'converted' not in categorical, "target column leaked into `categorical`"
assert 'converted' not in numerical, "target column leaked into `numerical`"

In [19]:
# Per-column count of missing values in the train+validation frame.
df_full_train.isna().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [20]:
# Class balance of the target in the train+validation frame, as proportions.
df_full_train['converted'].value_counts(normalize=True)

converted
1    0.611634
0    0.388366
Name: proportion, dtype: float64

In [21]:
from IPython.display import display

In [22]:
# Despite the name, this is the mean of `converted` (a rate, not a count).
# NOTE(review): it is computed over the whole of `df`, test rows included.
global_conversion_count = df.converted.mean()

In [23]:
# Show the value of `global_conversion_count`.
global_conversion_count

0.6190150478796169

In [24]:
# For every feature, group the whole of `df` by the feature's values and compare
# each group's mean of `converted` with `global_conversion_count`:
#   diff = group mean - global value, risk = group mean / global value.
# NOTE(review): continuous columns such as `annual_income` yield mostly
# single-row groups here.
combined_list = numerical + categorical
for feature in combined_list:
    print(feature)
    group_stats = df.groupby(feature)['converted'].agg(['mean', 'count'])
    group_stats = group_stats.assign(
        diff=group_stats['mean'] - global_conversion_count,
        risk=group_stats['mean'] / global_conversion_count,
    )
    display(group_stats)
    print()

number_of_courses_viewed


Unnamed: 0_level_0,mean,count,diff,risk
number_of_courses_viewed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.243094,181,-0.375921,0.392711
1,0.472422,417,-0.146593,0.763183
2,0.621134,388,0.002119,1.003423
3,0.836431,269,0.217416,1.351229
4,0.926606,109,0.30759,1.496903
5,0.985075,67,0.36606,1.591358
6,1.0,22,0.380985,1.61547
7,1.0,6,0.380985,1.61547
8,1.0,2,0.380985,1.61547
9,1.0,1,0.380985,1.61547



annual_income


Unnamed: 0_level_0,mean,count,diff,risk
annual_income,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,0.60221,181,-0.016805,0.972852
13929.0,0.00000,1,-0.619015,0.000000
16132.0,0.00000,1,-0.619015,0.000000
16351.0,1.00000,1,0.380985,1.615470
17735.0,1.00000,1,0.380985,1.615470
...,...,...,...,...
101908.0,1.00000,1,0.380985,1.615470
102742.0,0.00000,1,-0.619015,0.000000
102855.0,0.00000,1,-0.619015,0.000000
104863.0,1.00000,1,0.380985,1.615470



interaction_count


Unnamed: 0_level_0,mean,count,diff,risk
interaction_count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.333333,75,-0.285682,0.53849
1,0.368421,209,-0.250594,0.595173
2,0.4875,320,-0.131515,0.787541
3,0.650146,343,0.031131,1.050291
4,0.750916,273,0.131901,1.213082
5,0.854015,137,0.235,1.379635
6,0.966102,59,0.347087,1.560708
7,0.969697,33,0.350682,1.566516
8,1.0,8,0.380985,1.61547
9,1.0,3,0.380985,1.61547



lead_score


Unnamed: 0_level_0,mean,count,diff,risk
lead_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.00,0.555556,9,-0.063459,0.897483
0.01,0.600000,15,-0.019015,0.969282
0.02,0.380952,21,-0.238063,0.615417
0.03,0.294118,17,-0.324897,0.475138
0.04,0.285714,14,-0.333301,0.461563
...,...,...,...,...
0.96,0.705882,17,0.086867,1.140331
0.97,0.750000,16,0.130985,1.211602
0.98,0.700000,20,0.080985,1.130829
0.99,0.928571,14,0.309556,1.500079



lead_source


Unnamed: 0_level_0,mean,count,diff,risk
lead_source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No Value,0.671875,128,0.05286,1.085394
events,0.596,250,-0.023015,0.96282
organic_search,0.617021,282,-0.001994,0.996779
paid_ads,0.44697,264,-0.172045,0.722066
referral,0.807692,260,0.188677,1.304802
social_media,0.604317,278,-0.014699,0.976255



industry


Unnamed: 0_level_0,mean,count,diff,risk
industry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No Value,0.559701,134,-0.059314,0.904181
education,0.748663,187,0.129648,1.209442
finance,0.595,200,-0.024015,0.961204
healthcare,0.604278,187,-0.014737,0.976193
manufacturing,0.666667,174,0.047652,1.07698
other,0.611111,198,-0.007904,0.987231
retail,0.586207,203,-0.032808,0.946999
technology,0.569832,179,-0.049183,0.920547



employment_status


Unnamed: 0_level_0,mean,count,diff,risk
employment_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No Value,0.56,100,-0.059015,0.904663
employed,0.689024,328,0.070009,1.113098
self_employed,0.653409,352,0.034394,1.055563
student,0.652299,348,0.033284,1.053769
unemployed,0.497006,334,-0.122009,0.802898



location


Unnamed: 0_level_0,mean,count,diff,risk
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No Value,0.698413,63,0.079398,1.128264
africa,0.601064,188,-0.017951,0.971
asia,0.620513,195,0.001498,1.00242
australia,0.605405,185,-0.01361,0.978014
europe,0.652778,216,0.033763,1.054543
middle_east,0.631313,198,0.012298,1.019867
north_america,0.595556,225,-0.023459,0.962102
south_america,0.598958,192,-0.020057,0.967599





In [25]:
from sklearn.metrics import mutual_info_score

In [30]:
def mutual_info_lead_score(series):
    """Return the mutual information between ``series`` and ``df_train.converted``.

    Note that, despite the function name, the reference variable is the
    ``converted`` column of ``df_train``, not ``lead_score``.
    """
    target = df_train.converted
    return mutual_info_score(series, target)

In [31]:
# Mutual information of each categorical column with the target, highest first.
mi = df_train.loc[:, categorical].apply(mutual_info_lead_score, axis=0)
mi.sort_values(ascending=False)

lead_source          0.024803
employment_status    0.016345
industry             0.006161
location             0.001453
dtype: float64

In [32]:
# Absolute correlation of every numerical column with `lead_score`, highest first.
corr = df.loc[:, numerical].corrwith(df['lead_score']).abs()
corr.sort_values(ascending=False)

lead_score                  1.000000
annual_income               0.015610
interaction_count           0.009888
number_of_courses_viewed    0.004879
dtype: float64

In [33]:
# Absolute correlation of every numerical column with `interaction_count`, highest first.
corr = df.loc[:, numerical].corrwith(df['interaction_count']).abs()
corr.sort_values(ascending=False)

interaction_count           1.000000
annual_income               0.027036
number_of_courses_viewed    0.023565
lead_score                  0.009888
dtype: float64

In [34]:
from sklearn.feature_extraction import DictVectorizer

In [35]:
# Encode the categorical columns as a dense matrix; the vectorizer is fitted on
# the training split only and reused for validation.
# NOTE(review): only `categorical` columns are passed in; the numerical
# features are not part of the model input.
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical].to_dict(orient='records')
val_dict = df_val[categorical].to_dict(orient='records')

X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [39]:
# Baseline classifier configuration (C=1.0, fixed seed).
model = LogisticRegression(
    C=1.0,
    solver='liblinear',
    max_iter=1000,
    random_state=42,
)

In [40]:
# Fit the classifier on the DictVectorizer-encoded training matrix.
model.fit(X_train, y_train)

In [41]:
# Fitted coefficients (first row of `coef_`), rounded to two decimals.
np.round(model.coef_[0], 2)

array([-0.46,  0.51,  0.26,  0.42, -0.44, -0.19,  0.71, -0.06,  0.01,
        0.12, -0.09, -0.11, -0.1 ,  0.17,  0.13,  0.05, -0.77,  0.88,
       -0.17,  0.39,  0.02,  0.03, -0.19,  0.01, -0.02, -0.04,  0.09])

In [42]:
# Keep column 1 of the predicted probabilities for the validation rows.
val_proba = model.predict_proba(X_val)
y_pred = val_proba[:, 1]

In [43]:
# Hard decision: predict conversion when the probability is at least 0.5.
convert_decision = y_pred >= 0.5

In [44]:
# Validation accuracy: share of rows where the decision matches the label.
np.mean(y_val == convert_decision)

0.6177474402730375

In [45]:
# Side-by-side table of probability, hard prediction and true label.
df_pred = pd.DataFrame({
    'probability': y_pred,
    'prediction': convert_decision.astype(int),
    'actual': y_val,
})

In [46]:
# Flag the rows where the prediction equals the true label.
df_pred['correct'] = df_pred['prediction'].eq(df_pred['actual'])

In [47]:
# Accuracy as the mean of the `correct` flags.
df_pred['correct'].mean()

0.6177474402730375

In [49]:
# Same accuracy, computed with scikit-learn's helper.
accuracy = accuracy_score(y_true=y_val, y_pred=convert_decision)
accuracy

0.6177474402730375

In [50]:
# Map each encoded feature name to its rounded coefficient.
{
    name: weight
    for name, weight in zip(dv.get_feature_names_out(), model.coef_[0].round(2))
}

{'employment_status=No Value': -0.46,
 'employment_status=employed': 0.51,
 'employment_status=self_employed': 0.26,
 'employment_status=student': 0.42,
 'employment_status=unemployed': -0.44,
 'industry=No Value': -0.19,
 'industry=education': 0.71,
 'industry=finance': -0.06,
 'industry=healthcare': 0.01,
 'industry=manufacturing': 0.12,
 'industry=other': -0.09,
 'industry=retail': -0.11,
 'industry=technology': -0.1,
 'lead_source=No Value': 0.17,
 'lead_source=events': 0.13,
 'lead_source=organic_search': 0.05,
 'lead_source=paid_ads': -0.77,
 'lead_source=referral': 0.88,
 'lead_source=social_media': -0.17,
 'location=No Value': 0.39,
 'location=africa': 0.02,
 'location=asia': 0.03,
 'location=australia': -0.19,
 'location=europe': 0.01,
 'location=middle_east': -0.02,
 'location=north_america': -0.04,
 'location=south_america': 0.09}

In [51]:
# Re-display the list of categorical feature names.
categorical

['lead_source', 'industry', 'employment_status', 'location']

In [52]:
# Categorical features with `industry` left out.
categorical_minus_industry = [
    'lead_source',
    'employment_status',
    'location',
]

In [53]:
# Re-encode train/validation using the feature list without `industry`;
# the vectorizer is fitted on the training split only.
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical_minus_industry].to_dict(orient='records')
val_dict = df_val[categorical_minus_industry].to_dict(orient='records')

X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

In [54]:
# Fresh classifier with the same configuration as the baseline.
model = LogisticRegression(
    C=1.0,
    solver='liblinear',
    max_iter=1000,
    random_state=42,
)

In [55]:
# Fit on the training matrix that was encoded without `industry`.
model.fit(X_train,y_train)

In [56]:
# Keep column 1 of the predicted probabilities for the validation rows.
val_proba = model.predict_proba(X_val)
y_pred = val_proba[:, 1]

In [57]:
# Hard decision at the 0.5 probability threshold.
convert_decision = y_pred >= 0.5

In [58]:
# Rebuild the prediction table for the model trained without `industry`.
df_pred = pd.DataFrame({'probability': y_pred}).assign(
    prediction=convert_decision.astype(int),
    actual=y_val,
)

In [59]:
# Flag the rows where the prediction equals the true label.
df_pred['correct'] = df_pred['prediction'] == df_pred['actual']

In [60]:
# Validation accuracy without `industry`.
df_pred['correct'].mean()

0.6109215017064846

In [61]:
# Categorical features with `employment_status` left out.
categorical_minus_empStatus = [
    'lead_source',
    'industry',
    'location',
]

In [62]:
# Re-encode train/validation using the feature list without `employment_status`;
# the vectorizer is fitted on the training split only.
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical_minus_empStatus].to_dict(orient='records')
val_dict = df_val[categorical_minus_empStatus].to_dict(orient='records')

X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

In [63]:
# Fresh classifier with the same configuration as the baseline.
model = LogisticRegression(
    C=1.0,
    solver='liblinear',
    max_iter=1000,
    random_state=42,
)

In [64]:
# Fit on the training matrix that was encoded without `employment_status`.
model.fit(X_train,y_train)

In [65]:
# Keep column 1 of the predicted probabilities for the validation rows.
val_proba = model.predict_proba(X_val)
y_pred = val_proba[:, 1]

In [66]:
# Hard decision at the 0.5 probability threshold.
convert_decision = y_pred >= 0.5

In [68]:
# Validation accuracy without `employment_status`.
np.mean(y_val == convert_decision)

0.5972696245733788

In [69]:
# Back to the full categorical feature list for the experiments on C;
# the vectorizer is fitted on the training split only.
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical].to_dict(orient='records')
val_dict = df_val[categorical].to_dict(orient='records')

X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

In [70]:
# Refit with C=1.0 and report validation accuracy at the 0.5 threshold.
# Only the final expression of a cell is displayed, so no intermediate
# expression statements are kept.
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_val)[:, 1]
convert_decision = (y_pred >= 0.5)
(y_val == convert_decision).mean()

0.6177474402730375

In [71]:
# Refit with C=0.01 and report validation accuracy at the 0.5 threshold.
# Only the final expression of a cell is displayed, so no intermediate
# expression statements are kept.
model = LogisticRegression(solver='liblinear', C=0.01, max_iter=1000, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_val)[:, 1]
convert_decision = (y_pred >= 0.5)
(y_val == convert_decision).mean()

0.6143344709897611

In [72]:
# Refit with C=0.1 and report validation accuracy at the 0.5 threshold.
# Only the final expression of a cell is displayed, so no intermediate
# expression statements are kept.
model = LogisticRegression(solver='liblinear', C=0.1, max_iter=1000, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_val)[:, 1]
convert_decision = (y_pred >= 0.5)
(y_val == convert_decision).mean()

0.6279863481228669

In [73]:
# Refit with C=10 and report validation accuracy at the 0.5 threshold.
# Only the final expression of a cell is displayed, so no intermediate
# expression statements are kept.
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_val)[:, 1]
convert_decision = (y_pred >= 0.5)
(y_val == convert_decision).mean()

0.621160409556314

In [74]:
# Refit with C=100 and report validation accuracy at the 0.5 threshold.
# Only the final expression of a cell is displayed, so no intermediate
# expression statements are kept.
model = LogisticRegression(solver='liblinear', C=100, max_iter=1000, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_val)[:, 1]
convert_decision = (y_pred >= 0.5)
(y_val == convert_decision).mean()

0.621160409556314