In [1]:
import pandas as pd
import numpy as np

# Course lead-scoring dataset; downloaded from GitHub each time this cell runs.
DATA_URL = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'

df = pd.read_csv(DATA_URL)
df.head(n=20)




Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
5,events,manufacturing,1,59904.0,,africa,6,0.83,1
6,social_media,technology,0,51283.0,,middle_east,2,0.57,0
7,social_media,,5,62975.0,student,europe,4,0.62,1
8,referral,healthcare,4,38648.0,unemployed,south_america,2,0.86,1
9,paid_ads,other,3,59866.0,student,australia,3,0.43,1


In [2]:
# Show the dtype pandas assigned to each column.
print(df.dtypes)

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object


In [3]:
# Probe a single column: its dtype, and whether that dtype compares equal
# to the string 'object'.
datatypes = df['lead_source'].dtype
print(datatypes, datatypes == 'object', sep='\n')

object
True


In [4]:
# Preparation: fill in missing values.
#
# For categorical features, replace missing values with 'NA'.
# For numerical features, replace missing values with 0.0.

for c in df.columns:
    print(c)

    # Branch on "is this column numeric?" instead of `dtype == 'object'`:
    # text columns are not guaranteed to be reported as `object` (pandas also
    # has a dedicated string dtype), and such a column would otherwise fall
    # into the numeric branch and be filled with 0.
    if pd.api.types.is_numeric_dtype(df[c]):
        df[c] = df[c].fillna(0)
    else:
        df[c] = df[c].fillna('NA')
        

lead_source
industry
number_of_courses_viewed
annual_income
employment_status
location
interaction_count
lead_score
converted


In [5]:
# Look at the first rows again now that missing values have been filled.
df.head(n=20)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
5,events,manufacturing,1,59904.0,,africa,6,0.83,1
6,social_media,technology,0,51283.0,,middle_east,2,0.57,0
7,social_media,,5,62975.0,student,europe,4,0.62,1
8,referral,healthcare,4,38648.0,unemployed,south_america,2,0.86,1
9,paid_ads,other,3,59866.0,student,australia,3,0.43,1


In [6]:
# Q1: most frequent value of `industry`.
# Computed after the fill step, so the 'NA' placeholder counts as a value.
# `mode()` returns a Series (one entry per tied value).
ind_mode = df.industry.mode()
ind_mode

0    retail
Name: industry, dtype: object

In [7]:
from sklearn.model_selection import train_test_split

# 60/20/20 split: hold out 20% for test, then take 25% of the remaining 80%
# as validation. Both splits use the same fixed seed.
df_full_train, df_test = train_test_split(df, random_state=42, test_size=0.2)
df_train, df_val = train_test_split(df_full_train, random_state=42, test_size=0.25)

tuple(len(part) for part in (df_train, df_val, df_test))

(876, 293, 293)

In [8]:
# Replace the shuffled row labels left by the split with a clean 0..n-1 index.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

# `df_full_train` keeps its `converted` column (later cells read it), so its
# target is only copied out, not removed.
y_full_train = df_full_train['converted'].values

# For the three modelling frames, `pop` returns the target column and removes
# it from the frame in one step.
y_train = df_train.pop('converted').values
y_val = df_val.pop('converted').values
y_test = df_test.pop('converted').values


In [9]:
# EDA: work on train + validation rows with a clean index, and confirm that
# no missing values remain in any column.
df_full_train = df_full_train.reset_index(drop=True)
df_full_train.isna().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [10]:
# Mean of the 0/1 target = share of converted leads in train + validation.
df_full_train['converted'].mean()

np.float64(0.6073567151411463)

In [11]:
# Class balance of the target, as proportions.
df_full_train['converted'].value_counts(normalize=True)

converted
1    0.607357
0    0.392643
Name: proportion, dtype: float64

In [12]:
# Categorical feature columns; later cells reuse this list.
categorical = [
    'lead_source',
    'industry',
    'employment_status',
    'location',
]

# Number of distinct values per categorical column.
df_full_train.loc[:, categorical].nunique()

lead_source          6
industry             8
employment_status    5
location             8
dtype: int64

In [13]:
# Correlation matrix of interaction_count and lead_score on train + validation rows.
cm1 = df_full_train.loc[:, ['interaction_count', 'lead_score']].corr()
print(cm1)

                   interaction_count  lead_score
interaction_count           1.000000    0.025393
lead_score                  0.025393    1.000000


In [14]:
# Same correlation matrix, restricted to the training rows.
cm2 = df_train.loc[:, ['interaction_count', 'lead_score']].corr()
print(cm2)

                   interaction_count  lead_score
interaction_count           1.000000    0.011374
lead_score                  0.011374    1.000000


In [15]:
# |correlation| of interaction_count with lead_score (train + validation rows).
df_full_train[['interaction_count']].corrwith(df_full_train['lead_score']).abs()

interaction_count    0.025393
dtype: float64

In [16]:
# |correlation| of interaction_count with lead_score (training rows only).
df_train[['interaction_count']].corrwith(df_train['lead_score']).abs()

interaction_count    0.011374
dtype: float64

In [17]:
# |correlation| of number_of_courses_viewed with lead_score (train + validation rows).
df_full_train[['number_of_courses_viewed']].corrwith(df_full_train['lead_score']).abs()

number_of_courses_viewed    0.009427
dtype: float64

In [18]:
# |correlation| of number_of_courses_viewed with lead_score (training rows only).
df_train[['number_of_courses_viewed']].corrwith(df_train['lead_score']).abs()

number_of_courses_viewed    0.011529
dtype: float64

In [19]:
# |correlation| of number_of_courses_viewed with interaction_count (train + validation rows).
df_full_train[['number_of_courses_viewed']].corrwith(df_full_train['interaction_count']).abs()

number_of_courses_viewed    0.044381
dtype: float64

In [20]:
# |correlation| of number_of_courses_viewed with interaction_count (training rows only).
df_train[['number_of_courses_viewed']].corrwith(df_train['interaction_count']).abs()

number_of_courses_viewed    0.050187
dtype: float64

In [21]:
# |correlation| of annual_income with interaction_count (training rows only).
df_train[['annual_income']].corrwith(df_train['interaction_count']).abs()

annual_income    0.01551
dtype: float64

In [22]:
# |correlation| of annual_income with interaction_count (train + validation rows).
df_full_train[['annual_income']].corrwith(df_full_train['interaction_count']).abs()

annual_income    0.011959
dtype: float64

In [23]:
# Q3: mutual information between each categorical feature and the target,
# measured on the training rows.
from sklearn.metrics import mutual_info_score

for c in categorical:
    im = mutual_info_score(labels_true=df_train[c], labels_pred=y_train)
    print(c, im)


lead_source 0.03539624379726594
industry 0.011574521435657112
employment_status 0.012937677269442782
location 0.004464157884038034


In [24]:
# Same mutual-information scores, measured on train + validation rows.
for c in categorical:
    im = mutual_info_score(labels_true=df_full_train[c], labels_pred=y_full_train)
    print(c, im)


lead_source 0.025665373935054955
industry 0.011684562750165564
employment_status 0.013258496589914293
location 0.0022530354195563346


In [84]:
# Q4: one-hot encode the features and build the train / validation matrices.
from sklearn.feature_extraction import DictVectorizer

numerical = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

# sparse=False returns dense numpy arrays instead of scipy sparse matrices.
dv = DictVectorizer(sparse=False)

# Select the same explicit feature columns for both splits. Encoding every
# column of `df_train` would silently pick up any extra column it happens to
# carry (including the target, if it had not been removed) and would no
# longer match what the validation rows provide.
train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

# The validation rows reuse the encoder fitted on the training rows.
val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [85]:
# Peek at the training rows fed to the vectorizer (the target is already removed).
df_train.head(n=5)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score
0,paid_ads,retail,0,58472.0,student,middle_east,5,0.03
1,organic_search,manufacturing,3,71738.0,student,middle_east,6,0.77
2,paid_ads,technology,3,81973.0,employed,north_america,2,0.59
3,,technology,1,74956.0,employed,europe,3,0.34
4,organic_search,retail,3,59335.0,student,australia,1,0.98


In [88]:
# Column names of the encoded matrix, in the order the vectorizer emits them.
dv.get_feature_names_out()

array(['annual_income', 'employment_status=NA',
       'employment_status=employed', 'employment_status=self_employed',
       'employment_status=student', 'employment_status=unemployed',
       'industry=NA', 'industry=education', 'industry=finance',
       'industry=healthcare', 'industry=manufacturing', 'industry=other',
       'industry=retail', 'industry=technology', 'interaction_count',
       'lead_score', 'lead_source=NA', 'lead_source=events',
       'lead_source=organic_search', 'lead_source=paid_ads',
       'lead_source=referral', 'lead_source=social_media', 'location=NA',
       'location=africa', 'location=asia', 'location=australia',
       'location=europe', 'location=middle_east',
       'location=north_america', 'location=south_america',
       'number_of_courses_viewed'], dtype=object)

In [87]:
# Encoded feature vector of the second training record.
X_train[1, :]

array([7.1738e+04, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 6.0000e+00,
       7.7000e-01, 0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
       3.0000e+00])

In [79]:
# Display the encoded validation matrix.
X_val

array([[5.2220e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        3.0000e+00],
       [5.9656e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        3.0000e+00],
       [5.7134e+04, 0.0000e+00, 0.0000e+00, ..., 1.0000e+00, 0.0000e+00,
        0.0000e+00],
       ...,
       [7.4166e+04, 0.0000e+00, 1.0000e+00, ..., 0.0000e+00, 1.0000e+00,
        1.0000e+00],
       [3.9103e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        2.0000e+00],
       [4.7129e+04, 1.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.0000e+00,
        1.0000e+00]])

In [27]:
def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)) of a scalar or array ``z``."""
    return 1 / (1 + np.exp(-z))


def logistic_regression(xi, weights=None, bias=None):
    """Return the logistic-regression probability for one feature vector.

    Parameters
    ----------
    xi : sequence of float
        Feature values of a single record; must have at least
        ``len(weights)`` entries.
    weights : sequence of float, optional
        One coefficient per feature. When omitted, the notebook-level
        ``w`` is used.
    bias : float, optional
        Intercept term. When omitted, the notebook-level ``w0`` is used.

    Returns
    -------
    float
        ``sigmoid(bias + sum_j xi[j] * weights[j])``.

    Raises
    ------
    NameError
        If a parameter is omitted and the corresponding global (``w`` or
        ``w0``) has not been defined.
    """
    # The original one-argument form read `w` and `w0` from the notebook
    # namespace, where nothing defines them. Explicit parameters make the
    # function usable on its own; the fallback keeps `logistic_regression(xi)`
    # working for code that does set those globals.
    if weights is None:
        weights = w
    if bias is None:
        bias = w0

    score = bias

    for j in range(len(weights)):
        score = score + xi[j] * weights[j]

    return sigmoid(score)

In [74]:
from sklearn.linear_model import LogisticRegression

# Baseline model trained on all features. Its validation accuracy is the
# reference that the feature-removal runs in `get_correct_rate` are compared
# against, so it uses the same hyperparameters as those runs (C=1.0);
# otherwise the reported differences would mix the effect of dropping a
# feature with the effect of a different regularization strength.
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,100
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [75]:
# Second column of `predict_proba`, i.e. the probability for the model's second class, for each validation row.
y_pred = model.predict_proba(X_val)[:, 1]

In [76]:
# Hard decision: predict "converted" when the probability reaches 0.5.
conversion_decision = y_pred >= 0.5

In [77]:
# Validation accuracy: fraction of rows where the decision matches the target.
np.mean(y_val == conversion_decision)

np.float64(0.6996587030716723)

In [48]:
# Collect probability, thresholded prediction and true label side by side.
df_pred = pd.DataFrame({
    'probability': y_pred,
    'prediction': conversion_decision.astype(int),
    'actual': y_val,
})

In [49]:
# Flag the rows where the prediction agrees with the true label.
df_pred['correct'] = df_pred['prediction'].eq(df_pred['actual'])

In [59]:
# Accuracy of the all-features model; used as the reference in Q5.
current_rate = df_pred['correct'].mean()

In [60]:
# Q5

In [63]:
def get_correct_rate(features, current_rate):
    """Train on a subset of features and report the change in validation accuracy.

    Encodes ``df_train[features]`` and ``df_val[features]`` with a fresh
    ``DictVectorizer``, fits a liblinear logistic regression (C=1.0) against
    ``y_train``, thresholds the validation probabilities at 0.5 and prints
    the resulting accuracy against ``y_val``.

    Reads the notebook-level ``df_train``, ``df_val``, ``y_train`` and
    ``y_val``; does not modify any notebook-level object.

    Parameters
    ----------
    features : list of str
        Names of the columns to train on.
    current_rate : float
        Reference accuracy to compare against.

    Returns
    -------
    float
        ``current_rate - rate``, where ``rate`` is the accuracy obtained with
        ``features``; positive when the subset scores below the reference.
    """
    # Use a private vectorizer. Refitting the shared notebook-level `dv` here
    # would leave it describing a reduced feature set and no longer matching
    # the global `X_train` / `X_val` / `model` built from it.
    subset_dv = DictVectorizer(sparse=False)

    train_records = df_train[features].to_dict(orient='records')
    X_train_subset = subset_dv.fit_transform(train_records)

    val_records = df_val[features].to_dict(orient='records')
    X_val_subset = subset_dv.transform(val_records)

    subset_model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    subset_model.fit(X_train_subset, y_train)

    y_pred = subset_model.predict_proba(X_val_subset)[:, 1]
    conversion_decision = (y_pred >= 0.5)

    rate = (y_val == conversion_decision).mean()

    print("The rate is: ", rate)

    return (current_rate - rate)

In [65]:
# Q5 run: retrain on every feature except 'industry' and compare with the baseline.
features = [*categorical, *numerical]
features_i = [*features]
features_i.remove('industry')

print(features_i)
print(get_correct_rate(features_i, current_rate))



['lead_source', 'employment_status', 'location', 'number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']
The rate is:  0.6996587030716723
0.0


In [66]:
# Q5 run: retrain on every feature except 'employment_status' and compare with the baseline.
features = [*categorical, *numerical]
features_e = [*features]
features_e.remove('employment_status')

print(features_e)
print(get_correct_rate(features_e, current_rate))

['lead_source', 'industry', 'location', 'number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']
The rate is:  0.6962457337883959
0.0034129692832763903


In [67]:
# Q5 run: retrain on every feature except 'lead_score' and compare with the baseline.
features = [*categorical, *numerical]
features_l = [*features]
features_l.remove('lead_score')

print(features_l)
print(get_correct_rate(features_l, current_rate))

['lead_source', 'industry', 'employment_status', 'location', 'number_of_courses_viewed', 'annual_income', 'interaction_count']
The rate is:  0.7064846416382252
-0.0068259385665528916


In [68]:
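# Q5 candidates and the accuracy difference observed when each is removed
# (baseline accuracy minus accuracy without the feature):
# 'industry'          ->  0.0
# 'employment_status' ->  0.0034
# 'lead_score'        -> -0.0068
# Removing 'industry' changes the validation accuracy the least.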

In [69]:
# Q6

In [72]:
def reg(new):
    """Return the validation accuracy of a logistic regression with ``C=new``.

    Fits a liblinear model on the notebook-level ``X_train`` / ``y_train``,
    thresholds its probabilities for ``X_val`` at 0.5 and returns the
    fraction of decisions that match ``y_val``.
    """
    classifier = LogisticRegression(
        solver='liblinear',
        C=new,
        max_iter=1000,
        random_state=42,
    ).fit(X_train, y_train)

    positive_proba = classifier.predict_proba(X_val)[:, 1]
    return (y_val == (positive_proba >= 0.5)).mean()

In [73]:
# Q6: validation accuracy for each candidate value of C.
cs = [0.01, 0.1, 1, 10, 100]

for c in cs:
    print(reg(c))
    

0.6996587030716723
0.6996587030716723
0.6996587030716723
0.6996587030716723
0.6996587030716723
