In [1]:
# Public CSV holding the lead-scoring dataset that is loaded into `df` further down.
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'

In [4]:
# Download into the working directory; `-O` keeps the remote file name
# (course_lead_scoring.csv), which is the path read_csv opens later.
!curl -O $data

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80876  100 80876    0     0   127k      0 --:--:-- --:--:-- --:--:--  127k


In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [29]:
df = pd.read_csv('course_lead_scoring.csv')

In [30]:
df .head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [12]:
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [31]:
categorical = list(df.dtypes[df.dtypes == 'object'].index)

In [32]:
numerical = list(df.dtypes[df.dtypes != 'object'].index)

In [20]:
categorical

['lead_source', 'industry', 'employment_status', 'location']

In [21]:
numerical

['number_of_courses_viewed',
 'annual_income',
 'interaction_count',
 'lead_score',
 'converted']

In [34]:
df[categorical].isnull().sum()

lead_source          128
industry             134
employment_status    100
location              63
dtype: int64

In [36]:
# Replace missing categorical values with the literal string 'NA' so that
# "missing" is encoded as its own category later on.
df[categorical] = df[categorical].fillna('NA')

In [37]:
# Replace missing numeric values with 0.0.
# NOTE(review): `numerical` also contains the target column `converted`.
df[numerical] = df[numerical].fillna(0.0)

In [38]:
df[numerical].isnull().sum()

number_of_courses_viewed    0
annual_income               0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [40]:
df[categorical].isnull().sum()

lead_source          0
industry             0
employment_status    0
location             0
dtype: int64

In [43]:
df['industry'].mode()

0    retail
Name: industry, dtype: object

In [49]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [62]:
# Magnitude of the correlation between interaction_count and lead_score.
print('interaction_count and lead_score')
abs(df[['interaction_count']].corrwith(df['lead_score']))

interaction_count and lead_score


interaction_count    0.009888
dtype: float64

In [63]:
# Magnitude of the correlation between number_of_courses_viewed and lead_score.
print('number_of_courses_viewed and lead_score')
abs(df[['number_of_courses_viewed']].corrwith(df['lead_score']))

number_of_courses_viewed and lead_score


number_of_courses_viewed    0.004879
dtype: float64

In [64]:
# Magnitude of the correlation between number_of_courses_viewed and interaction_count.
print('number_of_courses_viewed and interaction_count')
abs(df[['number_of_courses_viewed']].corrwith(df['interaction_count']))

number_of_courses_viewed and interaction_count


number_of_courses_viewed    0.023565
dtype: float64

In [65]:
# Magnitude of the correlation between annual_income and interaction_count.
print('annual_income and interaction_count')
abs(df[['annual_income']].corrwith(df['interaction_count']))

annual_income and interaction_count


annual_income    0.027036
dtype: float64

In [67]:
from sklearn.model_selection import train_test_split

In [82]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [84]:
y_train = df_train['converted']
y_test = df_test['converted']
y_val = df_val['converted']

In [86]:
del df_train['converted']
del df_test['converted']
del df_val['converted']

In [83]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [87]:
len(df_train)

876

In [88]:
len(df_test)

293

In [89]:
len(df_val)

293

In [90]:
len(y_train)

876

In [92]:
from sklearn.metrics import mutual_info_score

In [93]:
categorical

['lead_source', 'industry', 'employment_status', 'location']

In [95]:
# Mutual information between each categorical feature and the target,
# computed on the training split only. Name and score go on separate lines.
for col in categorical:
    print(col, mutual_info_score(df_train[col], y_train), sep='\n')
    

lead_source
0.03539624379726594
industry
0.011574521435657112
employment_status
0.012937677269442782
location
0.004464157884038034


In [119]:
from sklearn.feature_extraction import DictVectorizer

In [178]:
dv = DictVectorizer(sparse=False)

In [179]:
dicts_train = df_train.to_dict(orient='records')

In [180]:
dv = DictVectorizer(sparse=False)

In [182]:
dv.fit(dicts_train)

In [183]:
X_train = dv.transform(dicts_train)

In [184]:
dv.get_feature_names_out()

array(['annual_income', 'employment_status=NA',
       'employment_status=employed', 'employment_status=self_employed',
       'employment_status=student', 'employment_status=unemployed',
       'industry=NA', 'industry=education', 'industry=finance',
       'industry=healthcare', 'industry=manufacturing', 'industry=other',
       'industry=retail', 'industry=technology', 'interaction_count',
       'lead_score', 'lead_source=NA', 'lead_source=events',
       'lead_source=organic_search', 'lead_source=paid_ads',
       'lead_source=referral', 'lead_source=social_media', 'location=NA',
       'location=africa', 'location=asia', 'location=australia',
       'location=europe', 'location=middle_east',
       'location=north_america', 'location=south_america',
       'number_of_courses_viewed'], dtype=object)

In [185]:
from sklearn.linear_model import LogisticRegression

In [186]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [187]:
model.fit(X_train, y_train)

In [188]:
dicts_val = df_val.to_dict(orient='records')

In [189]:
X_val = dv.transform(dicts_val)

In [190]:
y_pred = model.predict_proba(X_val)[:, 1]

In [191]:
convert_decision = y_pred >= 0.5

In [192]:
(convert_decision == y_val).mean()

np.float64(0.6996587030716723)

In [193]:
from sklearn.metrics import accuracy_score

In [194]:
accuracy_score(y_val, y_pred >= 0.5)

0.6996587030716723

In [195]:
round((convert_decision == y_val).mean(),2)

np.float64(0.7)

In [196]:
# Validation accuracy of the all-features model; kept as the baseline that the
# feature-removal experiments below are compared against.
original_accuracy = (convert_decision == y_val).mean()

In [197]:
original_accuracy

np.float64(0.6996587030716723)

In [199]:
list(df.columns)

['lead_source',
 'industry',
 'number_of_courses_viewed',
 'annual_income',
 'employment_status',
 'location',
 'interaction_count',
 'lead_score',
 'converted']

In [202]:
# Full feature set (target excluded), in the same order as the DataFrame columns.
all_features = [
    'lead_source',
    'industry',
    'number_of_courses_viewed',
    'annual_income',
    'employment_status',
    'location',
    'interaction_count',
    'lead_score',
]

# Each ablation list is the full set minus exactly one feature, order preserved.
without_industry = [name for name in all_features if name != 'industry']
without_employment_status = [name for name in all_features if name != 'employment_status']
without_lead_score = [name for name in all_features if name != 'lead_score']

In [231]:
dv_without_industry = DictVectorizer(sparse=False)
dicts_without_industry_train = df_train[without_industry].to_dict(orient='records')
dicts_without_industry_val = df_val[without_industry].to_dict(orient='records')

In [232]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
X_train_without_industry = dv_without_industry.fit_transform(dicts_without_industry_train)
model.fit(X_train_without_industry , y_train)

In [233]:
X_val_without_industry = dv_without_industry.transform(dicts_without_industry_val)

In [234]:

y_pred_without_industry = model.predict_proba(X_val_without_industry)[:, 1]

In [235]:
accuracy_score_without_industry=accuracy_score(y_val, y_pred_without_industry >= 0.5)

In [236]:
print("**Difference in accuracy without industry**")
abs(accuracy_score_without_industry - original_accuracy)

**Difference in accuracy without industry**


np.float64(0.0)

In [218]:
dv_without_employment_status = DictVectorizer(sparse=False)
dicts_without_employment_status_train = df_train[without_employment_status].to_dict(orient='records')
dicts_without_employment_status_val = df_val[without_employment_status].to_dict(orient='records')

In [219]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
X_train_without_employment_status = dv_without_employment_status.fit_transform(dicts_without_employment_status_train)
model.fit(X_train_without_employment_status, y_train)

In [220]:
X_val_without_employment_status = dv_without_employment_status.transform(dicts_without_employment_status_val)

In [221]:
y_pred_without_employment_status = model.predict_proba(X_val_without_employment_status)[:, 1]

In [222]:
accuracy_score_without_employment_status=accuracy_score(y_val, y_pred_without_employment_status >= 0.5)

In [223]:
print("**Difference in accuracy without employment status**")
abs(accuracy_score_without_employment_status - original_accuracy)

**Difference in accuracy without employment status**


np.float64(0.0034129692832763903)

In [217]:
accuracy_score_without_employment_status

0.6962457337883959

In [225]:
dv_without_lead_score = DictVectorizer(sparse=False)
dicts_without_lead_score_train = df_train[without_lead_score].to_dict(orient='records')
dicts_without_lead_score_val = df_val[without_lead_score].to_dict(orient='records')

In [226]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
X_train_without_lead_score = dv_without_lead_score.fit_transform(dicts_without_lead_score_train)
model.fit(X_train_without_lead_score, y_train)

In [227]:
X_val_without_lead_score = dv_without_lead_score.transform(dicts_without_lead_score_val)

In [228]:
y_pred_without_lead_score = model.predict_proba(X_val_without_lead_score)[:, 1]

In [229]:
accuracy_score_without_lead_score=accuracy_score(y_val, y_pred_without_lead_score >= 0.5)

In [230]:
print("**Difference in accuracy without lead score**")
abs(accuracy_score_without_lead_score - original_accuracy)

**Difference in accuracy without lead score**


np.float64(0.0068259385665528916)

In [249]:
from sklearn.metrics import accuracy_score
features = ['lead_source',
 'industry',
 'number_of_courses_viewed',
 'annual_income',
 'employment_status',
 'location',
 'interaction_count',
 'lead_score']

# The encoded matrices do not depend on C, so build them once instead of
# re-fitting the vectorizer on every iteration of the sweep.
dicts_train = df_train[features].to_dict(orient='records')
dicts_val = df_val[features].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(dicts_train)
X_val = dv.transform(dicts_val)

# Sweep the inverse regularization strength and report validation accuracy
# at the 0.5 threshold, rounded to three decimals.
for c in [0.01, 0.1, 1, 10, 100]:
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_val)[:, 1]
    final_accuracy_score = accuracy_score(y_val, y_pred >= 0.5)
    print("C -> %f, accuracy %f" % (c,round(final_accuracy_score,3)))

C -> 0.010000, accuracy 0.700000
C -> 0.100000, accuracy 0.700000
C -> 1.000000, accuracy 0.700000
C -> 10.000000, accuracy 0.700000
C -> 100.000000, accuracy 0.700000
