In [1]:
import os
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'
if not os.path.exists('./course_lead_scoring.csv'):
    !wget $data
!ls

WA_Fn-UseC_-Telco-Customer-Churn.csv  churn-prediction.ipynb   data.csv
car_price_new.ipynb		      course_lead_scoring.csv  homework.ipynb


In [2]:
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv('./course_lead_scoring.csv')
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [4]:
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [5]:
numerical = list(df.select_dtypes(include=['int64', 'float64']))[:-1]
categorical = list(df.select_dtypes(include=['object']))

In [6]:
df[numerical] = df[numerical].fillna(0)
df[categorical] = df[categorical].fillna('NA')

In [7]:
df.industry.mode()

0    retail
Name: industry, dtype: object

In [8]:
df[['interaction_count','number_of_courses_viewed']].corrwith(df.lead_score)

interaction_count           0.009888
number_of_courses_viewed   -0.004879
dtype: float64

In [9]:
df[['annual_income','number_of_courses_viewed']].corrwith(df.interaction_count)

annual_income               0.027036
number_of_courses_viewed   -0.023565
dtype: float64

In [10]:
from sklearn.model_selection import train_test_split

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)
len(df_train), len(df_val), len(df_test)

(876, 293, 293)

In [11]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

del df_train['converted']
del df_val['converted']
del df_test['converted']

In [12]:
from sklearn.metrics import mutual_info_score

def mutual_info_converted_score(series):
    mi = mutual_info_score(series, y_train)
    return round(mi, 2)

mi = df_train[categorical].apply(mutual_info_converted_score)
mi.sort_values(ascending=False)

lead_source          0.04
industry             0.01
employment_status    0.01
location             0.00
dtype: float64

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_train_cat = ohe.fit_transform(df_train[categorical])
X_train_num = df_train[numerical].values
X_train = np.column_stack([X_train_cat, X_train_num])

In [14]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [15]:
model.intercept_

array([-0.06914728])

In [16]:
model.coef_

array([[ 2.01511698e-02, -1.20346284e-02, -1.16021521e-02,
        -1.15251880e-01,  7.95303436e-02, -2.99401329e-02,
        -2.48510995e-02,  4.93604222e-02, -2.01258344e-02,
        -1.34214865e-02, -3.00232200e-03, -9.25991830e-03,
        -3.17957304e-02, -1.60513114e-02, -1.47154423e-02,
         3.39095225e-02,  2.66248432e-03,  1.15238518e-02,
        -1.02527697e-01,  3.95843295e-03, -1.14296944e-02,
        -1.12457415e-02, -5.59987025e-03,  8.26402635e-03,
         5.58598769e-03, -3.33967159e-02, -2.52837052e-02,
         4.53752887e-01, -1.77843869e-05,  3.11339155e-01,
         5.12012528e-02]])

In [17]:
X_val_cat = ohe.transform(df_val[categorical])
X_val_num = df_val[numerical].values
X_val = np.column_stack([X_val_cat, X_val_num])

In [18]:
y_pred = model.predict_proba(X_val)[:, 1]
converted_pred = (y_pred >= 0.5)
converted_pred

array([ True,  True,  True, False,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
       False,  True,  True,  True, False,  True, False,  True,  True,
       False, False,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True, False,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True, False, False,  True, False,  True, False,
        True,  True,  True,  True,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True, False, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True, False,  True,
       False,  True,  True, False,  True,  True, False,  True,  True,
       False,  True,

In [19]:
round((y_val == converted_pred).mean(), 2)

np.float64(0.7)

In [20]:
original_acc = (y_val == converted_pred).mean()

In [21]:
columns = ['industry', 'employment_status', 'lead_score']
for col in columns:
    cat = list(df.select_dtypes(include=['object']))
    num = list(df.select_dtypes(include=['int64', 'float64']))[:-1]
    if col in num:
        num.remove(col)
    else:
        cat.remove(col)
    
    ohe_test = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    X_train_cat = ohe_test.fit_transform(df_train[cat])
    X_train_num = df_train[num].values
    X_train = np.column_stack([X_train_cat, X_train_num])

    X_val_cat = ohe_test.transform(df_val[cat])
    X_val_num = df_val[num].values
    X_val = np.column_stack([X_val_cat, X_val_num])

    model_test = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_test.fit(X_train, y_train)

    y_pred = model_test.predict_proba(X_val)[:, 1]
    converted_pred = (y_pred >= 0.5)

    new_acc = (y_val == converted_pred).mean()
    diff = original_acc - new_acc
    print(col, diff)
    

industry 0.0
employment_status 0.0034129692832763903
lead_score -0.0068259385665528916


In [22]:
from sklearn.linear_model import Ridge

C = [0.01, 0.1, 1, 10, 100]

X_train_cat = ohe.fit_transform(df_train[categorical])
X_train_num = df_train[numerical].values
X_train = np.column_stack([X_train_cat, X_train_num])

X_val_cat = ohe.transform(df_val[categorical])
X_val_num = df_val[numerical].values
X_val = np.column_stack([X_val_cat, X_val_num])

for r in C:
    model_reg = Ridge(alpha=r, random_state=1)
    model_reg.fit(X_train, y_train)

    y_pred = model_reg.predict(X_val)
    converted_pred = (y_pred >= 0.5)
    acc = (y_val == converted_pred).mean()
    
    print(f'r: {r}, bias: {model_reg.intercept_}, accuracy: {acc:.3f}')

r: 0.01, bias: -0.17537157147403337, accuracy: 0.850
r: 0.1, bias: -0.17514227260065518, accuracy: 0.850
r: 1, bias: -0.17287822928289331, accuracy: 0.846
r: 10, bias: -0.15277263569055133, accuracy: 0.857
r: 100, bias: -0.0559361735250522, accuracy: 0.836
