In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mutual_info_score

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv")

In [None]:
#Data Preparation

In [5]:
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [7]:
categorical_columns = list(df.dtypes[df.dtypes =='object'].index)

In [21]:
for c in categorical_columns:
    df[c] = df[c].fillna('NA')

In [132]:
numerical_columns = list(df.dtypes[df.dtypes !='object'].index)
numerical_columns.remove('converted')
numerical_columns

['number_of_courses_viewed',
 'annual_income',
 'interaction_count',
 'lead_score']

In [20]:
for n in numerical_columns:
    df[n] = df[n].fillna(0.0)

In [3]:
#Question 1

In [4]:
df['industry'].mode()

0    retail
Name: industry, dtype: object

In [12]:
#Question 2

In [13]:
df.converted

0       1
1       0
2       1
3       0
4       1
       ..
1457    1
1458    1
1459    1
1460    1
1461    1
Name: converted, Length: 1462, dtype: int64

In [15]:
df[numerical_columns].corrwith(df.converted)

number_of_courses_viewed    0.435914
annual_income               0.078256
interaction_count           0.374573
lead_score                  0.193673
converted                   1.000000
dtype: float64

In [37]:
#Question 3

def mutual_info_converted(series):
    return mutual_info_score(series,y_train)

In [38]:
mi = df_train[categorical_columns].apply(mutual_info_converted)

In [22]:
mutual_info_score(df.lead_source,df.converted)

0.026573987738060995

In [None]:
mi.sort_values(ascending=False)

In [39]:
mi.sort_values(ascending=False)

lead_source          0.026506
employment_status    0.014230
industry             0.011724
location             0.003520
dtype: float64

In [163]:
#Validation framrwork
from sklearn.model_selection import train_test_split

In [164]:
df_full_train, df_test = train_test_split(df,test_size=0.2,random_state=42)

In [165]:
df_train, df_val = train_test_split(df,test_size=0.25,random_state=42)

In [166]:
len(df), len(df_full_train), len(df_test), len(df_train), len(df_val)

(1462, 1169, 293, 1096, 366)

In [168]:
del df_train['converted']
del df_val['converted']
del df_test['converted']

In [167]:
y_train = df_train['converted']
y_val = df_val['converted']
y_test = df_test['converted']

In [169]:
#One hot encoding
from sklearn.feature_extraction import DictVectorizer

In [170]:
train_dicts = df_train[categorical_columns].to_dict(orient='records')

In [171]:
dv = DictVectorizer(sparse=False)

In [172]:
dv.fit(train_dicts)

0,1,2
,dtype,<class 'numpy.float64'>
,separator,'='
,sparse,False
,sort,True


In [173]:
dv.transform(train_dicts)

array([[0., 1., 0., ..., 0., 1., 0.],
       [0., 0., 1., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 1., ..., 0., 0., 0.]], shape=(1096, 21))

In [50]:
dv.get_feature_names_out()

array(['employment_status=NA', 'employment_status=employed',
       'employment_status=self_employed', 'employment_status=student',
       'employment_status=unemployed', 'industry=NA',
       'industry=education', 'industry=finance', 'industry=healthcare',
       'industry=manufacturing', 'industry=other', 'industry=retail',
       'industry=technology', 'lead_source=NA', 'lead_source=events',
       'lead_source=organic_search', 'lead_source=paid_ads',
       'lead_source=referral', 'lead_source=social_media', 'location=NA',
       'location=africa', 'location=asia', 'location=australia',
       'location=europe', 'location=middle_east',
       'location=north_america', 'location=south_america'], dtype=object)

In [174]:
X_train = dv.fit_transform(train_dicts)

In [175]:
train_dicts = df_train[categorical_columns].to_dict(orient='records')
val_dicts = df_val[categorical_columns+numerical_columns].to_dict(orient='records')

In [176]:
X_val = dv.transform(val_dicts)

In [55]:
#Question 4

In [177]:
from sklearn.linear_model import LogisticRegression

In [178]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [179]:
model.fit(X_train,y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [180]:
y_pred = model.predict_proba(X_val)[:,1]

In [181]:
convert_decision = (y_pred >= 0.5)

In [161]:
#y_pred

In [183]:
og_acc = round((y_val == convert_decision).mean(),4)

In [139]:
convert_decision.astype(int)

array([1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

np.float64(0.0)

array([[ True, False],
       [False,  True],
       [False,  True],
       ...,
       [ True, False],
       [ True, False],
       [False,  True]], shape=(1096, 2))

In [140]:
#Question6
params = [0.01, 0.1, 1, 10, 100]
for p in params: 
    model = LogisticRegression(solver='liblinear', C=p, max_iter=1000, random_state=42)
    model.fit(X_train,y_train)
    y_pred = model.predict_proba(X_val)[:,1]
    convert_decision = (y_pred >= 0.5)
    print(p,round((y_val == convert_decision).mean(),2))

0.01 0.68
0.1 0.63
1 0.62
10 0.61
100 0.61


In [184]:
#Question 5
features = ['industry','employment_status','lead_source']
categorical_columns = list(df.dtypes[df.dtypes =='object'].index)

#categorical_columns = list(df.dtypes[df.dtypes =='object'].index)
categorical_columns.remove('lead_source')
for c in categorical_columns:
    df[c] = df[c].fillna('NA')
train_dicts = df_train[categorical_columns].to_dict(orient='records')
val_dicts = df_val[categorical_columns+numerical_columns].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train,y_train)
y_pred = model.predict_proba(X_val)[:,1]
convert_decision = (y_pred >= 0.5)
abs(og_acc-round((y_val == convert_decision).mean(),4))


TypeError: abs() takes exactly one argument (2 given)

In [153]:
categorical_columns = list(df.dtypes[df.dtypes =='object'].index)
categorical_columns

['lead_source', 'industry', 'employment_status', 'location']