In [105]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score

In [106]:
df = pd.read_csv("https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv")

In [107]:
df.head(5)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [108]:
df.isna().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [109]:
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [110]:
categorical = list(df.dtypes[df.dtypes == 'object'].index)
numerical = list(df.dtypes[(df.dtypes == 'int64')  | (df.dtypes == 'float64') ].index)
numerical.remove('converted')

In [111]:
numerical, categorical

(['number_of_courses_viewed',
  'annual_income',
  'interaction_count',
  'lead_score'],
 ['lead_source', 'industry', 'employment_status', 'location'])

In [112]:
df[categorical]=df[categorical].fillna('NA')
df[numerical]=df[numerical].fillna(0.0)

In [113]:
df.isna().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

## Q1

In [114]:
df.industry.value_counts()

industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

In [115]:
df['industry'].mode()[0]

'retail'

## Q2

In [116]:
df[numerical].head(5)

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
0,1,79450.0,4,0.94
1,1,46992.0,1,0.8
2,5,78796.0,3,0.69
3,2,83843.0,1,0.87
4,3,85012.0,3,0.62


In [117]:
df[numerical].corr()

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879
annual_income,0.00977,1.0,0.027036,0.01561
interaction_count,-0.023565,0.027036,1.0,0.009888
lead_score,-0.004879,0.01561,0.009888,1.0


In [118]:
pairs = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count')
]

for f1, f2 in pairs:
    corr = df[[f1, f2]].corr().iloc[0, 1]
    print(f"{f1} vs {f2}: {corr:.3f}")

interaction_count vs lead_score: 0.010
number_of_courses_viewed vs lead_score: -0.005
number_of_courses_viewed vs interaction_count: -0.024
annual_income vs interaction_count: 0.027


In [119]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [120]:
len(df_train), len(df_val), len(df_test)

(876, 293, 293)

## Q3

In [121]:
y_train=df_train.converted.values
y_val=df_val.converted.values
y_test=df_test.converted.values

In [122]:
del df_train['converted']
del df_val['converted']
del df_test['converted']

In [123]:
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
df_val= df_val.reset_index(drop=True)

In [124]:
X_train=df_train.values
X_val=df_val.values
X_test=df_test.values

In [125]:
def mi_score(series):
    return mutual_info_score(series, y_train)
    
mi = round(df_train[categorical].apply(mi_score),2)
mi.sort_values(ascending=False)

lead_source          0.04
industry             0.01
employment_status    0.01
location             0.00
dtype: float64

## Q4

In [126]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [127]:
X_train

array([[5.8472e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [7.1738e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        3.0000e+00],
       [8.1973e+04, 0.0000e+00, 1.0000e+00, ..., 1.0000e+00, 0.0000e+00,
        3.0000e+00],
       ...,
       [8.9042e+04, 0.0000e+00, 1.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        3.0000e+00],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00],
       [5.0259e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        4.0000e+00]], shape=(876, 31))

In [128]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [129]:
y_pred = model.predict_proba(X_val)[:, 1]

In [130]:
accuracy_score(y_val, y_pred >= 0.5)

0.6996587030716723

In [133]:
y_pred = model.predict(X_val)
acc = accuracy_score(y_val, y_pred)
print(round(acc, 4))

0.6997


In [84]:
initial_acc=accuracy_score(y_val, y_pred >= 0.5)

## Q5

In [85]:
categorical+numerical

['lead_source',
 'industry',
 'employment_status',
 'location',
 'number_of_courses_viewed',
 'annual_income',
 'interaction_count',
 'lead_score']

In [142]:
for feature in ['industry', 'employment_status', 'lead_score']:
    selected_columns=categorical+numerical
    selected_columns.remove(feature)
    dv = DictVectorizer(sparse=False)
    train_dict = df_train[selected_columns].to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)

    val_dict = df_val[selected_columns].to_dict(orient='records')
    X_val = dv.transform(val_dict)

    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    
    y_pred = model.predict_proba(X_val)[:, 1]
    acc_sc1=accuracy_score(y_val, y_pred >= 0.5)
    
    y_pred = model.predict(X_val)
    acc_sc2 = accuracy_score(y_val, y_pred)
    print(feature,initial_acc-acc_sc1,initial_acc-acc_sc2)
    

industry 0.0 0.0
employment_status 0.0034129692832763903 0.0034129692832763903
lead_score -0.0068259385665528916 -0.0068259385665528916


## Q6

In [134]:
selected_columns=categorical+numerical

dv = DictVectorizer(sparse=False)
train_dict = df_train[selected_columns].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[selected_columns].to_dict(orient='records')
X_val = dv.transform(val_dict)


for c in [0.01, 0.1, 1, 10, 100]:
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict_proba(X_val)[:, 1]
    acc_sc1=accuracy_score(y_val, y_pred >= 0.5)
    y_pred = model.predict(X_val)
    acc_sc2 = accuracy_score(y_val, y_pred)
    acc_sc3 = roc_auc_score(y_val, y_pred)
    print(c, acc_sc1,acc_sc2,acc_sc3)

0.01 0.6996587030716723 0.6996587030716723 0.6669891458235017
0.1 0.6996587030716723 0.6996587030716723 0.6677678150070787
1 0.6996587030716723 0.6996587030716723 0.6677678150070787
10 0.6996587030716723 0.6996587030716723 0.6677678150070787
100 0.6996587030716723 0.6996587030716723 0.6677678150070787
