## Module 3: Homework

In [1]:
import pandas as pd
import numpy as np

### 1. Data Prep

In [2]:
# Load the lead-scoring CSV. The path is relative, so it is resolved against
# the kernel's current working directory.
df = pd.read_csv("../data/course_lead_scoring.csv")

In [3]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [4]:
# Clean every object-dtype (text) column: missing values become the literal
# "na", then values are lower-cased and spaces are replaced by underscores.
object_columns = [name for name, dtype in df.dtypes.items() if dtype == "object"]
for column in object_columns:
    filled = df[column].fillna("na")
    df[column] = filled.str.lower().str.replace(" ", "_")

In [5]:
# Impute missing annual incomes with 0.0.
income_filled = df["annual_income"].fillna(0.0)
df["annual_income"] = income_filled

### 2. EDA: Correlation

Q1: Mode of industry

In [6]:
df.industry.mode()

0    retail
Name: industry, dtype: object

In [7]:
df.dtypes.index

Index(['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
       'employment_status', 'location', 'interaction_count', 'lead_score',
       'converted'],
      dtype='object')

In [8]:
# Numeric feature columns, used for the correlation checks and as model inputs.
num_vars = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

Q2: Biggest correlation

In [9]:
# Absolute correlation of each numeric feature with lead_score, strongest first.
corr_with_lead_score = df[num_vars].corrwith(df["lead_score"])
corr_with_lead_score.abs().sort_values(ascending=False)

lead_score                  1.000000
annual_income               0.015610
interaction_count           0.009888
number_of_courses_viewed    0.004879
dtype: float64

In [10]:
# Absolute correlation of each numeric feature with interaction_count, strongest first.
corr_with_interactions = df[num_vars].corrwith(df["interaction_count"])
corr_with_interactions.abs().sort_values(ascending=False)

interaction_count           1.000000
annual_income               0.027036
number_of_courses_viewed    0.023565
lead_score                  0.009888
dtype: float64

### 3. Validation Framework

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Two-stage split: hold out 20% of the rows for testing, then take 25% of the
# remainder for validation (i.e. 60/20/20 overall). Both stages use one seed.
split_seed = 42
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=split_seed)
df_train, df_val = train_test_split(
    df_full_train, test_size=0.25, random_state=split_seed
)

tuple(len(part) for part in (df_train, df_val, df_test))

(876, 293, 293)

In [13]:
# Pull the target out of each split as a NumPy array. `pop` returns the
# column and removes it from the frame in one step, so the feature frames
# no longer contain `converted`.
y_train = df_train.pop("converted").values
y_val = df_val.pop("converted").values
y_test = df_test.pop("converted").values

### 4. EDA: Mutual Information

Q3: Mutual information score

In [14]:
from sklearn.metrics import mutual_info_score

In [15]:
# Categorical feature columns, used for mutual information and one-hot encoding.
cat_vars = ['lead_source', 'industry', 'employment_status', 'location']

In [16]:
# Mutual information between each categorical feature and the training
# target, shown strongest first and rounded to two decimals.
mi = pd.Series(
    {name: mutual_info_score(df_train[name], y_train) for name in cat_vars}
)
mi.sort_values(ascending=False).round(2)

lead_source          0.04
employment_status    0.01
industry             0.01
location             0.00
dtype: float64

### 5. One-hot Encoding

In [17]:
from sklearn.feature_extraction import DictVectorizer

In [18]:
# Fit the one-hot encoder on the training rows only and build the dense
# training matrix from the numeric + categorical feature columns.
dv = DictVectorizer(sparse=False)

df_train_features = df_train[num_vars + cat_vars]
dict_train = df_train_features.to_dict(orient="records")
X_train = dv.fit_transform(dict_train)

In [19]:
# Encode the validation rows with the already-fitted vectorizer
# (transform only, no re-fit).
df_val_features = df_val[num_vars + cat_vars]
dict_val = df_val_features.to_dict(orient="records")
X_val = dv.transform(dict_val)

### 6. Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression

In [21]:
# Baseline logistic regression on the full feature set.
lr_settings = {
    "solver": "liblinear",
    "C": 1.0,
    "max_iter": 1000,
    "random_state": 42,
}
model = LogisticRegression(**lr_settings)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [22]:
model.intercept_[0]

np.float64(-0.06914728027837501)

In [23]:
model.coef_[0]

array([-1.77843868e-05,  3.39095225e-02, -1.47154423e-02,  2.66248432e-03,
        1.15238518e-02, -1.02527697e-01,  4.93604222e-02, -2.01258344e-02,
       -1.34214865e-02, -3.00232200e-03, -2.48510995e-02, -9.25991830e-03,
       -3.17957304e-02, -1.60513114e-02,  3.11339155e-01,  5.12012528e-02,
       -1.20346284e-02,  2.01511698e-02, -1.16021521e-02, -1.15251880e-01,
        7.95303436e-02, -2.99401329e-02, -1.14296944e-02, -1.12457415e-02,
       -5.59987025e-03,  8.26402635e-03,  5.58598769e-03,  3.95843295e-03,
       -3.33967159e-02, -2.52837052e-02,  4.53752887e-01])

In [24]:
# Class predictions of the baseline model for the validation rows.
y_pred = model.predict(X_val)
y_pred

array([1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 0])

Q4: Accuracy

In [25]:
# Baseline validation accuracy: the fraction of predictions that match the labels.
org_accuracy = np.mean(y_pred == y_val)
round(org_accuracy, 3)

np.float64(0.7)

### 7. Feature Elimination

In [26]:
from IPython.display import display

# Estimator instance that feature_elimination re-fits on every call; it uses
# the same hyper-parameters as the baseline `model`.
fe_model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)

In [27]:
def feature_elimination(df_train, f, df_val):
    """Retrain without feature ``f`` and report the change in validation accuracy.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training feature frame. It is not modified; a reduced copy is used.
    f : str
        Name of the column to leave out. Must exist in ``df_train``.
    df_val : pandas.DataFrame
        Validation feature frame. It is not modified.

    Returns
    -------
    The signed difference ``org_accuracy - accuracy`` (positive when removing
    the feature lowers accuracy).

    Notes
    -----
    Reads ``y_train``, ``y_val`` and ``org_accuracy`` from the notebook
    namespace, re-fits the notebook-level ``fe_model``, prints/displays the
    result, and stores ``abs(diff)`` in the notebook-level ``diffs`` dict
    under key ``f`` (``diffs`` must exist before this function is called).
    """
    # drop() returns new frames, so the caller's data is left untouched.
    train_reduced = df_train.drop(columns=[f])
    # Drop from validation too so both matrices are built from the same
    # columns, rather than relying on transform() ignoring unseen keys.
    val_reduced = df_val.drop(columns=[f], errors="ignore")

    # Use a fresh vectorizer for each run. Re-fitting the shared `dv` here
    # would leave it fitted on a reduced feature set and out of sync with the
    # baseline model and its matrices.
    local_dv = DictVectorizer(sparse=False)
    X_train = local_dv.fit_transform(train_reduced.to_dict(orient="records"))
    X_val = local_dv.transform(val_reduced.to_dict(orient="records"))

    fe_model.fit(X_train, y_train)
    y_pred = fe_model.predict(X_val)
    accuracy = (y_val == y_pred).mean()
    diff = org_accuracy - accuracy
    print(f)
    display("Accuracy", round(accuracy, 3), "Diff", round(diff, 3))
    # The magnitude is stored so features can be ranked by how much they matter.
    diffs[f] = abs(diff)
    print()
    return diff

In [28]:
# Leave out each feature column in turn; feature_elimination records the
# absolute accuracy change for that column in `diffs`.
diffs = {}
for feature in df_train.columns:
    feature_elimination(df_train, feature, df_val)

lead_source


'Accuracy'

np.float64(0.703)

'Diff'

np.float64(-0.003)


industry


'Accuracy'

np.float64(0.7)

'Diff'

np.float64(0.0)


number_of_courses_viewed


'Accuracy'

np.float64(0.556)

'Diff'

np.float64(0.143)


annual_income


'Accuracy'

np.float64(0.853)

'Diff'

np.float64(-0.154)


employment_status


'Accuracy'

np.float64(0.696)

'Diff'

np.float64(0.003)


location


'Accuracy'

np.float64(0.71)

'Diff'

np.float64(-0.01)


interaction_count


'Accuracy'

np.float64(0.556)

'Diff'

np.float64(0.143)


lead_score


'Accuracy'

np.float64(0.706)

'Diff'

np.float64(-0.007)




In [29]:
# (feature, |accuracy change|) pairs, smallest change first.
sorted(diffs.items(), key=lambda item: item[1])

[('industry', np.float64(0.0)),
 ('employment_status', np.float64(0.0034129692832763903)),
 ('lead_source', np.float64(0.0034129692832765013)),
 ('lead_score', np.float64(0.0068259385665528916)),
 ('location', np.float64(0.010238907849829393)),
 ('number_of_courses_viewed', np.float64(0.14334470989761094)),
 ('interaction_count', np.float64(0.14334470989761094)),
 ('annual_income', np.float64(0.15358361774744034))]

Q5: Accuracy difference

In [30]:
# Feature whose removal changes validation accuracy the least.
min(diffs.items(), key=lambda item: item[1])[0]

'industry'

### 8. Regularization

Q6: C with best accuracy

In [31]:
# Sweep the regularisation parameter C and report validation accuracy for
# each value. Scores are also collected in `c_scores` (C -> accuracy) so the
# best value can be read off programmatically.
c_scores = {}
for C in [0.01, 0.1, 1, 10, 100]:
    r_model = LogisticRegression(solver="liblinear", C=C, max_iter=1000, random_state=42)
    r_model.fit(X_train, y_train)
    # Use a separate name for these predictions. Overwriting the notebook-level
    # `y_pred` would make a later re-run of the baseline accuracy cell report
    # the last sweep model instead of the baseline.
    y_pred_c = r_model.predict(X_val)
    score = (y_val == y_pred_c).mean()
    c_scores[C] = score
    display(C, score)

0.01

np.float64(0.6996587030716723)

0.1

np.float64(0.6996587030716723)

1

np.float64(0.6996587030716723)

10

np.float64(0.6996587030716723)

100

np.float64(0.6996587030716723)