In [198]:
#Importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from IPython.display import display
from sklearn.metrics import accuracy_score

%matplotlib inline

In [199]:
url_dataset = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'
df = pd.read_csv(url_dataset)

In [200]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [201]:
df.columns

Index(['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
       'employment_status', 'location', 'interaction_count', 'lead_score',
       'converted'],
      dtype='object')

In [202]:
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [203]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [204]:
#Cleaning Data

df.columns = df.columns.str.lower().str.replace(' ', '_')

for col in df.columns:
    if df[col].dtypes == 'object':
        df[col] = df[col].str.lower().str.replace(' ', '_')

In [205]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [206]:
#Cateogrization of features

numerical = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

categorical = ['lead_source', 'industry', 'employment_status', 'location']


In [207]:
#Dealing with nulls

df[numerical] = df[numerical].fillna(0.0)
df[categorical] = df[categorical].fillna('NA')

In [208]:
# Question 1
df.industry.value_counts()

industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

In [209]:
df.industry.mode()

0    retail
Name: industry, dtype: object

In [210]:
# Question 2

corr_matrix = df[numerical].corr()
corr_matrix

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879
annual_income,0.00977,1.0,0.027036,0.01561
interaction_count,-0.023565,0.027036,1.0,0.009888
lead_score,-0.004879,0.01561,0.009888,1.0


What are the two features that have the biggest correlation?

* interaction_count and lead_score
* number_of_courses_viewed and lead_score
* number_of_courses_viewed and interaction_count
* <b>annual_income and interaction_count</b>

0.009888, -0.004879, -0.023565, <b>0.027036</b>

In [211]:
#Setting up Validation Framework

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values
y_full_train = df_full_train.converted.values

df_full_train.reset_index(drop=True, inplace=True)
df_train.reset_index(drop=True, inplace=True)
df_val.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)

copy_df_train = df_train.copy()

del df_full_train['converted']
del df_train['converted']
del df_val['converted']
del df_test['converted']

In [212]:
# Question 3

results = []

for c in categorical:
    results.append({'Column':c, 'Mutual_Info_Score_Converted':round(mutual_info_score(copy_df_train[c], copy_df_train['converted']),2)})
    
pd.DataFrame(results)

Unnamed: 0,Column,Mutual_Info_Score_Converted
0,lead_source,0.04
1,industry,0.01
2,employment_status,0.01
3,location,0.0


In [213]:
#Some EDA
global_converstion = df['converted'].mean()
global_converstion

np.float64(0.6190150478796169)

In [214]:
# One-hot encoding
dv = DictVectorizer(sparse=False)

#Training dataset
dicts_train = df_train.to_dict(orient='records')
x_train = dv.fit_transform(dicts_train)

#Validation dataset
dicts_val = df_val.to_dict(orient='records')
x_val = dv.transform(dicts_val)


In [215]:
x_train, x_train.shape

(array([[5.8472e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [7.1738e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
         3.0000e+00],
        [8.1973e+04, 0.0000e+00, 1.0000e+00, ..., 1.0000e+00, 0.0000e+00,
         3.0000e+00],
        ...,
        [8.9042e+04, 0.0000e+00, 1.0000e+00, ..., 0.0000e+00, 0.0000e+00,
         3.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
         1.0000e+00],
        [5.0259e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
         4.0000e+00]]),
 (876, 31))

In [216]:
x_val.shape

(293, 31)

In [217]:
dv.get_feature_names_out()

array(['annual_income', 'employment_status=NA',
       'employment_status=employed', 'employment_status=self_employed',
       'employment_status=student', 'employment_status=unemployed',
       'industry=NA', 'industry=education', 'industry=finance',
       'industry=healthcare', 'industry=manufacturing', 'industry=other',
       'industry=retail', 'industry=technology', 'interaction_count',
       'lead_score', 'lead_source=NA', 'lead_source=events',
       'lead_source=organic_search', 'lead_source=paid_ads',
       'lead_source=referral', 'lead_source=social_media', 'location=NA',
       'location=africa', 'location=asia', 'location=australia',
       'location=europe', 'location=middle_east',
       'location=north_america', 'location=south_america',
       'number_of_courses_viewed'], dtype=object)

In [218]:
#Training model
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [219]:
model.fit(x_train, y_train)

In [220]:
y_pred = model.predict_proba(x_val)[:, 1]

In [221]:
y_pred

array([0.61192163, 0.79982617, 0.53021344, 0.47131479, 0.57066132,
       0.44227169, 0.87127669, 0.84883115, 0.83290038, 0.61497802,
       0.54968027, 0.78153088, 0.69039786, 0.77017122, 0.52659441,
       0.91706425, 0.53170635, 0.42123049, 0.30146455, 0.84881583,
       0.79488653, 0.73670375, 0.44527211, 0.64838383, 0.4176882 ,
       0.75393418, 0.90166116, 0.33903049, 0.43181431, 0.9680681 ,
       0.92018714, 0.37487989, 0.652301  , 0.90650057, 0.75164117,
       0.64202122, 0.82250075, 0.83375553, 0.659116  , 0.30978854,
       0.78942264, 0.35546366, 0.96517758, 0.63389304, 0.51274195,
       0.53230534, 0.82287785, 0.744074  , 0.73452314, 0.68955217,
       0.46964443, 0.84539253, 0.55635244, 0.92637871, 0.65258021,
       0.61526273, 0.63816996, 0.28304018, 0.48049825, 0.57890618,
       0.35497342, 0.62175052, 0.38960778, 0.61156056, 0.85304278,
       0.75430136, 0.89185954, 0.71946459, 0.95387623, 0.89209517,
       0.75277088, 0.33850139, 0.61376593, 0.51622275, 0.64088

In [222]:
convert_decision = (y_pred >= 0.5)

In [223]:
(y_val == convert_decision.astype(int)).mean()

np.float64(0.6996587030716723)

In [224]:
y_pred_binary = (y_pred >= 0.5).astype(int)
accuracy = accuracy_score(y_val, y_pred_binary)
print("Accuracy:", accuracy)

Accuracy: 0.6996587030716723


In [225]:
round(accuracy, 2)

0.7

In [226]:
# Question 5
y_train_pred = model.predict_proba(x_train)[:, 1]
y_train_pred_binary = (y_train_pred >= 0.5).astype(int)
original_accuracy = accuracy_score(y_train, y_train_pred_binary)
original_accuracy

0.7385844748858448

In [227]:
feature_list = list(df_train.columns)
feature_list

['lead_source',
 'industry',
 'number_of_courses_viewed',
 'annual_income',
 'employment_status',
 'location',
 'interaction_count',
 'lead_score']

In [228]:
#This removes columns from end but not what we need exactly
for i in range(len(feature_list)):
    df_new_train = df_train[feature_list[:-(i+1)]]
    display(df_new_train.columns)

Index(['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
       'employment_status', 'location', 'interaction_count'],
      dtype='object')

Index(['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
       'employment_status', 'location'],
      dtype='object')

Index(['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
       'employment_status'],
      dtype='object')

Index(['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income'], dtype='object')

Index(['lead_source', 'industry', 'number_of_courses_viewed'], dtype='object')

Index(['lead_source', 'industry'], dtype='object')

Index(['lead_source'], dtype='object')

Index([], dtype='object')

In [229]:
feature_list + ['temp']

['lead_source',
 'industry',
 'number_of_courses_viewed',
 'annual_income',
 'employment_status',
 'location',
 'interaction_count',
 'lead_score',
 'temp']

In [230]:
results = []

for i in feature_list:
    new_features = feature_list.copy()
    new_features.remove(i)
    df_new_train = df_train[new_features].copy()

    # One-hot encoding
    dv_new = DictVectorizer(sparse=False)

    #Training dataset
    dicts_train_new = df_new_train.to_dict(orient='records')
    x_train_new = dv_new.fit_transform(dicts_train_new)

    #Modelling
    model.fit(x_train_new, y_train)
    y_pred_new = model.predict_proba(x_train_new)[:, 1]
    y_pred_binary = (y_pred_new >= 0.5).astype(int)
    acc_diff = original_accuracy - accuracy_score(y_train, y_pred_binary)

    results.append({'Feature Removed': i, 'Accuracy Difference': acc_diff})

    

In [231]:
pd.DataFrame(results)

Unnamed: 0,Feature Removed,Accuracy Difference
0,lead_source,-0.003425
1,industry,-0.002283
2,number_of_courses_viewed,0.115297
3,annual_income,-0.13242
4,employment_status,0.003425
5,location,-0.003425
6,interaction_count,0.113014
7,lead_score,-0.003425


In [249]:
#Question 6

results = []

for c_val in [0.01, 0.1, 1, 10, 100]:
    model = LogisticRegression(solver='liblinear', max_iter=1000, random_state=42, C=c_val)
    model.fit(x_train, y_train)
    val_score = round(model.score(x_val, y_val), 3)
    results.append({'C_Val':c_val, 'Val_Score':val_score})


In [250]:
pd.DataFrame(results)

Unnamed: 0,C_Val,Val_Score
0,0.01,0.7
1,0.1,0.7
2,1.0,0.7
3,10.0,0.7
4,100.0,0.7


In [251]:
results = []
for c_val in [0.00001, 0.001, 0.1, 1, 10, 100, 1000000000, 10000000000000]:
    model = LogisticRegression(solver='liblinear', max_iter=1000, random_state=42, C=c_val)
    model.fit(x_train, y_train)
    val_score = round(model.score(x_val, y_val), 3)
    results.append({'C_Val':c_val, 'Val_Score':val_score})

In [252]:
pd.DataFrame(results)

Unnamed: 0,C_Val,Val_Score
0,1e-05,0.556
1,0.001,0.563
2,0.1,0.7
3,1.0,0.7
4,10.0,0.7
5,100.0,0.7
6,1000000000.0,0.7
7,10000000000000.0,0.7
