In [1]:
import pandas as pd
import numpy as np
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv')
print(df.head())

    lead_source    industry  number_of_courses_viewed  annual_income  \
0      paid_ads         NaN                         1        79450.0   
1  social_media      retail                         1        46992.0   
2        events  healthcare                         5        78796.0   
3      paid_ads      retail                         2        83843.0   
4      referral   education                         3        85012.0   

  employment_status       location  interaction_count  lead_score  converted  
0        unemployed  south_america                  4        0.94          1  
1          employed  south_america                  1        0.80          0  
2        unemployed      australia                  3        0.69          1  
3               NaN      australia                  1        0.87          0  
4     self_employed         europe                  3        0.62          1  


In [2]:
df['industry'].mode([0])

0    retail
Name: industry, dtype: object

In [3]:
#data preparation
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [4]:
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [5]:
df.lead_source = df.lead_source.fillna('NA')
df.industry = df.industry.fillna('NA')
df.annual_income = df.annual_income.fillna(0)
df.employment_status = df.employment_status.fillna('NA')
df.location = df.location.fillna('NA')
print(df.isnull().sum())

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64


In [6]:
df['industry'].mode([0])


0    retail
Name: industry, dtype: object

In [31]:
#Q2 Correlations
numerical = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']
corr = df[numerical].corrwith(df['converted'])
print(corr)

number_of_courses_viewed    0.435914
annual_income               0.053131
interaction_count           0.374573
lead_score                  0.193673
dtype: float64


In [8]:
#Q3 splitting the data
from sklearn.model_selection import train_test_split
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
len(df_full_train), len(df_test)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)
len(df_train), len(df_val), len(df_test)


(876, 293, 293)

In [9]:
#drop indexes
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

print(df_train.head())

      lead_source       industry  number_of_courses_viewed  annual_income  \
0        paid_ads         retail                         0        58472.0   
1  organic_search  manufacturing                         3        71738.0   
2        paid_ads     technology                         3        81973.0   
3              NA     technology                         1        74956.0   
4  organic_search         retail                         3        59335.0   

  employment_status       location  interaction_count  lead_score  converted  
0           student    middle_east                  5        0.03          0  
1           student    middle_east                  6        0.77          1  
2          employed  north_america                  2        0.59          1  
3          employed         europe                  3        0.34          1  
4           student      australia                  1        0.98          1  


In [10]:
# preparing y
y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values
del df_train['converted']
del df_val['converted']
del df_test['converted']

print(df_train, y_train)

        lead_source       industry  number_of_courses_viewed  annual_income  \
0          paid_ads         retail                         0        58472.0   
1    organic_search  manufacturing                         3        71738.0   
2          paid_ads     technology                         3        81973.0   
3                NA     technology                         1        74956.0   
4    organic_search         retail                         3        59335.0   
..              ...            ...                       ...            ...   
871  organic_search          other                         1        43907.0   
872    social_media         retail                         3        64969.0   
873              NA      education                         3        89042.0   
874    social_media  manufacturing                         1            0.0   
875          events        finance                         4        50259.0   

    employment_status       location  interaction_c

In [11]:
#q3 mutual information score
from sklearn.metrics import mutual_info_score
mutual_info_score(y_train, df_train.industry)

0.011574521435657112

In [12]:
mutual_info_score(y_train, df_train.location)

0.004464157884038034

In [13]:
mutual_info_score(y_train, df_train.lead_source)

0.03539624379726594

In [14]:
mutual_info_score(y_train, df_train.employment_status)

0.012937677269442782

In [15]:
#One-hot encoding of variables
numerical = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']
categorical = ['lead_source', 'industry', 'employment_status', 'location']
from sklearn.feature_extraction import DictVectorizer
train_dict = df_train[categorical + numerical].to_dict(orient='records')
train_dict[0]

{'lead_source': 'paid_ads',
 'industry': 'retail',
 'employment_status': 'student',
 'location': 'middle_east',
 'number_of_courses_viewed': 0,
 'annual_income': 58472.0,
 'interaction_count': 5,
 'lead_score': 0.03}

In [16]:
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)
# DictVectorizer(sparse=False)
# dv.get_feature_names_out()
X_train = dv.transform(train_dict)

In [17]:
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

#Train the model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train,y_train)
model.coef_

array([[-1.77843877e-05, -1.47154423e-02,  3.39095225e-02,
         2.66248432e-03,  1.15238518e-02, -1.02527697e-01,
        -2.48510995e-02,  4.93604222e-02, -2.01258344e-02,
        -1.34214865e-02, -3.00232200e-03, -9.25991830e-03,
        -3.17957304e-02, -1.60513114e-02,  3.11339155e-01,
         5.12012528e-02,  2.01511698e-02, -1.20346284e-02,
        -1.16021521e-02, -1.15251880e-01,  7.95303436e-02,
        -2.99401329e-02,  3.95843295e-03, -1.14296944e-02,
        -1.12457415e-02, -5.59987025e-03,  8.26402635e-03,
         5.58598769e-03, -3.33967159e-02, -2.52837052e-02,
         4.53752887e-01]])

In [18]:
#Evaluating the model
y_pred = model.predict_proba(X_val)[:, 1]

In [19]:
y_pred = y_pred.astype(int)

In [20]:
(y_val == y_pred).mean().round(3)

np.float64(0.444)

In [21]:
y_pred_on_train_data = model.predict_proba(X_train)[:, 1]
y_pred_on_train = y_pred_on_train_data.astype(int)
(y_train == y_pred_on_train).mean().round(3)

np.float64(0.376)

In [22]:
#Q6 Using different C

#for C in [0.01, 0.1, 1, 10, 100]:
model = LogisticRegression(solver='liblinear', C=0.01, max_iter=1000, random_state=42)
model.fit(X_train,y_train)
model.coef_
y_pred = model.predict_proba(X_val)[:, 1]
y_pred = y_pred.astype(int)
accuracy = (y_val == y_pred).mean().round(3)
print ('C =0.01', f'accuracy = {accuracy}')

C =0.01 accuracy = 0.444


In [23]:
model = LogisticRegression(solver='liblinear', C=0.1, max_iter=1000, random_state=42)
model.fit(X_train,y_train)
model.coef_
y_pred = model.predict_proba(X_val)[:, 1]
y_pred = y_pred.astype(int)
accuracy = (y_val == y_pred).mean().round(3)
print ('C =0.1', f'accuracy = {accuracy}')

C =0.1 accuracy = 0.444


In [24]:
model = LogisticRegression(solver='liblinear', C=1, max_iter=1000, random_state=42)
model.fit(X_train,y_train)
model.coef_
y_pred = model.predict_proba(X_val)[:, 1]
y_pred = y_pred.astype(int)
accuracy = (y_val == y_pred).mean().round(3)
print ('C =1', f'accuracy = {accuracy}')

C =1 accuracy = 0.444


In [25]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train,y_train)
model.coef_
y_pred = model.predict_proba(X_val)[:, 1]
y_pred = y_pred.astype(int)
accuracy = (y_val == y_pred).mean().round(3)
print ('C =10', f'accuracy = {accuracy}')

C =10 accuracy = 0.444


In [26]:
model = LogisticRegression(solver='liblinear', C=100, max_iter=1000, random_state=42)
model.fit(X_train,y_train)
model.coef_
y_pred = model.predict_proba(X_val)[:, 1]
y_pred = y_pred.astype(int)
accuracy = (y_val == y_pred).mean().round(3)
print ('C = 100', f'accuracy = {accuracy}')

C = 100 accuracy = 0.444


In [27]:
#Q4
model = LogisticRegression(solver='liblinear', C=0.1, max_iter=1000, random_state=42)
model.fit(X_train,y_train)
model.coef_
y_pred = model.predict_proba(X_val)[:, 1]
y_pred = y_pred.astype(int)
accuracy = (y_val == y_pred).mean().round(3)
print ('C =0.1', f'accuracy = {accuracy}')

C =0.1 accuracy = 0.444


In [28]:
#Q5
#FULL MODEL:
#from sklearn.feature_extraction import DictVectorizer
#train_dict = df_train.to_dict(orient='records')
#dv = DictVectorizer(sparse=False)
#dv.fit(train_dict)
#X_train = dv.transform(train_dict)
#val_dict = df_val.to_dict(orient='records')
#X_val = dv.transform(val_dict)

#Train the model

list = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score','lead_source', 'industry', 'employment_status', 'location']
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

for i in range(len(list) - 1):
    removed = list.pop(0)
    train_dict = df_train[list].to_dict(orient='records')
    #print(train_dict[0])
    dv = DictVectorizer(sparse=False)
    dv.fit(train_dict)
    X_train = dv.transform(train_dict)
    val_dict = df_val[list].to_dict(orient='records')
    #print(val_dict[0])
    X_val = dv.transform(val_dict)
    model.fit(X_train,y_train)
    model.coef_
    y_pred = model.predict_proba(X_val)[:, 1]
    y_pred = y_pred.astype(int)
    accuracy = (y_val == y_pred).mean()
    print(removed, accuracy)

number_of_courses_viewed 0.44368600682593856
annual_income 0.44368600682593856
interaction_count 0.44368600682593856
lead_score 0.44368600682593856
lead_source 0.44368600682593856
industry 0.44368600682593856
employment_status 0.44368600682593856


In [29]:
list = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score','lead_source', 'industry', 'employment_status', 'location']
for i in range(len(list) - 1):
    list.pop(0)
    print(list[0])

annual_income
interaction_count
lead_score
lead_source
industry
employment_status
location
