In [60]:
import pandas as pd
import numpy as np

In [61]:
# Load the course lead-scoring dataset directly from its raw GitHub URL.
df = pd.read_csv(
    'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'
)

# Sanity check: show the first five rows as plain text.
print(df.head(5))

    lead_source    industry  number_of_courses_viewed  annual_income  \
0      paid_ads         NaN                         1        79450.0   
1  social_media      retail                         1        46992.0   
2        events  healthcare                         5        78796.0   
3      paid_ads      retail                         2        83843.0   
4      referral   education                         3        85012.0   

  employment_status       location  interaction_count  lead_score  converted  
0        unemployed  south_america                  4        0.94          1  
1          employed  south_america                  1        0.80          0  
2        unemployed      australia                  3        0.69          1  
3               NaN      australia                  1        0.87          0  
4     self_employed         europe                  3        0.62          1  


In [62]:
# Normalise column names: lower-case, with spaces turned into underscores.
normalised_names = df.columns.str.lower()
df.columns = normalised_names.str.replace(' ', '_')

# Columns stored with the generic `object` dtype are treated as text columns.
is_object_column = df.dtypes == 'object'
categorical_columns = df.dtypes[is_object_column].index.tolist()

# Apply the same lower-case / underscore normalisation to every text value.
for c in categorical_columns:
    lowered = df[c].str.lower()
    df[c] = lowered.str.replace(' ', '_')

In [63]:
# Transposed preview of the first rows: one output row per dataframe column.
df.head().T

Unnamed: 0,0,1,2,3,4
lead_source,paid_ads,social_media,events,paid_ads,referral
industry,,retail,healthcare,retail,education
number_of_courses_viewed,1,1,5,2,3
annual_income,79450.0,46992.0,78796.0,83843.0,85012.0
employment_status,unemployed,employed,unemployed,,self_employed
location,south_america,south_america,australia,australia,europe
interaction_count,4,1,3,1,3
lead_score,0.94,0.8,0.69,0.87,0.62
converted,1,0,1,0,1


In [64]:
# Q1: how often does each industry value occur?
industry_counts = df.industry.value_counts()

# Heading and table are emitted by one print call, separated by a newline.
print("\nFrequency of each industry:", industry_counts, sep="\n")


Frequency of each industry:
industry
retail           203
finance          200
other            198
education        187
healthcare       187
technology       179
manufacturing    174
Name: count, dtype: int64


In [65]:
# Count the missing values in every column.
print(df.isnull().sum())

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64


In [66]:
# Feature groups: numeric columns and text (categorical) columns.
# The target column `converted` is deliberately in neither list.
numerical = [
    'number_of_courses_viewed',
    'annual_income',
    'interaction_count',
    'lead_score',
]
categorical = [
    'lead_source',
    'industry',
    'employment_status',
    'location',
]

In [67]:
# Impute missing values with constants: the literal string 'NA' for the
# categorical features and 0 for the numerical ones.
df[categorical] = df[categorical].fillna(value='NA')
df[numerical] = df[numerical].fillna(value=0)

In [68]:
# Count the missing values in every column again (expected to be all zeros
# once the fill step has run).
print(df.isnull().sum())

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64


In [69]:
from sklearn.model_selection import train_test_split

In [70]:
# Two-stage split with a fixed seed (random_state=1) for reproducibility:
#   1. hold out 20% of all rows as the test set;
#   2. take 25% of the remaining rows as the validation set.
# 25% of the remaining 80% is 20% of the data, i.e. a 60/20/20 split overall.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [71]:
# Correlation between interaction_count and lead_score on the full-train rows.
df_full_train.interaction_count.corr(df_full_train.lead_score)

np.float64(0.011290499650258736)

In [72]:
# Correlation between number_of_courses_viewed and lead_score on the full-train rows.
df_full_train.number_of_courses_viewed.corr(df_full_train.lead_score)

np.float64(-0.010710522152641658)

In [73]:
# Correlation between number_of_courses_viewed and interaction_count on the full-train rows.
df_full_train.number_of_courses_viewed.corr(df_full_train.interaction_count)

np.float64(-0.026417136351258818)

In [74]:
# Correlation between annual_income and interaction_count on the full-train rows.
df_full_train.annual_income.corr(df_full_train.interaction_count)

np.float64(0.06896871371403927)

In [75]:
from sklearn.metrics import mutual_info_score

In [76]:
# Mutual information between industry and the conversion label (full-train rows).
mutual_info_score(df_full_train['industry'], df_full_train['converted'])

0.008173022583466888

In [77]:
# Mutual information between location and the conversion label (full-train rows).
mutual_info_score(df_full_train['location'], df_full_train['converted'])

0.0012115327428980638

In [78]:
# Mutual information between lead_source and the conversion label (full-train rows).
mutual_info_score(df_full_train['lead_source'], df_full_train['converted'])

0.024561797800259202

In [79]:
# Mutual information between employment_status and the conversion label (full-train rows).
mutual_info_score(df_full_train['employment_status'], df_full_train['converted'])

0.012690204266619348

In [80]:
# Count the missing values in every column of the training partition.
print(df_train.isnull().sum())

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64


In [81]:
# Row counts of the train / validation / test partitions, in that order.
len(df_train), len(df_val), len(df_test)

(876, 293, 293)

In [82]:
# Replace each partition's existing row labels with a fresh 0..n-1 index;
# drop=True discards the old labels instead of keeping them as a column.
df_train, df_val, df_test = (
    partition.reset_index(drop=True)
    for partition in (df_train, df_val, df_test)
)

In [83]:
# Split the target off from each partition. `pop` returns the `converted`
# column and removes it from the frame in the same step, so the feature frames
# no longer carry the label afterwards.
# Running this cell a second time fails because the column is already gone.
y_train = df_train.pop('converted').values
y_val = df_val.pop('converted').values
y_test = df_test.pop('converted').values

In [59]:
# Give the combined train+validation frame a fresh 0..n-1 index, dropping the
# old row labels.
# NOTE(review): this cell's execution count (59) is lower than the import
# cell's (60), which suggests it was last run out of document order. Confirm
# the notebook still runs top to bottom from a fresh kernel.
df_full_train = df_full_train.reset_index(drop=True)

In [84]:
from sklearn.feature_extraction import DictVectorizer

In [85]:
# Convert every row into a {feature_name: value} record, categorical features
# first and numerical ones after.
train_dict = df_train[categorical + numerical].to_dict('records')
val_dict = df_val[categorical + numerical].to_dict('records')

# Dense output (sparse=False) so the matrices are plain NumPy arrays.
# The vectorizer is fitted on the training records only; the validation
# records are transformed with the mapping learned from training.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

In [86]:
# Display the encoded training matrix produced by the vectorizer.
X_train

array([[9.5543e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        2.0000e+00],
       [5.4924e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.0000e+00,
        1.0000e+00],
       [7.7352e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        2.0000e+00],
       ...,
       [7.3702e+04, 0.0000e+00, 0.0000e+00, ..., 1.0000e+00, 0.0000e+00,
        1.0000e+00],
       [9.3341e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00]], shape=(876, 31))

In [87]:
from sklearn.linear_model import LogisticRegression

In [100]:
# Logistic regression with the liblinear solver. C=0.01 is the value used for
# this run; random_state is fixed so repeated fits are reproducible.
model = LogisticRegression(
    solver='liblinear',
    C=0.01,
    max_iter=1000,
    random_state=42,
)

In [101]:
# Fit the model on the encoded training matrix and its labels.
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,0.01
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [90]:
# Intercept (bias term) of the fitted model.
# NOTE(review): this cell's execution count (90) is lower than the fit cell's
# (101), so the displayed value may belong to an earlier fit. Re-run to confirm.
model.intercept_[0]

np.float64(-0.10464329280767921)

In [91]:
# Per-feature coefficients of the fitted model, rounded to 3 decimals.
# NOTE(review): this cell's execution count (91) is lower than the fit cell's
# (101), so the displayed values may belong to an earlier fit. Re-run to confirm.
model.coef_[0].round(3)

array([-0.   , -0.027,  0.035, -0.01 ,  0.014, -0.117, -0.022,  0.033,
       -0.008, -0.023, -0.007, -0.033, -0.027, -0.018,  0.326,  0.032,
        0.   , -0.005, -0.024, -0.112,  0.07 , -0.034,  0.005, -0.012,
       -0.01 , -0.028, -0.013, -0.019, -0.023, -0.005,  0.453])

In [102]:
# Keep only column 1 of predict_proba: the predicted probability of the
# positive class (converted == 1) for each validation row.
y_pred = model.predict_proba(X_val)[:, 1]

In [94]:
# Spot check: predicted probability vs. true label for the first validation row.
# NOTE(review): execution count (94) is lower than the cell that computes
# y_pred (102), so the displayed pair may be stale. Re-run to confirm.
y_pred[0],y_val[0]

(np.float64(0.5753950345682656), np.int64(1))

In [103]:
# Hard decision: predict "converted" when the probability is at least 0.5.
converted_decision = (y_pred >= 0.5)

In [96]:
# Display the boolean decision for every validation row.
# NOTE(review): execution count (96) is lower than the cell that defines
# converted_decision (103), so the displayed array may be stale.
converted_decision

array([ True, False,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True,  True, False, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True, False,
        True,  True, False,  True, False, False,  True,  True, False,
       False,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True, False,

In [104]:
# Validation accuracy: fraction of rows where the thresholded prediction
# matches the true label (True/False compare equal to 1/0).
(y_val == converted_decision).mean()

np.float64(0.6996587030716723)

In [98]:
# Collect the validation predictions in one table: the raw probability, the
# thresholded 0/1 prediction, and the true label.
df_pred = pd.DataFrame({
    'probability': y_pred,
    'prediction': converted_decision.astype(int),
    'actual': y_val,
})

In [99]:
# Flag the rows where the prediction equals the true label; the mean of that
# boolean column is the accuracy.
df_pred['correct'] = df_pred['prediction'].eq(df_pred['actual'])
df_pred['correct'].mean()

np.float64(0.6996587030716723)

In [129]:
# Second model for comparison: the same settings as `model` except C=100.
model2 = LogisticRegression(
    solver='liblinear',
    C=100,
    max_iter=1000,
    random_state=42,
)
model2.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,100
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [130]:
# Positive-class probability (column 1 of predict_proba) from the C=100 model
# for each validation row.
y_pred2 = model2.predict_proba(X_val)[:, 1]

In [131]:
# Hard decision for the C=100 model, using the same 0.5 threshold.
converted_decision2 = (y_pred2 >= 0.5)

In [128]:
# Validation accuracy of the C=100 model.
# NOTE(review): this cell's execution count (128) is lower than the cells that
# fit model2 and compute converted_decision2 (129-131), so the displayed value
# may predate the current model2. Re-run to confirm.
(y_val == converted_decision2).mean()

np.float64(0.6996587030716723)