In [293]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [294]:
data = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv')
data.head()


Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [295]:
data.describe()

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
count,1462.0,1281.0,1462.0,1462.0,1462.0
mean,2.031464,59886.273224,2.976744,0.506108,0.619015
std,1.449717,15070.140389,1.681564,0.288465,0.485795
min,0.0,13929.0,0.0,0.0,0.0
25%,1.0,49698.0,2.0,0.2625,0.0
50%,2.0,60148.0,3.0,0.51,1.0
75%,3.0,69639.0,4.0,0.75,1.0
max,9.0,109899.0,11.0,1.0,1.0


In [296]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   lead_source               1334 non-null   object 
 1   industry                  1328 non-null   object 
 2   number_of_courses_viewed  1462 non-null   int64  
 3   annual_income             1281 non-null   float64
 4   employment_status         1362 non-null   object 
 5   location                  1399 non-null   object 
 6   interaction_count         1462 non-null   int64  
 7   lead_score                1462 non-null   float64
 8   converted                 1462 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 102.9+ KB


# spliting columns into target categorical and numerical


In [297]:
target= 'converted'
categorical = ['employment_status','lead_source', 'industry','location']
numerical = ['number_of_courses_viewed', 'annual_income', 'interaction_count','lead_score']


In [298]:
data[numerical].head()

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
0,1,79450.0,4,0.94
1,1,46992.0,1,0.8
2,5,78796.0,3,0.69
3,2,83843.0,1,0.87
4,3,85012.0,3,0.62


In [299]:
data[categorical].head()

Unnamed: 0,employment_status,lead_source,industry,location
0,unemployed,paid_ads,,south_america
1,employed,social_media,retail,south_america
2,unemployed,events,healthcare,australia
3,,paid_ads,retail,australia
4,self_employed,referral,education,europe


# filling missing values with NA and 0

In [300]:
data[categorical].isnull().sum(),data[numerical].isnull().sum()


(employment_status    100
 lead_source          128
 industry             134
 location              63
 dtype: int64,
 number_of_courses_viewed      0
 annual_income               181
 interaction_count             0
 lead_score                    0
 dtype: int64)

In [301]:
data[categorical] = data[categorical].fillna("NA")

data[numerical] = data[numerical].fillna(0.0)


In [302]:
data.industry.value_counts()

industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

In [303]:
for column in numerical:
    print(data[numerical].corrwith(data[column]).to_frame('correlation').max)

        


<bound method DataFrame.max of                           correlation
number_of_courses_viewed     1.000000
annual_income                0.009770
interaction_count           -0.023565
lead_score                  -0.004879>
<bound method DataFrame.max of                           correlation
number_of_courses_viewed     0.009770
annual_income                1.000000
interaction_count            0.027036
lead_score                   0.015610>
<bound method DataFrame.max of                           correlation
number_of_courses_viewed    -0.023565
annual_income                0.027036
interaction_count            1.000000
lead_score                   0.009888>
<bound method DataFrame.max of                           correlation
number_of_courses_viewed    -0.004879
annual_income                0.015610
interaction_count            0.009888
lead_score                   1.000000>


# splitting data


In [304]:
from sklearn.model_selection import train_test_split
train_val, test = train_test_split(data, test_size=0.2, random_state=42, stratify=data[target])
train, val = train_test_split(train_val, test_size=0.25, random_state=42, stratify=train_val[target])
len(train), len(val), len(test)


# train , val , test
# X= train - y, y= train[target]



(876, 293, 293)

In [305]:
y= train[target]
X= train.drop(columns=[target])
y_val= val[target]
X_val= val.drop(columns=[target])
y_test= test[target]
X_test= test.drop(columns=[target])

del val[target]
del train[target]
X.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score
1015,social_media,other,4,53556.0,self_employed,africa,2,0.18
693,referral,healthcare,3,66872.0,unemployed,,3,0.03
141,events,healthcare,3,60375.0,student,australia,9,0.3
1206,,manufacturing,3,51271.0,unemployed,europe,3,0.6
529,social_media,healthcare,1,57537.0,student,south_america,2,0.28


In [306]:
from sklearn.metrics import mutual_info_score


In [307]:
def mutual_info_churn_score(series):
    return mutual_info_score(series, y)


In [308]:
mi = train[categorical].apply(mutual_info_churn_score).round(2)
mi.sort_values(ascending=False)

lead_source          0.03
employment_status    0.01
industry             0.01
location             0.00
dtype: float64

# one hot encoding

In [309]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression


In [310]:

dv = DictVectorizer(sparse=False)

train_dict = train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

X_test = dv.transform(test[categorical + numerical].to_dict(orient='records'))


In [None]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y)


In [312]:
model.intercept_[0]


np.float64(-0.09245220977348707)

In [313]:
model.coef_[0].round(3)


array([-0.   , -0.001,  0.016, -0.001, -0.005, -0.101, -0.017,  0.027,
       -0.012, -0.012, -0.001, -0.028, -0.039, -0.01 ,  0.301,  0.052,
       -0.007, -0.041, -0.019, -0.098,  0.086, -0.013, -0.   , -0.013,
       -0.011, -0.016, -0.011,  0.002, -0.022, -0.022,  0.446])

In [314]:
y_pred = model.predict_proba(X_val)[:, 1]
converted_decision = (y_pred >= 0.5)
(y_val == converted_decision).mean().round(2)



np.float64(0.73)

In [320]:
# Original feature set and model parameters
features = categorical + numerical
model_params = dict(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

# Train model with all features
dv_all = DictVectorizer(sparse=False)
X_train_all = dv_all.fit_transform(train[features].to_dict(orient='records'))
X_val_all = dv_all.transform(val[features].to_dict(orient='records'))

model_all = LogisticRegression(**model_params)
model_all.fit(X_train_all, y)
y_pred_all = model_all.predict_proba(X_val_all)[:, 1]
acc_all = ((y_pred_all >= 0.5) == y_val).mean()

# Store accuracy drops
accuracy_drop = {}

for f in features:
    features_wo = [col for col in features if col != f]
    dv_wo = DictVectorizer(sparse=False)
    X_train_wo = dv_wo.fit_transform(train[features_wo].to_dict(orient='records'))
    X_val_wo = dv_wo.transform(val[features_wo].to_dict(orient='records'))

    model_wo = LogisticRegression(**model_params)
    model_wo.fit(X_train_wo, y)
    y_pred_wo = model_wo.predict_proba(X_val_wo)[:, 1]
    acc_wo = ((y_pred_wo >= 0.5) == y_val).mean()
    accuracy_drop[f] = acc_all - acc_wo

# Show results
print("Original accuracy:", acc_all)
print("Accuracy drop when excluding each feature:")
for f, drop in accuracy_drop.items():
    print(f"{f}: {drop:.4f}")

Original accuracy: 0.7303754266211604
Accuracy drop when excluding each feature:
employment_status: -0.0034
lead_source: -0.0068
industry: 0.0000
location: 0.0000
number_of_courses_viewed: 0.1092
annual_income: -0.1331
interaction_count: 0.1126
lead_score: 0.0000


In [328]:
C_values = [0.01, 0.1, 1, 10, 100]
accuracies = []

for C in C_values:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train, y)
    y_pred_val = model.predict_proba(X_val)[:, 1]
    acc = ((y_pred_val >= 0.5) == y_val).mean()
    accuracies.append(round(acc, 10))

for C, acc in zip(C_values, accuracies):
    print(f"C={C}: Validation accuracy = {acc}")

print("Test set evaluation with best C:")
best_C = C_values[np.argmax(accuracies)]
print(f"Best C: {best_C}")


C=0.01: Validation accuracy = 0.7337883959
C=0.1: Validation accuracy = 0.7303754266
C=1: Validation accuracy = 0.7303754266
C=10: Validation accuracy = 0.7303754266
C=100: Validation accuracy = 0.7303754266
Test set evaluation with best C:
Best C: 0.01
