In [228]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [229]:
# Location of the lead-scoring CSV that the rest of the notebook works on.
DATA_URL = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'
df = pd.read_csv(DATA_URL)

In [230]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [231]:
# Number of rows in the dataset.
df.shape[0]

1462

In [232]:
# Same preview, transposed so every column becomes a row and wide frames stay readable.
df.head().transpose()

Unnamed: 0,0,1,2,3,4
lead_source,paid_ads,social_media,events,paid_ads,referral
industry,,retail,healthcare,retail,education
number_of_courses_viewed,1,1,5,2,3
annual_income,79450.0,46992.0,78796.0,83843.0,85012.0
employment_status,unemployed,employed,unemployed,,self_employed
location,south_america,south_america,australia,australia,europe
interaction_count,4,1,3,1,3
lead_score,0.94,0.8,0.69,0.87,0.62
converted,1,0,1,0,1


In [233]:
# Column dtypes: object columns are treated as categorical below, int/float columns as numerical.
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [234]:
# Count the missing values in every column.
df.isna().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [235]:
# Plan for the missing values:
#   - categorical features: replace missing values with 'NA'
#   - numerical features: replace missing values with 0.0
# List the column names so they can be grouped into categorical and numerical features.
df.columns



Index(['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
       'employment_status', 'location', 'interaction_count', 'lead_score',
       'converted'],
      dtype='object')

In [236]:
# Feature columns grouped by type. The target column `converted` is in neither list.
categorical = [
    'lead_source',
    'industry',
    'employment_status',
    'location',
]

numerical = [
    'number_of_courses_viewed',
    'annual_income',
    'interaction_count',
    'lead_score',
]

In [237]:
# Missing values per categorical feature.
df[categorical].isna().sum()

lead_source          128
industry             134
employment_status    100
location              63
dtype: int64

In [238]:
# Missing values per numerical feature.
df[numerical].isna().sum()

number_of_courses_viewed      0
annual_income               181
interaction_count             0
lead_score                    0
dtype: int64

In [239]:
# Build the filled versions first: 'NA' for categorical nulls, 0.0 for numerical nulls.
categorical_filled = df[categorical].fillna('NA')
numerical_filled = df[numerical].fillna(0.0)

# Write them back into the frame.
df[categorical] = categorical_filled
df[numerical] = numerical_filled

# Sanity check: every count printed below should now be zero.
for column_group in (categorical, numerical):
    print(df[column_group].isnull().sum())


lead_source          0
industry             0
employment_status    0
location             0
dtype: int64
number_of_courses_viewed    0
annual_income               0
interaction_count           0
lead_score                  0
dtype: int64


In [240]:
# Question 1
# What is the most frequent observation (mode) for the column industry?
# mode() returns a Series (several values can tie for most frequent); take its first entry.
df['industry'].mode()[0]


'retail'

In [241]:

# Correlation matrix of the numerical features.
corr_matrix = df[numerical].corr()

print("Correlation Matrix:")
print(corr_matrix)

# Feature pairs whose correlation is reported individually.
pairs = [
    ("interaction_count", "lead_score"),
    ("number_of_courses_viewed", "lead_score"),
    ("number_of_courses_viewed", "interaction_count"),
    ("annual_income", "interaction_count"),
]

# Look each pair up in the matrix and emit one "a vs b: value" line per pair.
pair_lines = [
    f"{first} vs {second}: {corr_matrix.loc[first, second]}"
    for first, second in pairs
]

print("\nSelected Pairs Correlation:")
print("\n".join(pair_lines))


Correlation Matrix:
                          number_of_courses_viewed  annual_income  \
number_of_courses_viewed                  1.000000       0.009770   
annual_income                             0.009770       1.000000   
interaction_count                        -0.023565       0.027036   
lead_score                               -0.004879       0.015610   

                          interaction_count  lead_score  
number_of_courses_viewed          -0.023565   -0.004879  
annual_income                      0.027036    0.015610  
interaction_count                  1.000000    0.009888  
lead_score                         0.009888    1.000000  

Selected Pairs Correlation:
interaction_count vs lead_score: 0.009888182496913131
number_of_courses_viewed vs lead_score: -0.004878998354681276
number_of_courses_viewed vs interaction_count: -0.023565222882888037
annual_income vs interaction_count: 0.02703647240481443


In [98]:
# Among the selected pairs, annual_income and interaction_count have the biggest correlation (about 0.027).

In [242]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; the rest becomes the full training set.
df_train_full, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)


In [243]:
# Split the full training set again: 25% of it becomes the validation set,
# which makes validation the same size as the 20% test hold-out.
df_train, df_val = train_test_split(
    df_train_full,
    test_size=0.25,
    random_state=42,
)


In [244]:
# Row counts, printed in this order: full train, test, validation, train.
for split_frame in (df_train_full, df_test, df_val, df_train):
    print(len(split_frame))

1169
293
293
876


In [245]:
# Target arrays for the train and validation splits.
y_train = df_train['converted'].to_numpy()
y_val = df_val['converted'].to_numpy()


In [246]:
# Remove the target column from both feature frames so it cannot be used as an input.
# NOTE: not idempotent; re-running this cell after it has succeeded raises KeyError.
for feature_frame in (df_train, df_val):
    del feature_frame['converted']

In [248]:
from sklearn.metrics import mutual_info_score
def calculate_mi(series):
    """Return the mutual information score between `series` and `df_train_full.converted`.

    The target is read from the module-level `df_train_full`, so `series` is
    expected to be a column of that same frame.
    """
    target = df_train_full.converted
    return mutual_info_score(series, target)


# Score every categorical column of df_train_full against the target.
df_mi = df_train_full[categorical].apply(calculate_mi)
# Rounded to two decimals for display only; df_mi keeps the full-precision scores.
df_mi.round(2)


lead_source          0.03
industry             0.01
employment_status    0.01
location             0.00
dtype: float64

In [None]:
# The largest value is 0.025665 (shown as 0.03 after rounding), for lead_source.

# Answer: The variable with the biggest mutual information score is lead_source.

In [249]:
from sklearn.feature_extraction import DictVectorizer
# All model inputs: categorical columns first, then numerical ones.
feature_columns = categorical + numerical

# Turn every row into a {column: value} dict for the vectorizer.
train_dicts = df_train[feature_columns].to_dict(orient='records')
val_dicts = df_val[feature_columns].to_dict(orient='records')

# The vectorizer is fitted on the training rows only and then applied to validation.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)


In [250]:
from sklearn.linear_model import LogisticRegression
# Logistic regression on the vectorized training data.
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [263]:
# Predicted probability of the second class (column 1 of predict_proba) for each validation row.
y_pred = model.predict_proba(X_val)[:, 1]
# Hard predictions: a row counts as converted when its probability exceeds 0.5.
converted = (y_pred > 0.5)

# Validation accuracy = share of rows whose prediction matches the label.
# It has to be assigned here: previously the value was computed and discarded,
# so the print relied on an `acc` left over from another cell (NameError on a fresh kernel).
acc = (y_val == converted).mean()
print(round(acc, 2))


0.75


In [268]:
# Function to train & evaluate
def train_and_eval(features):
    """Train a logistic regression on `features` and return its validation accuracy.

    Uses the module-level `df_train`, `df_val`, `y_train` and `y_val`.

    Parameters
    ----------
    features
        Column names to select from `df_train` and `df_val`; this notebook
        passes lists of column names.

    Returns
    -------
    The fraction of validation rows for which the prediction (probability
    thresholded at 0.5) equals the corresponding entry of `y_val`.
    """
    # Fit the vectorizer on the training rows only, then reuse it for validation.
    vectorizer = DictVectorizer(sparse=False)
    train_matrix = vectorizer.fit_transform(df_train[features].to_dict(orient='records'))
    val_matrix = vectorizer.transform(df_val[features].to_dict(orient='records'))

    clf = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    clf.fit(train_matrix, y_train)

    # Column 1 of predict_proba, thresholded at 0.5, gives the binary prediction.
    predicted_converted = clf.predict_proba(val_matrix)[:, 1] > 0.5
    return (y_val == predicted_converted).mean()

# Baseline with all features.
# The loop below compares against `acc_full`, which was never assigned (NameError on a
# fresh kernel), so bind it here; `accuracy` is kept as an alias of the same value.
all_features = categorical + numerical
accuracy = acc_full = train_and_eval(all_features)

# Evaluate feature elimination: retrain without one feature at a time and report
# how much the validation accuracy changes (baseline minus reduced model).
# A positive diff means removing the feature hurt accuracy; a negative one means it helped.
for f in ['industry', 'employment_status', 'lead_score']:
    reduced_features = [col for col in all_features if col != f]
    acc_removed = train_and_eval(reduced_features)
    diff = acc_full - acc_removed
    print(f"{f} | Accuracy diff: {diff}")


industry | Accuracy diff: 0.0
employment_status | Accuracy diff: -0.0068259385665528916
lead_score | Accuracy diff: -0.0034129692832763903


In [279]:
# Values of C to compare, in ascending order.
C_values = [0.01, 0.1, 1, 10, 100]
scores = {}  # C -> validation accuracy

for C in C_values:
    # Same model settings as before; only C changes between runs.
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    # Threshold column 1 of predict_proba at 0.5 to get binary predictions.
    y_pred = model.predict_proba(X_val)[:, 1]
    converted = y_pred > 0.5

    # Share of validation rows predicted correctly, recorded for this C.
    acc = scores[C] = (converted == y_val).mean()

    print(f"C={C:<6} -> Validation Accuracy: {round(acc, 3)}")

# max() returns the first key among ties, i.e. the smallest C because C_values is ascending.
best_C = max(scores, key=lambda c_value: scores[c_value])
print("\nBest C:", best_C, "with accuracy:", round(scores[best_C], 3))




C=0.01   -> Validation Accuracy: 0.758
C=0.1    -> Validation Accuracy: 0.754
C=1      -> Validation Accuracy: 0.747
C=10     -> Validation Accuracy: 0.747
C=100    -> Validation Accuracy: 0.747

Best C: 0.01 with accuracy: 0.758
