# **Homework 3: Classification**
Goal: train a logistic regression model to predict the target `converted`.

## 1. Import packages

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.linear_model import LogisticRegression

## 2. Load and prepare data

Load data

In [2]:
# Read the CSV once into df_raw and leave it untouched;
# all later cleaning is applied to the working copy df.
df_raw = pd.read_csv("data/course_lead_scoring.csv")
df = df_raw.copy(deep=True)

# Column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   lead_source               1334 non-null   object 
 1   industry                  1328 non-null   object 
 2   number_of_courses_viewed  1462 non-null   int64  
 3   annual_income             1281 non-null   float64
 4   employment_status         1362 non-null   object 
 5   location                  1399 non-null   object 
 6   interaction_count         1462 non-null   int64  
 7   lead_score                1462 non-null   float64
 8   converted                 1462 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 102.9+ KB


In [3]:
# First five rows of the working frame
df.head(n=5)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [4]:
# Last five rows of the working frame
df.tail(n=5)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
1457,referral,manufacturing,1,,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1
1461,organic_search,finance,3,92855.0,student,north_america,3,0.41,1


Check for missing values per feature

In [5]:
# Number of missing entries in each column
missing = df.isna().sum()

# Same counts expressed as a percentage of all rows (2 decimals)
missing_percent = (missing / len(df) * 100).round(2)

# Side-by-side table of counts and percentages
missing_table = pd.DataFrame({
    'Missing Values': missing,
    'Percent': missing_percent,
})

# Keep only the columns that actually have gaps, worst first
has_missing = missing_table['Missing Values'] > 0
df_missing = missing_table.loc[has_missing].sort_values('Percent', ascending=False)

df_missing

Unnamed: 0,Missing Values,Percent
annual_income,181,12.38
industry,134,9.17
lead_source,128,8.76
employment_status,100,6.84
location,63,4.31


Impute missing values: fill categorical features with `'NA'` and numerical features with `0`.

In [6]:
# Columns that contain at least one missing value
missing_cols = df_missing.index

# Object (categorical) columns get the 'NA' placeholder,
# every other column is filled with 0.0
for col in missing_cols:
    fill_value = 'NA' if df[col].dtype.kind == 'O' else 0.0
    df[col] = df[col].fillna(fill_value)


In [7]:
# First five rows after imputation
df.head(n=5)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [8]:
# Last five rows after imputation
df.tail(n=5)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
1457,referral,manufacturing,1,0.0,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1
1461,organic_search,finance,3,92855.0,student,north_america,3,0.41,1


Most frequent value (mode) of `industry`

In [9]:
# Mode of the industry column; dropna=False lets missing values count as a candidate
df.loc[:, 'industry'].mode(dropna=False)

0    retail
Name: industry, dtype: object

## 3. Setting up the validation framework

Split the data into train, validation, and test sets, 60-20-20 split

In [10]:
# Step 1: hold out 20% of all rows as the test set
df_full_train, df_test = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)

# Step 2: 25% of the remaining 80% (= 20% of all rows) becomes the validation set
df_train, df_val = train_test_split(
    df_full_train,
    random_state=42,
    test_size=0.25,
)

In [11]:
# Row counts of the train / validation / test partitions
tuple(len(part) for part in (df_train, df_val, df_test))

(876, 293, 293)

Reset indices

In [12]:
# Replace the shuffled row labels of each partition with a fresh 0..n-1 index
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

Extract target

In [13]:
# Pull the target column out of each partition as a NumPy array
y_train, y_val, y_test = (
    part['converted'].values for part in (df_train, df_val, df_test)
)

Remove target from among features

In [14]:
# Remove the target column in place so it cannot be used as a feature
for _frame in (df_train, df_val, df_test):
    del _frame['converted']

Check for duplicates

In [15]:
# Number of rows that exactly repeat an earlier row
df.duplicated(keep='first').sum()

np.int64(0)

## 4. Correlation
Between:

- `interaction_count` and `lead_score`
- `number_of_courses_viewed` and `lead_score`
- `number_of_courses_viewed` and `interaction_count`
- `annual_income` and `interaction_count`

In [16]:
# Numerical feature pairs whose Pearson correlation is compared
pairs = [
    ("interaction_count", "lead_score"),
    ("number_of_courses_viewed", "lead_score"),
    ("number_of_courses_viewed", "interaction_count"),
    ("annual_income", "interaction_count"),
]

# Pearson correlation of each pair, rounded to 3 decimals and keyed by "a - b"
corrs = {}
for a, b in pairs:
    pair_matrix = df[[a, b]].corr(method="pearson")
    # Off-diagonal element of the 2x2 matrix is the correlation between a and b
    corrs[f"{a} - {b}"] = round(pair_matrix.iloc[0, 1], 3)

# Rank pairs by absolute correlation and report the strongest value
corr_series = pd.Series(corrs).dropna()
strongest_pair = corr_series.abs().idxmax()
print(corr_series.sort_values(key=np.abs, ascending=False))
print("\nBiggest correlation:", round(corr_series[strongest_pair], 3))


annual_income - interaction_count               0.027
number_of_courses_viewed - interaction_count   -0.024
interaction_count - lead_score                  0.010
number_of_courses_viewed - lead_score          -0.005
dtype: float64

Biggest correlation: 0.027


## 5. Feature importance: Mutual Information
Between the target `converted` and the categorical features, computed on the training set:
- `industry`
- `location`
- `lead_source`
- `employment_status`

In [17]:
# Helper function for mutual information computation
def mutual_info_converted_score(features_series, target_series=None):
    """Return the mutual information score between one feature and the target.

    Parameters
    ----------
    features_series : array-like
        Values of a single feature column.
    target_series : array-like, optional
        Target labels to score against. When omitted, the notebook-level
        ``y_train`` is used. It is looked up at call time rather than frozen
        as a default at definition time, so re-running the train/validation
        split without re-running this cell cannot leave the helper scoring
        against stale labels.

    Returns
    -------
    float
        The value returned by ``mutual_info_score(features_series, target_series)``.
    """
    if target_series is None:
        target_series = y_train
    return mutual_info_score(features_series, target_series)

In [18]:
# Categorical features to score against the target
categories_list = ['industry', 'location', 'lead_source', 'employment_status']

# Apply the helper column by column on the training set, then rank highest first
categorical_train = df_train[categories_list]
mi = categorical_train.apply(mutual_info_converted_score)
mi.sort_values(ascending=False)

lead_source          0.035396
employment_status    0.012938
industry             0.011575
location             0.004464
dtype: float64

## 6. One-hot encoding

In [19]:
# Feature groups fed to the model
numeric_list = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']
categories_list = ['industry', 'location', 'lead_source', 'employment_status']
feature_cols = categories_list + numeric_list

# Vectorizer configured to return a dense array
dv = DictVectorizer(sparse=False)

# Fit the encoding on the training rows only
train_dict = df_train[feature_cols].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

# Reuse the fitted encoding for the validation rows
val_dict = df_val[feature_cols].to_dict(orient='records')
X_val = dv.transform(val_dict)

## 7. Fit a logistic regression

In [20]:
# Settings of the baseline logistic regression
lr_params = dict(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model = LogisticRegression(**lr_params)

# Train on the encoded training matrix
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


## 8. Evaluate the model on validation set

In [21]:
# Second column of predict_proba for every validation row
y_pred = model.predict_proba(X_val)[:, 1]

# Hard decision at the 0.5 cut-off
converted_decision = y_pred >= 0.5

# Share of validation rows whose decision matches the label, rounded to 2 decimals
is_correct = converted_decision == y_val
accuracy = round(is_correct.mean(), 2)
accuracy

np.float64(0.7)

## 9. Feature selection

In [27]:
# Helper function to train, predict, and compute accuracy
def eval_accuracy(drop_cols=None, C=1.0, threshold=0.5):
    """Train a logistic regression without ``drop_cols`` and return validation accuracy.

    The model is fitted on ``df_train`` / ``y_train`` and scored on
    ``df_val`` / ``y_val`` using the features named in ``categories_list`` and
    ``numeric_list`` (all notebook-level variables), minus ``drop_cols``.

    Parameters
    ----------
    drop_cols : str or iterable of str, optional
        Feature name(s) to leave out. ``None`` (default) keeps every feature.
        A single name may be given as a plain string.
    C : float, default 1.0
        Value passed to ``LogisticRegression(C=...)``.
    threshold : float, default 0.5
        Probability cut-off at or above which a row is predicted as converted.

    Returns
    -------
    float
        Fraction of validation rows whose thresholded prediction equals ``y_val``.

    Raises
    ------
    ValueError
        If ``drop_cols`` names a column that is in neither ``categories_list``
        nor ``numeric_list``.
    """
    if isinstance(drop_cols, str):
        # set('industry') would yield single characters and silently drop nothing
        drop_cols = [drop_cols]
    drop_cols = set(drop_cols or [])

    known_cols = categories_list + numeric_list

    # A misspelled name would otherwise return the baseline accuracy unnoticed
    unknown = drop_cols.difference(known_cols)
    if unknown:
        raise ValueError(
            f"Unknown feature(s) in drop_cols: {sorted(map(str, unknown))}"
        )

    # Categorical columns first, then numerical, minus the dropped ones
    cols = [c for c in known_cols if c not in drop_cols]

    # Vectorize (fit on train, reuse on val)
    vectorizer = DictVectorizer(sparse=False)
    X_trn = vectorizer.fit_transform(df_train[cols].to_dict(orient="records"))
    X_vl = vectorizer.transform(df_val[cols].to_dict(orient="records"))

    # Model (train on same feature space)
    lr = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    lr.fit(X_trn, y_train)

    # Second column of predict_proba for each validation row
    y_proba = lr.predict_proba(X_vl)[:, 1]

    # Threshold & accuracy
    y_hat = (y_proba >= threshold)
    return (y_val == y_hat).mean()

Compute the baseline accuracy and the accuracy difference when each feature is dropped

In [28]:
# Accuracy with every feature included
base_accu = eval_accuracy()

# Retrain with one feature removed at a time and record
# (feature, accuracy, baseline accuracy - accuracy)
features_to_test = ['industry', 'employment_status', 'lead_score']
dropped_accu = [eval_accuracy(drop_cols=[name]) for name in features_to_test]
results = [
    (name, score, base_accu - score)
    for name, score in zip(features_to_test, dropped_accu)
]

print(f"Baseline accuracy: {base_accu:.4f}")
for f, acc, delta in results:
    print(f"Drop {f:>18}: acc={acc:.4f}  Δacc={delta:+.4f}")

Baseline accuracy: 0.6997
Drop           industry: acc=0.6997  Δacc=+0.0000
Drop  employment_status: acc=0.6962  Δacc=+0.0034
Drop         lead_score: acc=0.7065  Δacc=-0.0068
