# **Homework 4: Classification Evaluation Metrics**
Goal: train and evaluate a logistic regression model that predicts the target `converted` — whether or not the client signed up to the platform.

## 1. Import packages

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import (mutual_info_score, accuracy_score,
                             roc_curve, auc, roc_auc_score)
from sklearn.linear_model import LogisticRegression

## 2. Load and prepare data

Load data

In [2]:
# Read the lead-scoring data; `df_raw` is kept untouched and all cleaning
# below is done on the copy `df`.
df_raw = pd.read_csv("data/course_lead_scoring.csv")
df = df_raw.copy()
# Column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   lead_source               1334 non-null   object 
 1   industry                  1328 non-null   object 
 2   number_of_courses_viewed  1462 non-null   int64  
 3   annual_income             1281 non-null   float64
 4   employment_status         1362 non-null   object 
 5   location                  1399 non-null   object 
 6   interaction_count         1462 non-null   int64  
 7   lead_score                1462 non-null   float64
 8   converted                 1462 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 102.9+ KB


In [3]:
# Preview the first rows of the loaded data
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [4]:
# Preview the last rows of the loaded data
df.tail()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
1457,referral,manufacturing,1,,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1
1461,organic_search,finance,3,92855.0,student,north_america,3,0.41,1


Check for missing values per feature

In [5]:
# Class counts of the target; dropna=False makes missing values show up
# as their own row if there are any
df['converted'].value_counts(dropna=False)

converted
1    905
0    557
Name: count, dtype: int64

There are no missing values in the target.

In [6]:
# Absolute number of missing entries in each column
missing = df.isna().sum()

# The same counts expressed as a percentage of all rows
missing_percent = missing.div(len(df)).mul(100).round(2)

# Side-by-side summary of both measures
missing_table = pd.DataFrame({'Missing Values': missing,
                              'Percent': missing_percent})

# Keep only the columns that actually have gaps, most affected first
has_gaps = missing_table['Missing Values'] > 0
df_missing = missing_table[has_gaps].sort_values('Percent', ascending=False)

df_missing

Unnamed: 0,Missing Values,Percent
annual_income,181,12.38
industry,134,9.17
lead_source,128,8.76
employment_status,100,6.84
location,63,4.31


The following columns have some missing values: `annual_income`, `industry`, `lead_source`, `employment_status`, and `location`.

Impute missing values in categorical features with `'NA'` and in numerical features with `0.0`.

In [7]:
# Columns flagged in the missing-value table
missing_cols = df_missing.index

# Object (text) columns get the placeholder 'NA'; every other column gets 0.0
for col in missing_cols:
    placeholder = 'NA' if df[col].dtype.kind == 'O' else 0.0
    df[col] = df[col].fillna(placeholder)

In [8]:
# Re-inspect the first rows after imputation
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [9]:
# Re-inspect the last rows after imputation
df.tail()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
1457,referral,manufacturing,1,0.0,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1
1461,organic_search,finance,3,92855.0,student,north_america,3,0.41,1


Check for duplicates

In [10]:
# Number of rows that are exact duplicates of another row
df.duplicated().sum()

np.int64(0)

No row is duplicated.

## 3. Split the data

The data will be split into training, validation, and test sets with a 60-20-20 split.

In [11]:
# Hold out 20% of all rows as the test set (fixed seed for reproducibility)
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
# 25% of the remaining 80% equals 20% of the full data, giving a 60/20/20 split
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [12]:
# Row counts of the train, validation and test partitions
len(df_train), len(df_val), len(df_test)

(876, 293, 293)

Reset indices

In [13]:
# Replace the shuffled row labels left by the split with a fresh 0..n-1 index
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

Extract target

In [14]:
# Pull the label column out of each partition as a NumPy array
target = 'converted'
y_train, y_val, y_test = (
    part[target].values for part in (df_train, df_val, df_test)
)

Remove the target column from the feature frames

In [15]:
# The label must not remain among the model's input features
df_train, df_val, df_test = (
    part.drop(columns=[target]) for part in (df_train, df_val, df_test)
)

In [20]:
# Confirm that all three partitions expose the same feature columns
sections = [
    ('Training column names:', df_train),
    ('Validation column names:', df_val),
    ('Test column names:', df_test),
]

for position, (heading, frame) in enumerate(sections):
    # Blank line between sections, but not before the first one
    if position:
        print()
    print(heading)
    print(frame.columns.tolist())

Training column names:
['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income', 'employment_status', 'location', 'interaction_count', 'lead_score']

Validation column names:
['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income', 'employment_status', 'location', 'interaction_count', 'lead_score']

Test column names:
['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income', 'employment_status', 'location', 'interaction_count', 'lead_score']


## 4. ROC AUC feature importance

In [24]:
# Rank every numerical feature by the ROC AUC obtained when the raw feature
# values are used directly as the prediction score (training set only).
numeric_train = df_train.select_dtypes(include=['number'])

auc_results = []
for f in numeric_train.columns:
    scores = numeric_train[f].values
    # Stored as `feature_auc`, not `auc`: assigning to `auc` would overwrite the
    # `auc` function imported from sklearn.metrics with a float.
    feature_auc = roc_auc_score(y_train, scores)

    # An AUC below 0.5 means higher feature values go with the negative class;
    # negate the feature so the reported AUC reflects its separating power.
    if feature_auc < 0.5:
        scores = -scores
        feature_auc = roc_auc_score(y_train, scores)

    auc_results.append((f, feature_auc))

# One row per feature, best-separating feature first
auc_df = (
    pd.DataFrame(auc_results, columns=['feature', 'auc'])
    .sort_values('auc', ascending=False)
    .reset_index(drop=True)
)

print("AUC per numerical feature (training set):")
auc_df

AUC per numerical feature (training set):


Unnamed: 0,feature,auc
0,number_of_courses_viewed,0.763568
1,interaction_count,0.73827
2,lead_score,0.614499
3,annual_income,0.551958
