In [47]:
# import libraries for data manipulation
import numpy as np
import pandas as pd

# import libraries for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# disable pop_up display of graphs to display them in notebook.
%matplotlib inline

from sklearn.model_selection import train_test_split

# mutual information scorer
from sklearn.metrics import mutual_info_score

# vectorizer for dictionaries
from sklearn.feature_extraction import DictVectorizer

# Logistic Regression Model
from sklearn.linear_model import LogisticRegression

In [2]:
data = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"
!wget $data

--2025-10-14 13:54:37--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8001::154, 2606:50c0:8002::154, 2606:50c0:8003::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8001::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv.1’


2025-10-14 13:54:39 (703 KB/s) - ‘course_lead_scoring.csv.1’ saved [80876/80876]



In [3]:
# load the CSV file from the working directory into a DataFrame
df = pd.read_csv("course_lead_scoring.csv")

In [4]:
# dataset dimensions as a (rows, columns) tuple
df.shape

(1462, 9)

In [6]:
# preview the first 5 records to see the columns and typical values
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [7]:
# per-column dtype and non-null count; columns with fewer non-null entries
# than rows contain missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   lead_source               1334 non-null   object 
 1   industry                  1328 non-null   object 
 2   number_of_courses_viewed  1462 non-null   int64  
 3   annual_income             1281 non-null   float64
 4   employment_status         1362 non-null   object 
 5   location                  1399 non-null   object 
 6   interaction_count         1462 non-null   int64  
 7   lead_score                1462 non-null   float64
 8   converted                 1462 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 102.9+ KB


## Data Preparation

### Missing Value Handling
* Check whether any features contain missing values.
* If there are missing values:
    * For categorical features, replace them with 'NA'
    * For numerical features, replace them with 0.0

In [9]:
# number of missing (NaN) values in each column
df.isna().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [10]:
# split the column names by dtype: object columns are treated as categorical,
# every other column as numeric
is_object = df.dtypes == 'object'
cat_cols = df.columns[is_object].tolist()
num_cols = df.columns[~is_object].tolist()

In [None]:
# replace missing values in all numeric columns with 0 in a single assignment
df[num_cols] = df[num_cols].fillna(0)

In [13]:
# replace missing values in all categorical columns with the placeholder 'NA'
df[cat_cols] = df[cat_cols].fillna('NA')

In [21]:
# re-count missing values per column after the fillna steps
df.isna().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

### Question 1

What is the most frequent observation (mode) for the column `industry`?
- retail


In [14]:
# most frequent value(s) of the `industry` column
df['industry'].mode()

0    retail
Name: industry, dtype: object

### Question 2

Create the [correlation matrix](https://www.google.com/search?q=correlation+matrix) for the numerical features of your dataset. 
In a correlation matrix, you compute the correlation coefficient between every pair of features.

What are the two features that have the biggest correlation?
- annual_income and interaction_count w/ `0.027036`

In [20]:
# pairwise Pearson correlation of the first four numeric columns;
# the [:4] slice leaves out the last numeric column, the target `converted`
df[num_cols[:4]].corr(method='pearson')

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879
annual_income,0.00977,1.0,0.027036,0.01561
interaction_count,-0.023565,0.027036,1.0,0.009888
lead_score,-0.004879,0.01561,0.009888,1.0


## Split the data
* Split your data in train/val/test sets with 60%/20%/20% distribution.
* Use Scikit-Learn for that (the `train_test_split` function) and set the seed to `42`.
* Make sure that the target value `y` is not in your dataframe.

In [22]:
# hold out 20% of the rows as the test set
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

# 0.25 of the remaining 80% is 20% of the full dataset, giving the 60%/20%/20% split
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [23]:
# give every split a fresh 0..n-1 index
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# pop() removes `converted` from the feature frame and returns it, so the
# target is extracted and dropped from the features in a single step
y_train = df_train.pop('converted').values
y_val = df_val.pop('converted').values
y_test = df_test.pop('converted').values

### Question 3

* Calculate the mutual information score between `y` and other categorical variables in the dataset. Use the training set only.
* Round the scores to 2 decimals using `round(score, 2)`.

Which variable has the biggest mutual information score?
- lead_source

In [31]:
def calculate_mi(series, target=None):
    """
    Calculate the mutual information score between a feature and the target.

    Params:
        series: the feature vector
        target: target values aligned row by row with `series`; when omitted,
            the `converted` column of the global `df_full_train` is used
    Returns:
        float: mutual information score rounded to 2 decimal points
    """
    if target is None:
        # keep the original behavior for callers that pass only the feature
        target = df_full_train['converted']
    return round(mutual_info_score(series, target), 2)

In [32]:
# get mutual information score between each categorical feature and the target,
# using the training split only (df_train / y_train), as the question requires
df_mi = df_train[cat_cols].apply(lambda col: round(mutual_info_score(col, y_train), 2))
# sort mi scores and create a DataFrame from them
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')

df_mi

### Question 4

* Now let's train a logistic regression.
* Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
* Fit the model on the training dataset.
    - To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
    - `model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)`

Calculate the accuracy on the validation dataset and round it to 2 decimal digits.
- 0.74


#### One-hot encoding

In [41]:
# one dict per training row (feature name -> value), the input format of DictVectorizer
train_dict = df_train.loc[:, cat_cols + num_cols[:4]].to_dict('records')
train_dict[0]

{'lead_source': 'paid_ads',
 'industry': 'retail',
 'employment_status': 'student',
 'location': 'middle_east',
 'number_of_courses_viewed': 0,
 'annual_income': 58472.0,
 'interaction_count': 5,
 'lead_score': 0.03}

In [48]:
# one dict per validation row (feature name -> value), built from the same columns as train_dict
val_dict = df_val.loc[:, cat_cols + num_cols[:4]].to_dict('records')
val_dict[0]

{'lead_source': 'paid_ads',
 'industry': 'healthcare',
 'employment_status': 'unemployed',
 'location': 'europe',
 'number_of_courses_viewed': 3,
 'annual_income': 52220.0,
 'interaction_count': 1,
 'lead_score': 0.07}

In [64]:
# sparse=False, return standard numpy array with 0's and 1's
dv = DictVectorizer(sparse=False)
# fit on the training records only: fit() replaces whatever was learned before, so a
# second fit on the validation records would discard the training vocabulary and
# define the feature columns from validation data
dv.fit(train_dict)

0,1,2
,dtype,<class 'numpy.float64'>
,separator,'='
,sparse,False
,sort,True


In [74]:
# encode both splits with the same fitted vectorizer
X_train = dv.transform(train_dict)
X_val = dv.transform(val_dict)


#### Train Logistic Regression

In [75]:
# parameters are fixed to the values given in the exercise so results are reproducible
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
# train on training data
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [69]:
# Training set
# predict_proba returns one column per class; column 1 holds the class-1 probability
y_pred_train = model.predict_proba(X_train)[:,1] # get only class 1 predictions
# set threshold to >= 0.5 and get classification decisions (boolean array)
converted_train = (y_pred_train >= 0.5)

In [76]:
# Validation set
# predict_proba returns one column per class; column 1 holds the class-1 probability
y_pred_val = model.predict_proba(X_val)[:,1] # get only class 1 predictions
# set threshold to >= 0.5 and get classification decisions (boolean array)
converted_val = (y_pred_val >= 0.5)

#### Get Accuracy

In [82]:
# cast the boolean decisions to 0/1 inline so they are compared with the integer
# labels; astype() returns a new array, so calling it without using the result
# has no effect
# accuracy = share of rows where the decision matches the actual label
train_ac = (y_train == converted_train.astype(int)).mean()
val_ac = (y_val == converted_val.astype(int)).mean()

print(f"training accuracy: {train_ac:.2f}\nvalidation_accuracy: {val_ac:.2f}")

training accuracy: 0.50
validation_accuracy: 0.70


### Question 5 

* Let's find the least useful feature using the *feature elimination* technique.
* Train a model using the same features and parameters as in Q4 (without rounding).
* Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
* For each feature, calculate the difference between the original accuracy and the accuracy without the feature. 

Which of the following features has the smallest difference?
- `'industry' = 0.0034129692832763903`

In [113]:
def log_regression(df_train, df_val, cols, y_train, y_val, C=1.0):
    """
    Train a logistic regression on the selected columns and return its validation accuracy.

    Params:
        df_train: training DataFrame containing at least the columns in `cols`
        df_val: validation DataFrame containing at least the columns in `cols`
        cols: list of feature column names to use
        y_train: target values aligned with the rows of df_train
        y_val: target values aligned with the rows of df_val
        C: inverse regularization strength passed to LogisticRegression
    Returns:
        float: share of validation rows whose thresholded (>= 0.5) prediction equals y_val
    """
    train_dict = df_train[cols].to_dict(orient='records')
    val_dict = df_val[cols].to_dict(orient='records')

    # Learn the one-hot vocabulary from the training rows only and reuse it for
    # validation. Re-fitting on the validation rows could produce a different set
    # or order of columns than the one the model was trained on.
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(train_dict)
    X_val = dv.transform(val_dict)

    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    # probability of class 1, thresholded at 0.5 to get the classification decision
    y_pred_val = model.predict_proba(X_val)[:, 1]
    converted_val = (y_pred_val >= 0.5)

    val_ac = (y_val == converted_val.astype(int)).mean()

    return val_ac

In [101]:
# per-feature accuracy differences, filled in by the elimination loop
accuracy = {}
# all categorical columns plus the first four numeric columns
# (the slice leaves out the last numeric column, the target `converted`)
all_cols = cat_cols+num_cols[:4]
# baseline: validation accuracy of the model trained on every feature
og_accuracy = log_regression(df_train, df_val, all_cols, y_train, y_val)

In [110]:
# baseline validation accuracy (unrounded) that each ablated model is compared against
og_accuracy

np.float64(0.6996587030716723)

In [109]:
# leave-one-out ablation: retrain without each feature in turn and record how far
# the validation accuracy moves (absolute value) from the all-features baseline
for dropped in all_cols:
    kept = [col for col in all_cols if col != dropped]
    acc_without = log_regression(df_train, df_val, kept, y_train, y_val)
    accuracy[dropped] = abs(og_accuracy - acc_without)

accuracy

{'lead_source': np.float64(0.0068259385665528916),
 'employment_status': np.float64(0.0068259385665528916),
 'industry': np.float64(0.0034129692832763903),
 'location': np.float64(0.0034129692832765013),
 'number_of_courses_viewed': np.float64(0.13651877133105794),
 'annual_income': np.float64(0.11945392491467577),
 'interaction_count': np.float64(0.14334470989761094),
 'lead_score': np.float64(0.0068259385665528916)}

In [86]:
# one row per encoded feature, ordered from the largest coefficient to the smallest
labels = dv.get_feature_names_out()
coefs = model.coef_.T
coef_df = (
    pd.DataFrame(data=coefs, index=labels, columns=['coefficients'])
    .sort_values('coefficients', ascending=False)
)

coef_df

Unnamed: 0,coefficients
number_of_courses_viewed,0.453753
interaction_count,0.311339
lead_source=referral,0.07953
lead_score,0.051201
industry=education,0.04936
employment_status=employed,0.03391
lead_source=NA,0.020151
employment_status=student,0.011524
location=europe,0.008264
location=middle_east,0.005586


### Question 6

* Now let's train a regularized logistic regression.
* Let's try the following values of the parameter `C`: `[0.01, 0.1, 1, 10, 100]`.
* Train models using all the features as in Q4.
* Calculate the accuracy on the validation dataset and round it to 3 decimal digits.

Which of these `C` leads to the best accuracy on the validation set?
- 0.01


In [None]:
# validation accuracy (rounded to 3 decimals) for each candidate value of C
reg = [0.01, 0.1, 1, 10, 100]
reg_accuracy = {
    strength: round(log_regression(df_train, df_val, all_cols, y_train, y_val, C=strength), 3)
    for strength in reg
}

reg_accuracy

{0.01: np.float64(0.7),
 0.1: np.float64(0.7),
 1: np.float64(0.7),
 10: np.float64(0.7),
 100: np.float64(0.7)}