In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [4]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv

--2025-10-14 04:25:32--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8000::154, 2606:50c0:8003::154, 2606:50c0:8001::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8000::154|:443... failed: Network is unreachable.
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8003::154|:443... failed: Network is unreachable.
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8001::154|:443... failed: Network is unreachable.
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8002::154|:443... failed: Network is unreachable.
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv.1’


2025-10-14 04:26

In [5]:
url = 'course_lead_scoring.csv'
df = pd.read_csv(url)
print(df.head())

    lead_source    industry  number_of_courses_viewed  annual_income  \
0      paid_ads         NaN                         1        79450.0   
1  social_media      retail                         1        46992.0   
2        events  healthcare                         5        78796.0   
3      paid_ads      retail                         2        83843.0   
4      referral   education                         3        85012.0   

  employment_status       location  interaction_count  lead_score  converted  
0        unemployed  south_america                  4        0.94          1  
1          employed  south_america                  1        0.80          0  
2        unemployed      australia                  3        0.69          1  
3               NaN      australia                  1        0.87          0  
4     self_employed         europe                  3        0.62          1  


In [6]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)

for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')

In [7]:
df.head().T

Unnamed: 0,0,1,2,3,4
lead_source,paid_ads,social_media,events,paid_ads,referral
industry,,retail,healthcare,retail,education
number_of_courses_viewed,1,1,5,2,3
annual_income,79450.0,46992.0,78796.0,83843.0,85012.0
employment_status,unemployed,employed,unemployed,,self_employed
location,south_america,south_america,australia,australia,europe
interaction_count,4,1,3,1,3
lead_score,0.94,0.8,0.69,0.87,0.62
converted,1,0,1,0,1


### Data preparation
##### Check if the missing values are presented in the features.
##### If there are missing values:
###### For categorical features, replace them with 'NA'
###### For numerical features, replace with with 0.0

In [9]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [11]:
df.lead_source = df.lead_source.fillna('NA')
df.industry = df.industry.fillna('NA')
df.annual_income = df.annual_income.fillna(0.0)
df.employment_status = df.employment_status.fillna('NA')
df.location = df.location.fillna('NA')

## Question 1
##### What is the most frequent observation (mode) for the column industry?

In [27]:
df.industry.value_counts(normalize=True)

industry
retail           0.138851
finance          0.136799
other            0.135431
healthcare       0.127907
education        0.127907
technology       0.122435
manufacturing    0.119015
NA               0.091655
Name: proportion, dtype: float64

### Question 2
##### Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.

##### What are the two features that have the biggest correlation?



In [61]:
df[numerical].corrwith(df.converted)

number_of_courses_viewed    0.435914
annual_income               0.053131
interaction_count           0.374573
lead_score                  0.193673
dtype: float64

### Split the data
##### Split your data in train/val/test sets with 60%/20%/20% distribution.
##### Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
##### Make sure that the target value converted is not in your dataframe.


In [63]:
from sklearn.model_selection import train_test_split
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)
len(df_train), len(df_test), len(df_val)


(876, 293, 293)

In [64]:
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)

In [65]:
y_train = df_train.converted.values
y_test = df_test.converted.values
y_val = df_val.converted.values


In [66]:
del df_train['converted']
del df_test['converted']
del df_val['converted']

In [67]:
numerical = ['number_of_courses_viewed', 'annual_income','interaction_count', 'lead_score']


In [68]:
categorical = ['lead_source', 'industry', 'employment_status', 'location']

In [69]:
df_full_train[categorical].nunique()

lead_source          6
industry             8
employment_status    5
location             8
dtype: int64

## Question 3
##### Calculate the mutual information score between converted and other categorical variables in the dataset. Use the training set only.
##### Round the scores to 2 decimals using round(score, 2).
##### Which of these variables has the biggest mutual information score?

In [70]:
from sklearn.metrics import mutual_info_score

In [75]:
def mutual_info_converted_score(series):
    return mutual_info_score(series, df_full_train.converted)

In [79]:
df_full_train[categorical].apply(mutual_info_converted_score).sort_values(ascending=False)

lead_source          0.025665
employment_status    0.013258
industry             0.011685
location             0.002253
dtype: float64

### Question 4
##### Now let's train a logistic regression.
##### Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
##### Fit the model on the training dataset.
##### To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
##### model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
##### Calculate the accuracy on the validation dataset and round it to 2 decimal digits.
##### What accuracy did you get?

In [108]:
from sklearn.feature_extraction import DictVectorizer

In [109]:
dv = DictVectorizer(sparse=False)

In [110]:
train_dicts = df_train[categorical+numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

In [111]:
val_dicts = df_val[categorical+numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [112]:
dv.get_feature_names_out()

array(['annual_income', 'employment_status=NA',
       'employment_status=employed', 'employment_status=self_employed',
       'employment_status=student', 'employment_status=unemployed',
       'industry=NA', 'industry=education', 'industry=finance',
       'industry=healthcare', 'industry=manufacturing', 'industry=other',
       'industry=retail', 'industry=technology', 'interaction_count',
       'lead_score', 'lead_source=NA', 'lead_source=events',
       'lead_source=organic_search', 'lead_source=paid_ads',
       'lead_source=referral', 'lead_source=social_media', 'location=NA',
       'location=africa', 'location=asia', 'location=australia',
       'location=europe', 'location=middle_east',
       'location=north_america', 'location=south_america',
       'number_of_courses_viewed'], dtype=object)

### Question 4
##### Now let's train a logistic regression.
##### Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
##### Fit the model on the training dataset.
##### To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
##### model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
##### Calculate the accuracy on the validation dataset and round it to 2 decimal digits.
##### What accuracy did you get?

In [90]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

In [91]:
z = np.linspace(-5, 5, 51)

In [95]:
from sklearn.linear_model import LogisticRegression

In [98]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [100]:
model.coef_.round(2)

array([[-0.  , -0.01,  0.03,  0.  ,  0.01, -0.1 , -0.02,  0.05, -0.02,
        -0.01, -0.  , -0.01, -0.03, -0.02,  0.31,  0.05,  0.02, -0.01,
        -0.01, -0.12,  0.08, -0.03,  0.  , -0.01, -0.01, -0.01,  0.01,
         0.01, -0.03, -0.03,  0.45]])

In [102]:
model.predict(X_train)

array([1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,

In [113]:
y_pred = model.predict_proba(X_val)[:, 1]

In [127]:
converted_decision = (y_pred >= 0.7)

In [129]:
(y_val == converted_decision).mean().round(2)

np.float64(0.74)

In [132]:
df_pred = pd.DataFrame()
df_pred['Probability'] = y_pred
df_pred['prediction'] = converted_decision.astype(int)
df_pred['actual'] = y_val
df_pred['correct'] = df_pred['prediction'] == df_pred['actual'] 

In [133]:
df_pred

Unnamed: 0,Probability,prediction,actual,correct
0,0.611922,0,0,True
1,0.799826,1,1,True
2,0.530213,0,0,True
3,0.471315,0,0,True
4,0.570661,0,0,True
...,...,...,...,...
288,0.419342,0,0,True
289,0.710539,1,1,True
290,0.418185,0,0,True
291,0.744835,1,1,True


In [135]:
df_pred.correct.mean().round(2)

np.float64(0.74)

### Question 5
##### Let's find the least useful feature using the feature elimination technique.
##### Train a model using the same features and parameters as in Q4 (without rounding).
##### Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
##### For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
##### Which of following feature has the smallest difference?

In [142]:
features_to_test = ['industry', 'employment_status', 'lead_score']


In [146]:
from sklearn.metrics import accuracy_score

In [151]:
C_values = [0.01, 0.1, 1, 10, 100]
results = {}

for C in C_values:
    model_reg = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model_reg.fit(X_train, y_train)
    
    y_val_pred_reg = model_reg.predict(X_val)
    accuracy_reg = accuracy_score(y_val, y_val_pred_reg)
    accuracy_reg_rounded = round(accuracy_reg, 3)
    
    results[C] = accuracy_reg_rounded
    
    print(f"  C = {C}: {accuracy_reg_rounded:.3f}")


  C = 0.01: 0.700
  C = 0.1: 0.700
  C = 1: 0.700
  C = 10: 0.700
  C = 100: 0.700
