<a href="https://colab.research.google.com/github/gaoqisheng123-bot/machine-learning-zoomcamp-homework/blob/main/machine_learning_zoomcamp_homework_03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [95]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [96]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv


--2025-10-12 14:01:48--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv.2’


2025-10-12 14:01:48 (9.43 MB/s) - ‘course_lead_scoring.csv.2’ saved [80876/80876]



In [97]:
df = pd.read_csv('course_lead_scoring.csv')
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [98]:
print(df.isnull().sum())

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64


In [99]:
categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)
for c in categorical_columns:
    df[c] = df[c].fillna('NA')

numerical_columns = list(df.select_dtypes(include=['number']).columns)
for n in numerical_columns:
    df[n] = df[n].fillna(0.0)

print(df.isnull().sum())

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64


In [100]:
#1
df['industry'].mode()[0]

'retail'

In [101]:
#2
numerical_df = df.select_dtypes(include=['number'])
corr_matrix = numerical_df.corr()
print(corr_matrix)

                          number_of_courses_viewed  annual_income  \
number_of_courses_viewed                  1.000000       0.009770   
annual_income                             0.009770       1.000000   
interaction_count                        -0.023565       0.027036   
lead_score                               -0.004879       0.015610   
converted                                 0.435914       0.053131   

                          interaction_count  lead_score  converted  
number_of_courses_viewed          -0.023565   -0.004879   0.435914  
annual_income                      0.027036    0.015610   0.053131  
interaction_count                  1.000000    0.009888   0.374573  
lead_score                         0.009888    1.000000   0.193673  
converted                          0.374573    0.193673   1.000000  


In [102]:
from sklearn.model_selection import train_test_split

In [103]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

len(df_train), len(df_val), len(df_test)

(876, 293, 293)

In [104]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

del df_train['converted']
del df_val['converted']
del df_test['converted']

In [105]:
from sklearn.metrics import mutual_info_score

In [106]:
#3
from sklearn.metrics import mutual_info_score
categorical_cols = df_train.select_dtypes(include=['object']).columns
def mi_score(series):
    return mutual_info_score(series, y_train)

mi = round(df_train[categorical_cols].apply(mi_score),2)
mi.sort_values(ascending=False)
print(mi)

lead_source          0.04
industry             0.01
employment_status    0.01
location             0.00
dtype: float64


In [107]:
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer(sparse=False)

categorical_cols = df_train.select_dtypes(include=['object']).columns
numerical_cols = df_train.select_dtypes(include=['number']).columns

train_dict = df_train[list(categorical_cols) + list(numerical_cols)].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[list(categorical_cols) + list(numerical_cols)].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [108]:
#4

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)
base_acc = accuracy_score(y_val, y_pred)
print(round(base_acc, 2))


0.7


In [109]:
#5

feature_scores = {}

for col in categorical_cols.tolist() + numerical_cols.tolist():
    reduced_cols = [c for c in (categorical_cols.tolist() + numerical_cols.tolist()) if c != col]

    train_dict = df_train[reduced_cols].to_dict(orient='records')
    val_dict = df_val[reduced_cols].to_dict(orient='records')

    dv_temp = DictVectorizer(sparse=False)
    X_train_temp = dv_temp.fit_transform(train_dict)
    X_val_temp = dv_temp.transform(val_dict)

    model_temp = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_temp.fit(X_train_temp, y_train)

    y_pred_temp = model_temp.predict(X_val_temp)
    acc = accuracy_score(y_val, y_pred_temp)

    diff = base_acc - acc
    feature_scores[col] = diff
    print(f"{col:30s} diff: {diff:.4f}")

least_impact_feature = min(feature_scores, key=feature_scores.get)
print(least_impact_feature)

lead_source                    diff: -0.0034
industry                       diff: 0.0000
employment_status              diff: 0.0034
location                       diff: -0.0102
number_of_courses_viewed       diff: 0.1433
annual_income                  diff: -0.1536
interaction_count              diff: 0.1433
lead_score                     diff: -0.0068
annual_income


In [110]:
#6
dv = DictVectorizer(sparse=False)

categorical_cols = df_train.select_dtypes(include=['object']).columns
numerical_cols = df_train.select_dtypes(include=['number']).columns

train_dict = df_train[list(categorical_cols) + list(numerical_cols)].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[list(categorical_cols) + list(numerical_cols)].to_dict(orient='records')
X_val = dv.transform(val_dict)

C_values = [0.01, 0.1, 1, 10, 100]

for C in C_values:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)
    acc = round(accuracy_score(y_val, y_pred),3)

    print(f"C={C}  {acc:.3f}")

C=0.01  0.700
C=0.1  0.700
C=1  0.700
C=10  0.700
C=100  0.700
