In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import mutual_info_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

In [1]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv

--2025-10-16 15:39:28--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv’


2025-10-16 15:39:28 (13.6 MB/s) - ‘course_lead_scoring.csv’ saved [80876/80876]



In [2]:
# Load the lead-scoring dataset downloaded by the wget cell into a DataFrame.
df = pd.read_csv("course_lead_scoring.csv")

## Data preparation

In [3]:
# Column dtypes and non-null counts; used to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   lead_source               1334 non-null   object 
 1   industry                  1328 non-null   object 
 2   number_of_courses_viewed  1462 non-null   int64  
 3   annual_income             1281 non-null   float64
 4   employment_status         1362 non-null   object 
 5   location                  1399 non-null   object 
 6   interaction_count         1462 non-null   int64  
 7   lead_score                1462 non-null   float64
 8   converted                 1462 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 102.9+ KB


In [4]:
# Partition the columns by dtype. Note that num_cols still contains the
# target column 'converted' at this point.
num_cols = df.select_dtypes(include='number').columns.tolist()
cat_cols = df.select_dtypes(exclude='number').columns.tolist()

# Sanity check: every column landed in exactly one of the two lists.
len(df.columns) == len(num_cols) + len(cat_cols)

True

In [5]:
# Missing-value count per column.
df.isna().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [6]:
# Impute missing values: 0 for numeric columns, the placeholder 'NA' for
# categorical ones.
for cols, fill_value in ((num_cols, 0), (cat_cols, 'NA')):
    df[cols] = df[cols].fillna(fill_value)

## Question 1

In [7]:
# Most frequent industry. Missing values were replaced with 'NA' earlier, so
# the placeholder competes with real categories here.
industry_mode = df['industry'].mode().iloc[0]
print(f"Answer to Q1 is: {industry_mode}")

Answer to Q1 is: retail


## Question 2

In [8]:
# Pairwise correlation of the numeric columns.
corr = df[num_cols].corr()

# Blank out the diagonal and lower triangle so each unordered pair of
# features appears exactly once.
mask = np.tril(np.ones_like(corr, dtype=bool))
corr_upper = corr.mask(mask)

# Long format: one row per (feature_1, feature_2, correlation).
corr_df = corr_upper.stack().reset_index()
corr_df.columns = ["feature_1", "feature_2", "correlation"]

# Candidate pairs to compare; order within a pair does not matter.
pairs = [
    ("interaction_count", "lead_score"),
    ("number_of_courses_viewed", "lead_score"),
    ("number_of_courses_viewed", "interaction_count"),
    ("annual_income", "interaction_count"),
]
wanted_pairs = {frozenset(pair) for pair in pairs}

is_candidate = [
    frozenset(feature_pair) in wanted_pairs
    for feature_pair in zip(corr_df["feature_1"], corr_df["feature_2"])
]
corr_filtered = corr_df[is_candidate].reset_index(drop=True)

# Rank by magnitude: a strongly negative correlation counts as "big" too.
corr_filtered = corr_filtered.assign(correlation=corr_filtered["correlation"].abs())
strongest = corr_filtered["correlation"].max()
corr_max = corr_filtered.loc[corr_filtered["correlation"].eq(strongest)]

top_pair = corr_max.iloc[0]
print(f"Answer to Q2 is: {top_pair['feature_1']} and {top_pair['feature_2']}")

Answer to Q2 is: annual_income and interaction_count


## Split the data

In [9]:
# 60/20/20 train/validation/test split: hold out 20% for test, then take
# 25% of the remaining 80% (= 20% of the total) for validation.
df_full_train, X_test = train_test_split(df, test_size=0.2, random_state=42)
X_train, X_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# Drop the shuffled original index from each partition.
X_train, X_val, X_test = (
    part.reset_index(drop=True) for part in (X_train, X_val, X_test)
)

# Detach the target from each feature frame and keep it as an array.
y_train, y_val, y_test = (
    part.pop('converted').values for part in (X_train, X_val, X_test)
)

## Question 3

In [10]:
# Mutual information between each categorical feature and the training
# target, highest first.
mi = pd.Series(
    {col: mutual_info_score(X_train[col], y_train) for col in cat_cols}
).sort_values(ascending=False)
print(f"Answer to Q3 is: {mi.idxmax()}")

Answer to Q3 is: lead_source


## Question 4

In [11]:
# Exclude the target from the numeric feature names. The list is rebuilt
# instead of calling .remove() so the cell is idempotent: re-running it is a
# no-op rather than a ValueError once 'converted' is already gone.
num_cols = [col for col in num_cols if col != 'converted']

In [43]:
# Model inputs: categorical columns first, then the numeric ones.
features_list = [*cat_cols, *num_cols]

In [77]:
def train_and_eval(X_train, X_val, y_train, y_val, features_list, C=1.0, threshold=0.5):
    """Fit a logistic regression on one-hot-encoded features and score it.

    Parameters
    ----------
    X_train, X_val : pandas.DataFrame
        Training and validation frames. Only the columns named in
        ``features_list`` are used, so both must contain all of them.
    y_train, y_val : array-like
        Targets for the two frames (assumed to be binary 0/1 labels aligned
        row-wise with the frames).
    features_list : list of str
        Column names to feed to the model.
    C : float, default 1.0
        Forwarded to ``LogisticRegression(C=...)``.
    threshold : float, default 0.5
        Predicted probability of the positive class at or above which a row
        is classified as positive.

    Returns
    -------
    float
        Fraction of validation rows whose thresholded prediction equals
        ``y_val``.
    """
    dv = DictVectorizer(sparse=False)

    # The encoder is fitted on the training rows only; validation rows are
    # transformed with the vocabulary learned from training.
    train_records = X_train[features_list].to_dict(orient='records')
    train_matrix = dv.fit_transform(train_records)

    val_records = X_val[features_list].to_dict(orient='records')
    val_matrix = dv.transform(val_records)

    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(train_matrix, y_train)

    # Column 1 of predict_proba is the probability of the positive class.
    positive_proba = model.predict_proba(val_matrix)[:, 1]
    predicted_positive = positive_proba >= threshold
    accuracy = (y_val == predicted_positive).mean()
    return accuracy


In [79]:
# Baseline: validation accuracy of the model trained on every feature.
base_accuracy = train_and_eval(
    X_train=X_train,
    X_val=X_val,
    y_train=y_train,
    y_val=y_val,
    features_list=features_list,
)
print(f"Answer to Q4 is: {round(base_accuracy, 2)}")

Answer to Q4 is: 0.7


## Question 5

In [80]:
# Leave-one-feature-out: retrain without each feature in turn and record
# the resulting validation accuracy.
results = {}

for feature in features_list:
    features_subset = [name for name in features_list if name != feature]

    X_train_subset = X_train.drop(columns=[feature])
    X_val_subset = X_val.drop(columns=[feature])

    acc = train_and_eval(X_train_subset, X_val_subset, y_train, y_val, features_subset)
    results[feature] = float(acc)

subset_keys = ['industry', 'employment_status', 'lead_score']

# Absolute change in accuracy relative to the all-features baseline.
abs_diffs = {
    key: abs(results[key] - base_accuracy)
    for key in subset_keys
    if key in results
}

# NOTE(review): this picks the feature with the LARGEST absolute difference.
# If the question asks for the least useful feature (smallest difference),
# this should be min() - confirm against the question wording.
best_feature = max(abs_diffs, key=abs_diffs.get)
best_value = abs_diffs[best_feature]

print(f"Answer to Q5 is: {best_feature}")

Answer to Q5 is: lead_score


## Question 6

In [86]:
# Sweep the regularization strength and record validation accuracy
# (rounded to 3 decimals) for each value.
C_values = [0.01, 0.1, 1, 10, 100]
results = {}

for C in C_values:
    acc = train_and_eval(X_train, X_val, y_train, y_val, features_list, C=C)
    results[C] = round(acc, 3)

# Among the values tied for the best rounded accuracy, prefer the smallest C.
top_accuracy = max(results.values())
best_C = min(c for c, score in results.items() if score == top_accuracy)

print(f"Answer to Q6 is: {best_C}")

Answer to Q6 is: 0.01
