In [2]:
# Download the course lead-scoring CSV into the current working directory
# (wget saves it as 'course_lead_scoring.csv', the file read by the next cell).
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv

--2025-10-12 22:48:13--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: 'course_lead_scoring.csv'


2025-10-12 22:48:15 (397 KB/s) - 'course_lead_scoring.csv' saved [80876/80876]



In [3]:
import pandas as pd
import numpy as np


In [4]:
# Read the downloaded CSV into a DataFrame and preview its first rows.
df = pd.read_csv("course_lead_scoring.csv")
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [8]:
# Per-column count of missing values, taken before anything is filled in.
print(df.isna().sum())

# Partition the columns by dtype: object-typed columns are treated as
# categorical, every remaining column as numerical.
cat_cols = df.select_dtypes(include=['object']).columns
num_cols = df.select_dtypes(exclude=['object']).columns

# Impute in place: the literal string 'NA' for categorical gaps and 0.0 for
# numerical gaps. Later cells reuse `cat_cols`.
df[cat_cols] = df[cat_cols].fillna(value='NA')
df[num_cols] = df[num_cols].fillna(value=0.0)

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64


In [11]:
# Frequency of each industry value (after imputation, so missing entries
# appear under the 'NA' label).
df.industry.value_counts()

retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: industry, dtype: int64

In [16]:
# Pairwise Pearson correlations between the numerical columns.
# The frame still contains string (object) columns, and pandas >= 2.0 raises
# on non-numeric data in DataFrame.corr() instead of silently dropping it,
# so the frame is restricted to numeric columns first. This works on every
# pandas version and yields the same matrix older versions produced.
corr = df.select_dtypes(include='number').corr()

# Column pairs whose correlation coefficient we want to report.
pairs = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count'),
]

for a, b in pairs:
    print(f"{a} and {b}: {corr.loc[a, b]:.4f}")


interaction_count and lead_score: 0.0099
number_of_courses_viewed and lead_score: -0.0049
number_of_courses_viewed and interaction_count: -0.0236
annual_income and interaction_count: 0.0270


In [17]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import mutual_info_classif

In [18]:
# Target vector and feature matrix (everything except the target column).
y = df['converted']
X = df.drop(columns=['converted'])

# Stage 1: keep 60% of the rows for training and hold out the other 40%,
# stratified on the target.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
    stratify=y,
)

# Stage 2: cut the hold-out in half, giving validation and test splits of
# about 20% each, again stratified on the target.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp,
)

print("\nDataset sizes: train, val, test =", *map(len, (X_train, X_val, X_test)))


Dataset sizes: train, val, test = 877 292 293


In [19]:

# ---------- Q3: mutual information (on training set) for categorical variables ----------
# Only the categorical features are scored, and only on the training split.
cat_cols_train = [name for name in cat_cols if name in X_train.columns]

# mutual_info_classif needs numeric input, so every column is first mapped to
# integer codes with pd.factorize and then scored as one discrete feature.
mi_scores = {}
for col in cat_cols_train:
    codes, uniques = pd.factorize(X_train[col])
    mi = mutual_info_classif(
        codes[:, None],  # column vector: one feature, one row per sample
        y_train,
        discrete_features=True,
        random_state=42,
    )
    mi_scores[col] = float(mi[0])

# Highest score first; scores are rounded to 2 decimals for display only.
mi_sorted = sorted(mi_scores.items(), key=lambda pair: pair[1], reverse=True)
print("\nQ3: Mutual information scores (train set) for categorical variables (rounded to 2 decimals):")
for col, score in mi_sorted:
    print(f"  {col}: {round(score,2)}")

# The head of the sorted list is the most informative variable (None if
# there were no categorical columns to score).
if mi_sorted:
    top_mi_var = mi_sorted[0][0]
else:
    top_mi_var = None
print("  -> variable with biggest MI:", top_mi_var)

# ---------- Utility: One-hot encode categorical columns for model training ----------
def one_hot_encode_fit_transform(X_train, X_other, categorical_columns):
    """
    One-hot encode the categorical columns of two frames with one shared encoder.

    The encoder is fit on X_train[categorical_columns] only and then applied to
    both frames, so X_other never influences the learned categories. Categories
    that appear only in X_other are encoded as all-zero rows
    (handle_unknown='ignore').

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame used to fit the encoder.
    X_other : pandas.DataFrame
        Second frame (e.g. validation or test) transformed with the fitted encoder.
    categorical_columns : list of str
        Names of the columns to encode; they must exist in both frames.

    Returns
    -------
    (X_train_final, X_other_final, ohe)
        The two encoded frames and the fitted OneHotEncoder. In each frame the
        non-categorical columns come first, followed by the one-hot columns.
        Both frames carry a fresh 0..n-1 index; the original index is dropped.
    """
    # scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the
    # old keyword, so try the current spelling first and fall back for older
    # installations. Either way the encoder returns dense arrays.
    try:
        ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

    ohe.fit(X_train[categorical_columns])
    cat_feature_names = ohe.get_feature_names_out(categorical_columns)

    def _encode(frame):
        # Encoded block and remaining columns both get a 0..n-1 index so the
        # column-wise concat aligns them row by row.
        encoded = pd.DataFrame(
            ohe.transform(frame[categorical_columns]),
            columns=cat_feature_names,
        )
        remaining = frame.drop(columns=categorical_columns).reset_index(drop=True)
        return pd.concat([remaining, encoded], axis=1)

    return _encode(X_train), _encode(X_other), ohe

# Prepare feature matrix using one-hot encoding of categorical features.
# All categorical columns present in the training frame are encoded.
categorical_columns_for_model = [c for c in cat_cols if c in X_train.columns]
X_train_enc, X_val_enc, ohe = one_hot_encode_fit_transform(X_train, X_val, categorical_columns_for_model)

# Snapshot of the encoded training matrix, taken before the float cast below.
X_train_enc_full = X_train_enc.copy()

# For testing later: encode the test split with the encoder already fitted on
# the training data (no refit), using the same column layout as train/val,
# i.e. remaining columns first, then the one-hot columns.
X_test_cat = pd.DataFrame(
    ohe.transform(X_test[categorical_columns_for_model]),
    columns=ohe.get_feature_names_out(categorical_columns_for_model),
)
X_test_enc = pd.concat(
    [X_test.drop(columns=categorical_columns_for_model).reset_index(drop=True), X_test_cat],
    axis=1,
)

# Ensure every column is float. The test matrix is cast as well so that all
# three splits share the same dtypes.
X_train_enc = X_train_enc.astype(float)
X_val_enc = X_val_enc.astype(float)
X_test_enc = X_test_enc.astype(float)

# All three matrices must expose identical columns in identical order,
# otherwise a model fit on one cannot be applied to the others.
assert list(X_val_enc.columns) == list(X_train_enc.columns)
assert list(X_test_enc.columns) == list(X_train_enc.columns)


Q3: Mutual information scores (train set) for categorical variables (rounded to 2 decimals):
  lead_source: 0.03
  industry: 0.01
  employment_status: 0.01
  location: 0.0
  -> variable with biggest MI: lead_source


In [20]:
# ---------- Q4: logistic regression training and val accuracy ----------
# Fit the baseline classifier on the encoded training split, then measure
# plain accuracy on the encoded validation split.
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
).fit(X_train_enc, y_train)

y_val_pred = model.predict(X_val_enc)
acc_val = accuracy_score(y_true=y_val, y_pred=y_val_pred)
print(f"\nQ4: Validation accuracy = {acc_val:.4f} -> rounded to 2 decimals: {round(acc_val,2)}")



Q4: Validation accuracy = 0.6815 -> rounded to 2 decimals: 0.68


In [21]:

# ---------- Q5: feature-elimination experiment ----------
# Candidate features to drop, one at a time. Each may be numeric or categorical.
features_to_test = ['industry', 'employment_status', 'lead_score']

# The baseline is the validation accuracy of the full model trained above.
baseline_acc = acc_val

# Maps feature -> (baseline accuracy - accuracy without that feature).
# A positive value means the model got worse when the feature was removed.
differences = {}
for feature in features_to_test:
    if feature not in X_train.columns:
        # Nothing to remove; report it and move on to the next candidate.
        print(f"Feature {feature} not found in X_train columns; skipping")
        continue

    # drop() returns new frames, so X_train / X_val themselves are untouched.
    X_train_tmp = X_train.drop(columns=[feature])
    X_val_tmp = X_val.drop(columns=[feature])

    # Re-encode after the drop: if the removed feature was categorical it must
    # no longer be listed among the columns to one-hot encode.
    cat_cols_tmp = [c for c in categorical_columns_for_model if c in X_train_tmp.columns]
    X_train_tmp_enc, X_val_tmp_enc, ohe_tmp = one_hot_encode_fit_transform(X_train_tmp, X_val_tmp, cat_cols_tmp)

    # Train a model with the same hyper-parameters as the baseline.
    model_tmp = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_tmp.fit(X_train_tmp_enc, y_train)
    y_val_tmp_pred = model_tmp.predict(X_val_tmp_enc)
    acc_tmp = accuracy_score(y_val, y_val_tmp_pred)
    diff = baseline_acc - acc_tmp
    differences[feature] = diff
    print(f"Q5: Removing {feature} -> val acc: {acc_tmp:.4f}; diff baseline-wo = {diff:.6f}")

# Which feature has the smallest absolute difference? `default=None` keeps this
# from raising ValueError when every candidate was skipped and nothing was
# measured. On ties the first feature in insertion order wins.
smallest_feature = min(differences, key=lambda name: abs(differences[name]), default=None)
print("  -> feature with smallest absolute difference:", smallest_feature)

Q5: Removing industry -> val acc: 0.6884; diff baseline-wo = -0.006849
Q5: Removing employment_status -> val acc: 0.6815; diff baseline-wo = 0.000000
Q5: Removing lead_score -> val acc: 0.6747; diff baseline-wo = 0.006849
  -> feature with smallest absolute difference: employment_status


In [22]:
# ---------- Q6: try different C values ----------
# Sweep the inverse regularisation strength and record the validation
# accuracy obtained for each value.
C_values = [0.01, 0.1, 1, 10, 100]
results_C = {}
for C in C_values:
    model_c = LogisticRegression(
        solver='liblinear',
        C=C,
        max_iter=1000,
        random_state=42,
    ).fit(X_train_enc, y_train)
    y_val_c = model_c.predict(X_val_enc)
    acc_c = accuracy_score(y_val, y_val_c)
    results_C[C] = acc_c
    print(f"Q6: C={C} -> val accuracy = {acc_c:.3f}")

# Best C = highest accuracy; among equal accuracies the smallest C wins.
# The composite key orders by descending accuracy first, then ascending C.
best_C = min(results_C, key=lambda c_value: (-results_C[c_value], c_value))
print("  -> Best C:", best_C)

Q6: C=0.01 -> val accuracy = 0.688
Q6: C=0.1 -> val accuracy = 0.682
Q6: C=1 -> val accuracy = 0.682
Q6: C=10 -> val accuracy = 0.682
Q6: C=100 -> val accuracy = 0.682
  -> Best C: 0.01
