In [14]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [15]:
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv')

In [16]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [17]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [18]:
# Select categorical columns (object or category)
categorical = df.select_dtypes(include=['object', 'category'])

# Select numerical columns (int or float)
numerical = df.select_dtypes(include=['number'])

# Fill missing values
categorical = categorical.fillna('NA')
numerical = numerical.fillna(0.0)

# (Optional) combine back
df[categorical.columns] = categorical
df[numerical.columns] = numerical

In [19]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [20]:
### Question 1 **What is the most frequent observation (mode) for the column `industry`?** 


df.industry.mode()

0    retail
Name: industry, dtype: object

In [21]:
# Create the correlation matrix
corr_matrix = numerical.corr()

corr_pairs = corr_matrix.unstack()

# Remove self-correlations
corr_pairs = corr_pairs[corr_pairs < 1]

# Sort correlations from highest to lowest
corr_pairs = corr_pairs.sort_values(ascending=False)

# Show the top pair
corr_pairs.head()


number_of_courses_viewed  converted                   0.435914
converted                 number_of_courses_viewed    0.435914
                          interaction_count           0.374573
interaction_count         converted                   0.374573
converted                 lead_score                  0.193673
dtype: float64

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
df_train_full, df_test = train_test_split(df, test_size = 0.2, random_state = 42)
df_train, df_val = train_test_split(df_train_full, test_size = 0.25, random_state = 42)

In [24]:
y_train = df_train['converted']
y_val = df_val['converted']

In [25]:
del df_train['converted']
del df_val['converted']

In [26]:
X_train = df_train

In [27]:
df_train_full.converted.value_counts()

converted
1    710
0    459
Name: count, dtype: int64

In [28]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [29]:
categorical = ['lead_source', 'industry',	'employment_status', 'location']
numerical = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

In [31]:
from sklearn.metrics import mutual_info_score

# Step 1: Define a function to compute MI between each categorical feature and the target
def calculate_mi(series):
    return mutual_info_score(series, y_train)  # compares category vs target

# Step 2: Apply this to each categorical column
mi_scores = X_train[categorical].apply(calculate_mi)

# Step 3: Sort and display
mi_scores = mi_scores.sort_values(ascending=False)
mi_scores.round(5)


lead_source          0.03540
employment_status    0.01294
industry             0.01157
location             0.00446
dtype: float64

In [32]:
train_dict = df_train[categorical + numerical].to_dict(orient='records')
val_dict = df_val[categorical + numerical].to_dict(orient='records')

In [33]:
train_dict[0]

{'lead_source': 'paid_ads',
 'industry': 'retail',
 'employment_status': 'student',
 'location': 'middle_east',
 'number_of_courses_viewed': 0,
 'annual_income': 58472.0,
 'interaction_count': 5,
 'lead_score': 0.03}

In [34]:
 from sklearn.feature_extraction import DictVectorizer

In [35]:
 dv = DictVectorizer(sparse=False)
 dv.fit(train_dict)

0,1,2
,dtype,<class 'numpy.float64'>
,separator,'='
,sparse,False
,sort,True


In [36]:
X_train = dv.transform(train_dict)

In [39]:
X_train[0]

array([5.8472e+04, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 5.0000e+00,
       3.0000e-02, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00])

In [40]:
 from sklearn.linear_model import LogisticRegression

In [41]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [42]:
X_val = dv.transform(val_dict) 

In [49]:
y_pred = model.predict(X_val)

In [50]:
from sklearn.metrics import accuracy_score

In [51]:
acc = accuracy_score(y_val, y_pred)

In [52]:
acc = round(acc, 2)

print("Validation Accuracy:", acc)

Validation Accuracy: 0.7


In [53]:
all_features = categorical + numerical

# Define helper function to train & evaluate
def train_and_evaluate(feature_list):
    # Prepare training and validation dictionaries
    train_dict = df_train[feature_list].to_dict(orient='records')
    val_dict = df_val[feature_list].to_dict(orient='records')
    
    # Encode features
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(train_dict)
    X_val = dv.transform(val_dict)
    
    # Train logistic regression model
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    
    # Predict and calculate accuracy
    y_pred = model.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    return acc

# Baseline accuracy with all features
acc_full = train_and_evaluate(all_features)
print(f"Baseline accuracy (all features): {acc_full:.4f}")

# Test accuracy drop when removing each feature
accuracy_drop = {}

for feature in all_features:
    reduced_features = [f for f in all_features if f != feature]
    acc_without = train_and_evaluate(reduced_features)
    accuracy_drop[feature] = acc_full - acc_without

# Sort and show results
diffs = pd.Series(accuracy_drop).sort_values()
print("\nAccuracy difference after removing each feature:")
print(diffs)

Identify least useful feature
least_useful = diffs.idxmin()
print(f"\nLeast useful feature: '{least_useful}'")

Baseline accuracy (all features): 0.6997

Accuracy difference after removing each feature:
annual_income              -0.153584
location                   -0.010239
lead_score                 -0.006826
lead_source                -0.003413
industry                    0.000000
employment_status           0.003413
number_of_courses_viewed    0.143345
interaction_count           0.143345
dtype: float64

Least useful feature: 'annual_income'


In [56]:
results = {}

for c in [0.01, 0.1, 1, 10, 100]:
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    results[c] = acc
    print(f"C={c} -> Validation Accuracy: {acc}")

print("\nSummary of accuracies:")
for c, acc in results.items():
    print(f"C={c}: {acc}")


C=0.01 -> Validation Accuracy: 0.6996587030716723
C=0.1 -> Validation Accuracy: 0.6996587030716723
C=1 -> Validation Accuracy: 0.6996587030716723
C=10 -> Validation Accuracy: 0.6996587030716723
C=100 -> Validation Accuracy: 0.6996587030716723

Summary of accuracies:
C=0.01: 0.6996587030716723
C=0.1: 0.6996587030716723
C=1: 0.6996587030716723
C=10: 0.6996587030716723
C=100: 0.6996587030716723
