In [1]:
import numpy as np
import pandas as pd

In [2]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv

--2025-09-23 17:12:31--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: 'course_lead_scoring.csv'

     0K .......... .......... .......... .......... .......... 63%  367K 0s
    50K .......... .......... ........                        100% 6.58M=0.1s

2025-09-23 17:12:32 (561 KB/s) - 'course_lead_scoring.csv' saved [80876/80876]



In [16]:
df = pd.read_csv('course_lead_scoring.csv')

In [17]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [5]:
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [6]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [18]:
numerical = ['number_of_courses_viewed', 'annual_income','interaction_count','lead_score']
categorical = list(df.columns[df.dtypes == 'object'])
categorical

['lead_source', 'industry', 'employment_status', 'location']

In [19]:
df[categorical].isnull().sum()

lead_source          128
industry             134
employment_status    100
location              63
dtype: int64

In [20]:
df[categorical] = df[categorical].fillna('NA')
df[numerical]= df[numerical].fillna(0.0)

df[numerical].isnull().sum()

number_of_courses_viewed    0
annual_income               0
interaction_count           0
lead_score                  0
dtype: int64

### 1.

In [21]:
df.industry.mode()

0    retail
Name: industry, dtype: object

### 2. Correlation matrix

In [22]:
df[numerical].corr()

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879
annual_income,0.00977,1.0,0.027036,0.01561
interaction_count,-0.023565,0.027036,1.0,0.009888
lead_score,-0.004879,0.01561,0.009888,1.0


In [23]:
from sklearn.model_selection import train_test_split

In [30]:
y = df.converted
df_feat = df[numerical + categorical].copy()
df_train_val, df_test, y_train_val, y_test = train_test_split(df_feat,y,test_size=0.2, random_state=42)
df_train, df_val,y_train, y_val = train_test_split(df_train_val, y_train_val,test_size=0.25, random_state=42)


### 3. 

In [32]:
from sklearn.metrics import mutual_info_score

In [34]:
for col in categorical:
    mut_score = mutual_info_score(df_train[col], y_train)
    print(f"Feature: {col}  Score: {mut_score}")

Feature: lead_source  Score: 0.03539624379726594
Feature: industry  Score: 0.011574521435657112
Feature: employment_status  Score: 0.012937677269442782
Feature: location  Score: 0.004464157884038034


### 4.

In [35]:
from sklearn.feature_extraction import DictVectorizer

In [36]:
train_dict = df_train.to_dict(orient='records')

train_dict[0]

{'number_of_courses_viewed': 0,
 'annual_income': 58472.0,
 'interaction_count': 5,
 'lead_score': 0.03,
 'lead_source': 'paid_ads',
 'industry': 'retail',
 'employment_status': 'student',
 'location': 'middle_east'}

In [37]:
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)

In [38]:
dv.get_feature_names_out()

array(['annual_income', 'employment_status=NA',
       'employment_status=employed', 'employment_status=self_employed',
       'employment_status=student', 'employment_status=unemployed',
       'industry=NA', 'industry=education', 'industry=finance',
       'industry=healthcare', 'industry=manufacturing', 'industry=other',
       'industry=retail', 'industry=technology', 'interaction_count',
       'lead_score', 'lead_source=NA', 'lead_source=events',
       'lead_source=organic_search', 'lead_source=paid_ads',
       'lead_source=referral', 'lead_source=social_media', 'location=NA',
       'location=africa', 'location=asia', 'location=australia',
       'location=europe', 'location=middle_east',
       'location=north_america', 'location=south_america',
       'number_of_courses_viewed'], dtype=object)

In [39]:
from sklearn.linear_model import LogisticRegression

In [49]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000,random_state=42)

In [50]:
model.fit(X_train, y_train)

In [51]:
X_val = dv.transform(df_val.to_dict(orient='records'))

In [52]:
round(model.score(X_val, y_val),2)

0.7

### 5.

In [56]:
list(set(['aa','bb','cc']) - set(['a']))

['cc', 'bb', 'aa']

In [66]:
def exlude_featur_split_train(feature, C=1.0):
    y = df.converted
    final_feat_list = list(set(numerical + categorical) - set([feature]))
    df_feat = df[final_feat_list].copy()
    df_train_val, df_test, y_train_val, y_test = train_test_split(df_feat,y,test_size=0.2, random_state=42)
    df_train, df_val,y_train, y_val = train_test_split(df_train_val, y_train_val,test_size=0.25, random_state=42)

    train_dict = df_train.to_dict(orient='records')
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(train_dict)

    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000,random_state=42)
    model.fit(X_train, y_train)
    X_val = dv.transform(df_val.to_dict(orient='records'))

    return model.score(X_val, y_val)

In [67]:
original_acc = exlude_featur_split_train('na')
original_acc

0.6996587030716723

In [68]:
columns = categorical + numerical

smallest_score = {"feature": ' ', "score": 9999}
for col in columns:
    score = exlude_featur_split_train(col)
    score_difference = score-original_acc
    if abs(score_difference) < smallest_score['score']:
        smallest_score['feature'] = col
        smallest_score['score'] = abs(score_difference)
    print(f"Feat: {col}  Score Difference: {score_difference}")

print("\n")
print(smallest_score)

Feat: lead_source  Score Difference: 0.0034129692832765013
Feat: industry  Score Difference: 0.0
Feat: employment_status  Score Difference: -0.0034129692832763903
Feat: location  Score Difference: 0.010238907849829393
Feat: number_of_courses_viewed  Score Difference: -0.14334470989761094
Feat: annual_income  Score Difference: 0.15358361774744034
Feat: interaction_count  Score Difference: -0.14334470989761094
Feat: lead_score  Score Difference: 0.0068259385665528916


{'feature': 'industry', 'score': 0.0}


### 6.

In [71]:
for c in [0.01, 0.1, 1, 10, 100]:
    score = exlude_featur_split_train(col,C=c)
    print(f"C: {c:<5} Accurracy: {round(score,3)}")   

C: 0.01  Accurracy: 0.696
C: 0.1   Accurracy: 0.7
C: 1     Accurracy: 0.706
C: 10    Accurracy: 0.706
C: 100   Accurracy: 0.706
