In [94]:
import numpy as np
import pandas as pd

In [95]:
df = pd.read_csv("https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv")

In [96]:
# Data preparation - check for missing values
print("Dataset shape:", df.shape)
print("\nDataset info:")
df.info()
print("\nMissing values per column:")
print(df.isnull().sum())

Dataset shape: (1462, 9)

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   lead_source               1334 non-null   object 
 1   industry                  1328 non-null   object 
 2   number_of_courses_viewed  1462 non-null   int64  
 3   annual_income             1281 non-null   float64
 4   employment_status         1362 non-null   object 
 5   location                  1399 non-null   object 
 6   interaction_count         1462 non-null   int64  
 7   lead_score                1462 non-null   float64
 8   converted                 1462 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 102.9+ KB

Missing values per column:
lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           1

In [97]:
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

print("Categorical columns:", categorical_cols)
print("Numerical columns:", numerical_cols)


for col in categorical_cols:
    df[col] = df[col].fillna('NA')

for col in numerical_cols:
    df[col] = df[col].fillna(0.0)

Categorical columns: ['lead_source', 'industry', 'employment_status', 'location']
Numerical columns: ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score', 'converted']


## Question 1
What is the most frequent observation (mode) for the column industry?

- NA
- technology
- healthcare
- retail

In [98]:
industry_mode = df['industry'].mode()[0]
print(f"Most frequent observation (mode) for 'industry' column: {industry_mode}")
print(f"\nValue counts for 'industry':")
print(df['industry'].value_counts())

Most frequent observation (mode) for 'industry' column: retail

Value counts for 'industry':
industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64


## Question 2
Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.

What are the two features that have the biggest correlation?

interaction_count and lead_score
number_of_courses_viewed and lead_score
number_of_courses_viewed and interaction_count
annual_income and interaction_count
Only consider the pairs above when answering this question.

Split the data
Split your data in train/val/test sets with 60%/20%/20% distribution.
Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
Make sure that the target value converted is not in your dataframe.

In [99]:
numerical_features = [col for col in numerical_cols if col != 'converted']
print("Numerical features:", numerical_features)

correlation_matrix = df[numerical_features].corr()
print("\nCorrelation Matrix:")
print(correlation_matrix)

pairs = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'), 
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count')
]

print("\nCorrelations for specified pairs:")
for pair in pairs:
    if pair[0] in correlation_matrix.columns and pair[1] in correlation_matrix.columns:
        corr_value = correlation_matrix.loc[pair[0], pair[1]]
        print(f"{pair[0]} and {pair[1]}: {corr_value:.4f}")

Numerical features: ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

Correlation Matrix:
                          number_of_courses_viewed  annual_income  \
number_of_courses_viewed                  1.000000       0.009770   
annual_income                             0.009770       1.000000   
interaction_count                        -0.023565       0.027036   
lead_score                               -0.004879       0.015610   

                          interaction_count  lead_score  
number_of_courses_viewed          -0.023565   -0.004879  
annual_income                      0.027036    0.015610  
interaction_count                  1.000000    0.009888  
lead_score                         0.009888    1.000000  

Correlations for specified pairs:
interaction_count and lead_score: 0.0099
number_of_courses_viewed and lead_score: -0.0049
number_of_courses_viewed and interaction_count: -0.0236
annual_income and interaction_count: 0.0270


In [100]:
# Split the data
from sklearn.model_selection import train_test_split

# Separate features and target
X = df.drop('converted', axis=1)
y = df['converted']

print("Features shape:", X.shape)
print("Target shape:", y.shape)
print("Target distribution:", y.value_counts())

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42, stratify=y
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

print(f"\nTrain set: {X_train.shape[0]} samples ({X_train.shape[0]/len(df)*100:.1f}%)")
print(f"Validation set: {X_val.shape[0]} samples ({X_val.shape[0]/len(df)*100:.1f}%)")
print(f"Test set: {X_test.shape[0]} samples ({X_test.shape[0]/len(df)*100:.1f}%)")

Features shape: (1462, 8)
Target shape: (1462,)
Target distribution: converted
1    905
0    557
Name: count, dtype: int64

Train set: 877 samples (60.0%)
Validation set: 292 samples (20.0%)
Test set: 293 samples (20.0%)


## Question 3
Calculate the mutual information score between converted and other categorical variables in the dataset. Use the training set only.
Round the scores to 2 decimals using round(score, 2).
Which of these variables has the biggest mutual information score?

industry
location
lead_source
employment_status

In [101]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder


categorical_features = [col for col in categorical_cols if col != 'converted']

X_train_encoded = pd.DataFrame()
for col in categorical_features:
    le = LabelEncoder()
    X_train_encoded[col] = le.fit_transform(X_train[col].astype(str))

mi_scores = mutual_info_classif(X_train_encoded, y_train, random_state=42)

question_features = ['industry', 'location', 'lead_source', 'employment_status']
print("Mutual Information scores:")
for i, feature in enumerate(categorical_features):
    if feature in question_features:
        score = round(mi_scores[i], 2)
        print(f"{feature}: {score}")

Mutual Information scores:
lead_source: 0.03
industry: 0.0
employment_status: 0.0
location: 0.0


## Question 4
Now let's train a logistic regression.
Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
Fit the model on the training dataset.
To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
Calculate the accuracy on the validation dataset and round it to 2 decimal digits.
What accuracy did you get?

0.64
0.74
0.84
0.94

In [102]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

categorical_features = [col for col in categorical_cols if col != 'converted']
numerical_features = [col for col in numerical_cols if col != 'converted']

print("Categorical features:", categorical_features)
print("Numerical features:", numerical_features)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features)
    ])

model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42))
])

model.fit(X_train, y_train)


y_val_pred = model.predict(X_val)

accuracy = accuracy_score(y_val, y_val_pred)
accuracy_rounded = round(accuracy, 2)

print(f"Validation accuracy: {accuracy_rounded}")

Categorical features: ['lead_source', 'industry', 'employment_status', 'location']
Numerical features: ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']
Validation accuracy: 0.68


## Question 5
Let's find the least useful feature using the feature elimination technique.
Train a model using the same features and parameters as in Q4 (without rounding).
Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
Which of following feature has the smallest difference?

- 'industry'
- 'employment_status'
- 'lead_score'

In [103]:
original_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Original accuracy: {original_accuracy}")

test_features = ['industry', 'employment_status', 'lead_score']
feature_differences = {}

for feature in test_features:
    # Remove feature from appropriate list
    if feature in categorical_features:
        cat_reduced = [col for col in categorical_features if col != feature]
        preprocessor_reduced = ColumnTransformer([
            ('num', 'passthrough', numerical_features),
            ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), cat_reduced)
        ])
    else:
        num_reduced = [col for col in numerical_features if col != feature]
        preprocessor_reduced = ColumnTransformer([
            ('num', 'passthrough', num_reduced),
            ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features)
        ])
    
    # Train model without this feature
    model_reduced = Pipeline([
        ('preprocessor', preprocessor_reduced),
        ('classifier', LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42))
    ])
    
    model_reduced.fit(X_train, y_train)
    accuracy_reduced = accuracy_score(y_val, model_reduced.predict(X_val))
    difference = original_accuracy - accuracy_reduced
    feature_differences[feature] = difference
    
min_feature = min(feature_differences, key=feature_differences.get)
print(f"\nSmallest difference: {min_feature}")

Original accuracy: 0.684931506849315

Smallest difference: industry


## Question 6
Now let's train a regularized logistic regression.
Let's try the following values of the parameter C: [0.01, 0.1, 1, 10, 100].
Train models using all the features as in Q4.
Calculate the accuracy on the validation dataset and round it to 3 decimal digits.
Which of these C leads to the best accuracy on the validation set?

- 0.01
- 0.1
- 1
- 10
- 100


In [104]:
# Question 6: Regularized logistic regression with different C values
c_values = [0.01, 0.1, 1, 10, 100]
results = {}

for c in c_values:
    model_c= Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42))
    ])
    

    model_c.fit(X_train, y_train)
    y_val_pred_c = model_c.predict(X_val)
    accuracy_c = accuracy_score(y_val, y_val_pred_c)
    results[c] = round(accuracy_c, 3)
    
    print(f"C = {c}: Accuracy = {results[c]}")

best_c = max(results, key=results.get)
print(f"\nBest C value: {best_c} with accuracy: {results[best_c]}")

C = 0.01: Accuracy = 0.685
C = 0.1: Accuracy = 0.685
C = 1: Accuracy = 0.685
C = 10: Accuracy = 0.685
C = 100: Accuracy = 0.685

Best C value: 0.01 with accuracy: 0.685
