### Imports and Setup

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

print("All libraries imported successfully!")

All libraries imported successfully!


### Load and Explore Dataset

In [2]:
# Load the lead scoring dataset
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv')

print(f"Dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\nFirst 5 rows:")
df.head()

Dataset shape: (1462, 9)
Columns: ['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income', 'employment_status', 'location', 'interaction_count', 'lead_score', 'converted']

First 5 rows:


Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


### Dataset Information

In [3]:
print("Dataset info:")
df.info()

print("\nTarget variable distribution:")
print(df['converted'].value_counts())
print(f"Conversion rate: {df['converted'].mean():.3f}")

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   lead_source               1334 non-null   object 
 1   industry                  1328 non-null   object 
 2   number_of_courses_viewed  1462 non-null   int64  
 3   annual_income             1281 non-null   float64
 4   employment_status         1362 non-null   object 
 5   location                  1399 non-null   object 
 6   interaction_count         1462 non-null   int64  
 7   lead_score                1462 non-null   float64
 8   converted                 1462 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 102.9+ KB

Target variable distribution:
converted
1    905
0    557
Name: count, dtype: int64
Conversion rate: 0.619


### Check for Missing Values

In [4]:
# Check for missing values
missing_counts = df.isnull().sum()
print("Missing values per column:")
for col, count in missing_counts.items():
    if count > 0:
        print(f"{col}: {count} ({count/len(df)*100:.1f}%)")

if missing_counts.sum() == 0:
    print("\nNo missing values found in this dataset.")
else:
    print(f"\nTotal missing values: {missing_counts.sum()}")

Missing values per column:
lead_source: 128 (8.8%)
industry: 134 (9.2%)
annual_income: 181 (12.4%)
employment_status: 100 (6.8%)
location: 63 (4.3%)

Total missing values: 606


###  Identify Feature Types

In [5]:
# Identify categorical and numerical columns
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Remove target from feature lists
if 'converted' in categorical_cols:
    categorical_cols.remove('converted')
if 'converted' in numerical_cols:
    numerical_cols.remove('converted')

print(f"Categorical features ({len(categorical_cols)}): {categorical_cols}")
print(f"Numerical features ({len(numerical_cols)}): {numerical_cols}")

Categorical features (4): ['lead_source', 'industry', 'employment_status', 'location']
Numerical features (4): ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']


### Data Cleaning (Missing Value Treatment)

In [6]:
# Apply missing value treatment
df_clean = df.copy()

# For categorical: fill with 'NA'
for col in categorical_cols:
    df_clean[col] = df_clean[col].fillna('NA')
    
# For numerical: fill with 0.0  
for col in numerical_cols:
    df_clean[col] = df_clean[col].fillna(0.0)

print("Missing values after treatment:", df_clean.isnull().sum().sum())
print("Data preparation complete!")

Missing values after treatment: 0
Data preparation complete!


### Most Frequent Industry

In [7]:
# Find the mode (most frequent value) for the industry column
print("Industry column value counts:")
industry_counts = df_clean['industry'].value_counts()
print(industry_counts)

mode_industry = df_clean['industry'].mode()[0]
print(f"\nMost frequent observation (mode): {mode_industry}")

print(f"\n✓ Q1 Answer: {mode_industry}")

Industry column value counts:
industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

Most frequent observation (mode): retail

✓ Q1 Answer: retail


### Correlation Matrix Analysis

In [8]:
# Create correlation matrix for numerical features only
numerical_df = df_clean[numerical_cols]
correlation_matrix = numerical_df.corr()

print("Correlation matrix for numerical features:")
print(correlation_matrix.round(3))

Correlation matrix for numerical features:
                          number_of_courses_viewed  annual_income  \
number_of_courses_viewed                     1.000          0.010   
annual_income                                0.010          1.000   
interaction_count                           -0.024          0.027   
lead_score                                  -0.005          0.016   

                          interaction_count  lead_score  
number_of_courses_viewed             -0.024      -0.005  
annual_income                         0.027       0.016  
interaction_count                     1.000       0.010  
lead_score                            0.010       1.000  


### Check Specific Correlation Pairs

In [9]:
# Check the specific pairs mentioned in the question
pairs_to_check = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count')
]

print("Checking correlation for specified pairs:")
correlations = {}

for col1, col2 in pairs_to_check:
    if col1 in correlation_matrix.columns and col2 in correlation_matrix.columns:
        corr_value = correlation_matrix.loc[col1, col2]
        abs_corr = abs(corr_value)
        correlations[(col1, col2)] = abs_corr
        print(f"{col1} vs {col2}: {corr_value:.4f} (absolute: {abs_corr:.4f})")

if correlations:
    # Find pair with highest absolute correlation
    max_corr_pair = max(correlations.items(), key=lambda x: x[1])
    pair_names = max_corr_pair[0]
    corr_strength = max_corr_pair[1]
    
    print(f"\nHighest correlation pair: {pair_names[0]} and {pair_names[1]}")
    print(f"Absolute correlation: {corr_strength:.4f}")
    print(f"\n✓ Q2 Answer: {pair_names[0]} and {pair_names[1]}")

Checking correlation for specified pairs:
interaction_count vs lead_score: 0.0099 (absolute: 0.0099)
number_of_courses_viewed vs lead_score: -0.0049 (absolute: 0.0049)
number_of_courses_viewed vs interaction_count: -0.0236 (absolute: 0.0236)
annual_income vs interaction_count: 0.0270 (absolute: 0.0270)

Highest correlation pair: annual_income and interaction_count
Absolute correlation: 0.0270

✓ Q2 Answer: annual_income and interaction_count


### Data Splitting

In [10]:
# Prepare features and target
X = df_clean.drop('converted', axis=1)
y = df_clean['converted']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Split: first 80%/20%, then split the 80% into 60%/20% of total
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Now split the remaining 80% into 60% train and 20% val
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp  # 0.25 * 0.8 = 0.2 of total
)

print(f"Training set: {len(X_train)} samples ({len(X_train)/len(X):.1%})")
print(f"Validation set: {len(X_val)} samples ({len(X_val)/len(X):.1%})")
print(f"Test set: {len(X_test)} samples ({len(X_test)/len(X):.1%})")

print(f"\nClass distribution maintained:")
print(f"Train: {y_train.mean():.3f}, Val: {y_val.mean():.3f}, Test: {y_test.mean():.3f}")

Features shape: (1462, 8)
Target shape: (1462,)
Training set: 876 samples (59.9%)
Validation set: 293 samples (20.0%)
Test set: 293 samples (20.0%)

Class distribution maintained:
Train: 0.620, Val: 0.618, Test: 0.618


### Mutual Information Analysis

In [11]:
# Calculate mutual information for categorical variables using training data only
categorical_features = [col for col in categorical_cols if col in X_train.columns]
print(f"Categorical features to analyze: {categorical_features}")

mutual_info_scores = {}

for col in categorical_features:
    # Encode categorical feature for mutual_info_classif
    le = LabelEncoder()
    X_encoded = le.fit_transform(X_train[col].astype(str))
    
    # Calculate mutual information
    mi_score = mutual_info_classif(
        X_encoded.reshape(-1, 1), 
        y_train, 
        random_state=42
    )[0]
    
    mutual_info_scores[col] = round(mi_score, 2)
    print(f"{col}: {mi_score:.6f} (rounded: {round(mi_score, 2)})")

# Find variable with highest MI score
max_mi_var = max(mutual_info_scores.items(), key=lambda x: x[1])
print(f"\nHighest mutual information: {max_mi_var[0]} (score: {max_mi_var[1]})")

# Check against options: industry, location, lead_source, employment_status
options = ['industry', 'location', 'lead_source', 'employment_status']
print(f"\nMI scores for question options:")
for opt in options:
    if opt in mutual_info_scores:
        print(f"{opt}: {mutual_info_scores[opt]}")

print(f"\n✓ Q3 Answer: {max_mi_var[0]}")

Categorical features to analyze: ['lead_source', 'industry', 'employment_status', 'location']
lead_source: 0.002672 (rounded: 0.0)
industry: 0.000707 (rounded: 0.0)
employment_status: 0.010330 (rounded: 0.01)
location: 0.008639 (rounded: 0.01)

Highest mutual information: employment_status (score: 0.01)

MI scores for question options:
industry: 0.0
location: 0.01
lead_source: 0.0
employment_status: 0.01

✓ Q3 Answer: employment_status


### Prepare Data for Logistic Regression

In [12]:
# Separate numerical and categorical features for encoding
X_train_num = X_train[numerical_cols]
X_train_cat = X_train[categorical_features]

X_val_num = X_val[numerical_cols]
X_val_cat = X_val[categorical_features]

print(f"Numerical features: {X_train_num.shape[1]}")
print(f"Categorical features: {X_train_cat.shape[1]}")

# Apply one-hot encoding to categorical features
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit on training data and transform both sets
X_train_cat_encoded = ohe.fit_transform(X_train_cat)
X_val_cat_encoded = ohe.transform(X_val_cat)

print(f"Encoded categorical features: {X_train_cat_encoded.shape[1]} features")

# Combine numerical and encoded categorical features
X_train_final = np.hstack([X_train_num.values, X_train_cat_encoded])
X_val_final = np.hstack([X_val_num.values, X_val_cat_encoded])

print(f"Final training features shape: {X_train_final.shape}")
print(f"Final validation features shape: {X_val_final.shape}")

Numerical features: 4
Categorical features: 4
Encoded categorical features: 27 features
Final training features shape: (876, 31)
Final validation features shape: (293, 31)


### Train Logistic Regression

In [13]:
# Train logistic regression with specified parameters
model = LogisticRegression(
    solver='liblinear', 
    C=1.0, 
    max_iter=1000, 
    random_state=42
)

model.fit(X_train_final, y_train)
print("Model trained successfully!")

# Calculate accuracy on validation set
y_val_pred = model.predict(X_val_final)
accuracy = accuracy_score(y_val, y_val_pred)

print(f"\nValidation accuracy: {accuracy:.6f}")
print(f"Rounded to 2 decimals: {round(accuracy, 2)}")

print(f"\n✓ Q4 Answer: {round(accuracy, 2)}")

# Store for later use
original_accuracy = accuracy

Model trained successfully!

Validation accuracy: 0.730375
Rounded to 2 decimals: 0.73

✓ Q4 Answer: 0.73


### Feature Elimination Analysis

In [14]:
print(f"Original accuracy (for reference): {original_accuracy:.6f}")

# Features to test for elimination
features_to_test = ['industry', 'employment_status', 'lead_score']
feature_differences = {}

print("\nTesting feature elimination:")

for feature_name in features_to_test:
    print(f"\n--- Testing removal of '{feature_name}' ---")
    
    if feature_name in numerical_cols:
        # Remove numerical feature
        remaining_num_cols = [col for col in numerical_cols if col != feature_name]
        X_train_reduced_num = X_train[remaining_num_cols]
        X_val_reduced_num = X_val[remaining_num_cols]
        
        # Keep all categorical features (re-encode)
        X_train_reduced = np.hstack([X_train_reduced_num.values, X_train_cat_encoded])
        X_val_reduced = np.hstack([X_val_reduced_num.values, X_val_cat_encoded])
        
        print(f"Removed numerical feature '{feature_name}'")
        
    elif feature_name in categorical_features:
        # Remove categorical feature
        remaining_cat_cols = [col for col in categorical_features if col != feature_name]
        X_train_cat_reduced = X_train[remaining_cat_cols]
        X_val_cat_reduced = X_val[remaining_cat_cols]
        
        # Re-encode without the removed feature
        ohe_reduced = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        X_train_cat_reduced_encoded = ohe_reduced.fit_transform(X_train_cat_reduced)
        X_val_cat_reduced_encoded = ohe_reduced.transform(X_val_cat_reduced)
        
        # Keep all numerical features
        X_train_reduced = np.hstack([X_train_num.values, X_train_cat_reduced_encoded])
        X_val_reduced = np.hstack([X_val_num.values, X_val_cat_reduced_encoded])
        
        print(f"Removed categorical feature '{feature_name}'")
        
    else:
        print(f"Feature '{feature_name}' not found in either numerical or categorical features")
        continue
    
    print(f"Reduced features shape: {X_train_reduced.shape}")
    
    # Train model without this feature
    model_reduced = LogisticRegression(
        solver='liblinear', 
        C=1.0, 
        max_iter=1000, 
        random_state=42
    )
    
    model_reduced.fit(X_train_reduced, y_train)
    y_val_pred_reduced = model_reduced.predict(X_val_reduced)
    accuracy_reduced = accuracy_score(y_val, y_val_pred_reduced)
    
    # Calculate difference (original - reduced)
    difference = original_accuracy - accuracy_reduced
    feature_differences[feature_name] = difference
    
    print(f"Accuracy without '{feature_name}': {accuracy_reduced:.6f}")
    print(f"Difference: {difference:.6f}")

Original accuracy (for reference): 0.730375

Testing feature elimination:

--- Testing removal of 'industry' ---
Removed categorical feature 'industry'
Reduced features shape: (876, 23)
Accuracy without 'industry': 0.730375
Difference: 0.000000

--- Testing removal of 'employment_status' ---
Removed categorical feature 'employment_status'
Reduced features shape: (876, 26)
Accuracy without 'employment_status': 0.733788
Difference: -0.003413

--- Testing removal of 'lead_score' ---
Removed numerical feature 'lead_score'
Reduced features shape: (876, 30)
Accuracy without 'lead_score': 0.730375
Difference: 0.000000


### Find Feature with Smallest Difference

In [15]:
# Summary and find feature with smallest difference
print("\n=== Feature Elimination Summary ===")
for feature, diff in feature_differences.items():
    print(f"{feature}: difference = {diff:.6f}")

# Find feature with smallest absolute difference
min_diff_feature = min(feature_differences.items(), key=lambda x: abs(x[1]))

print(f"\nFeature with smallest difference: {min_diff_feature[0]}")
print(f"Difference: {min_diff_feature[1]:.6f}")

print(f"\n✓ Q5 Answer: {min_diff_feature[0]}")


=== Feature Elimination Summary ===
industry: difference = 0.000000
employment_status: difference = -0.003413
lead_score: difference = 0.000000

Feature with smallest difference: industry
Difference: 0.000000

✓ Q5 Answer: industry


### Regularized Logistic Regression

In [16]:
# Test different regularization parameters
C_values = [0.01, 0.1, 1, 10, 100]
C_accuracies = {}

print("Testing different C values for regularization:")
print("(Lower C = more regularization)")

for C in C_values:
    # Train model with different C value
    model_reg = LogisticRegression(
        solver='liblinear', 
        C=C, 
        max_iter=1000, 
        random_state=42
    )
    
    model_reg.fit(X_train_final, y_train)
    y_val_pred_reg = model_reg.predict(X_val_final)
    accuracy_reg = accuracy_score(y_val, y_val_pred_reg)
    
    C_accuracies[C] = accuracy_reg
    print(f"C={C}: accuracy = {accuracy_reg:.6f} (rounded to 3 decimals: {round(accuracy_reg, 3)})")

Testing different C values for regularization:
(Lower C = more regularization)
C=0.01: accuracy = 0.733788 (rounded to 3 decimals: 0.734)
C=0.1: accuracy = 0.730375 (rounded to 3 decimals: 0.73)
C=1: accuracy = 0.730375 (rounded to 3 decimals: 0.73)
C=10: accuracy = 0.730375 (rounded to 3 decimals: 0.73)
C=100: accuracy = 0.730375 (rounded to 3 decimals: 0.73)


### Find Best C Value

In [17]:
# Find best C value
print("\n=== Regularization Results ===")
for C, acc in C_accuracies.items():
    print(f"C={C}: {round(acc, 3)}")

# Find C with highest accuracy (if tie, select smallest C)
max_accuracy = max(C_accuracies.values())
best_C_candidates = [C for C, acc in C_accuracies.items() if acc == max_accuracy]
best_C = min(best_C_candidates)  # Select smallest C in case of tie

print(f"\nBest accuracy: {round(max_accuracy, 3)}")
print(f"Best C value: {best_C}")

if len(best_C_candidates) > 1:
    print(f"Note: Multiple C values had the same accuracy, selected smallest: {best_C}")

print(f"\n✓ Q6 Answer: {best_C}")


=== Regularization Results ===
C=0.01: 0.734
C=0.1: 0.73
C=1: 0.73
C=10: 0.73
C=100: 0.73

Best accuracy: 0.734
Best C value: 0.01

✓ Q6 Answer: 0.01
