In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

#Load Data in PD DataFrame
df = pd.read_csv('course_lead_scoring.txt', sep=',')

#define target and features
target = 'converted'
y = df[target]
X = df.drop(columns=[target])

#Identify categorical and numerical columns
numerical_cols = X.select_dtypes(include=np.number).columns.tolist()
categorical_cols = X.select_dtypes(exclude=np.number).columns.tolist()

for col in categorical_cols:
    X[col] = X[col].fillna('NA')

# numerical features with 0.0
for col in numerical_cols:
    X[col] = X[col].fillna(0.0)

print("--- Data Preparation Stage now Complete ---")
print(f"Total missing values after imputation: {X.isnull().sum().sum()}")

#Question 1: Mode for industry
print("\n--- Question 1: Mode for 'industry' ---")
# The mode is calculated after filling NAs, as 'NA' itself can be the mode.
mode_industry = X['industry'].mode()[0]
print(f"The most frequent observation (mode) for 'industry' is: {mode_industry}")

#Question 2: Correllation Matrix
print("\n--- Question 2: Correlation Matrix ---")
correlation_matrix = X[numerical_cols].corr()

pairs_to_check = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count')
]

max_corr = -1
max_pair = None

for f1, f2 in pairs_to_check:
    corr = correlation_matrix.loc[f1, f2]
    if abs(corr) > max_corr:
        max_corr = abs(corr)
        max_pair = (f1, f2)

print(f"The two features with the biggest absolute correlation are: {max_pair[0]} and {max_pair[1]} ({max_corr:.4f})")

# --- Data split (60%/20%/20%, Seed 42) ---
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print(f"\n--- Data Split (Seed 42) ---")
print(f"Train size: {len(X_train)} | Val size: {len(X_val)} | Test size: {len(X_test)}")

# --- 5. Question 3: Mutual Information Score ---
print("\n--- Question 3: Mutual Information Score ---")

def calculate_mutual_info(X_cat, y):
    # Mutual Info requires converting string categories to integers
    X_encoded = pd.DataFrame()
    for col in X_cat.columns:
        le = LabelEncoder()
        X_encoded[col] = le.fit_transform(X_cat[col])
    
    mi_scores = mutual_info_classif(X_encoded, y, random_state=42)
    scores = pd.Series(mi_scores, index=X_cat.columns)
    return scores

mi_scores_train = calculate_mutual_info(X_train[categorical_cols], y_train)
mi_scores_train_rounded = round(mi_scores_train, 2).sort_values(ascending=False)

print("Mutual Information Scores (Rounded to 2 decimals):")
print(mi_scores_train_rounded)
print(f"Variable with the biggest mutual information score: {mi_scores_train_rounded.index[0]}")


# --- 6. Preparation for Logistic Regression (One-Hot Encoding) ---

# One-hot encode categorical features for all sets
X_train_processed = pd.get_dummies(X_train, columns=categorical_cols, drop_first=False)
X_val_processed = pd.get_dummies(X_val, columns=categorical_cols, drop_first=False)
X_test_processed = pd.get_dummies(X_test, columns=categorical_cols, drop_first=False)

# Align columns to ensure consistency across sets (important for model input)
common_cols = list(set(X_train_processed.columns) & set(X_val_processed.columns) & set(X_test_processed.columns))
X_train_processed = X_train_processed[common_cols]
X_val_processed = X_val_processed[common_cols]
X_test_processed = X_test_processed[common_cols]

# Re-check columns that were not common and add them as 0s if missing in other sets
train_cols = X_train_processed.columns
for df_set in [X_val_processed, X_test_processed]:
    for col in train_cols:
        if col not in df_set.columns:
            df_set[col] = 0

# Final alignment
X_val_processed = X_val_processed[train_cols]
X_test_processed = X_test_processed[train_cols]

# --- 7. Question 4: Base Logistic Regression Accuracy (C=1.0) ---
print("\n--- Question 4: Base Logistic Regression Accuracy ---")

model_base = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model_base.fit(X_train_processed, y_train)
y_pred_val_base = model_base.predict(X_val_processed)
accuracy_base = accuracy_score(y_val, y_pred_val_base)
accuracy_base_rounded = round(accuracy_base, 2)

print(f"Accuracy on validation dataset: {accuracy_base:.4f}")
print(f"Accuracy rounded to 2 decimals: {accuracy_base_rounded}")


# ---Question 5:
print("\n--- Question 5: Feature Elimination ---")

features_to_eliminate = ['industry', 'employment_status', 'lead_score']
accuracy_original = accuracy_base # Use unrounded base accuracy
difference_scores = {}

for feature in features_to_eliminate:
    # 1. Identify columns related to the feature (OHE categories or numerical)
    if feature in numerical_cols:
        cols_to_drop = [feature]
    else: # Categorical feature (drop all OHE columns starting with the name)
        cols_to_drop = [col for col in X_train_processed.columns if col.startswith(f'{feature}_')]

    # 2. Drop features and train new model
    X_train_elim = X_train_processed.drop(columns=cols_to_drop)
    X_val_elim = X_val_processed.drop(columns=cols_to_drop)

    model_elim = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_elim.fit(X_train_elim, y_train)

    # 3. Predict and calculate accuracy
    accuracy_elim = accuracy_score(y_val, model_elim.predict(X_val_elim))

    # 4. Calculate difference: Original Accuracy - New Accuracy
    difference = accuracy_original - accuracy_elim
    difference_scores[feature] = difference

    print(f"  Accuracy without '{feature}': {accuracy_elim:.4f} | Difference: {difference:.5f}")

# Find the feature with the smallest difference (closest to 0, potentially negative)
smallest_diff_feature = min(difference_scores, key=lambda k: difference_scores[k])
smallest_diff = difference_scores[smallest_diff_feature]

print(f"Feature with the smallest difference: '{smallest_diff_feature}'")
print(f"Smallest difference value: {smallest_diff:.5f}")


# Question 6: Regularized Logistic Regression (C Tunning) ---
print("\n--- Question 6: Regularized Logistic Regression (C Tunning) ---")

C_values = [0.01, 0.1, 1, 10, 100]
accuracy_scores = {}

for C in C_values:
    model_reg = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model_reg.fit(X_train_processed, y_train)
    y_pred_val_reg = model_reg.predict(X_val_processed)
    accuracy_reg = accuracy_score(y_val, y_pred_val_reg)
    accuracy_scores[C] = round(accuracy_reg, 3)

print("Accuracy scores for different C values (rounded to 3 decimals):")
print(accuracy_scores)

# Find the best C (highest accuracy, smallest C in case of tie)
best_accuracy = max(accuracy_scores.values())
best_C_candidates = [C for C, acc in accuracy_scores.items() if acc == best_accuracy]
best_C = min(best_C_candidates)

print(f"Best C (smallest C for best accuracy): {best_C}")


--- Data Preparation Stage now Complete ---
Total missing values after imputation: 0

--- Question 1: Mode for 'industry' ---
The most frequent observation (mode) for 'industry' is: retail

--- Question 2: Correlation Matrix ---
The two features with the biggest absolute correlation are: annual_income and interaction_count (0.0270)

--- Data Split (Seed 42) ---
Train size: 877 | Val size: 292 | Test size: 293

--- Question 3: Mutual Information Score ---
Mutual Information Scores (Rounded to 2 decimals):
lead_source          0.04
industry             0.03
employment_status    0.02
location             0.02
dtype: float64
Variable with the biggest mutual information score: lead_source

--- Question 4: Base Logistic Regression Accuracy ---
Accuracy on validation dataset: 0.7432
Accuracy rounded to 2 decimals: 0.74

--- Question 5: Feature Elimination ---
  Accuracy without 'industry': 0.7432 | Difference: 0.00000
  Accuracy without 'employment_status': 0.7466 | Difference: -0.00342
  Acc