In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, precision_recall_curve, accuracy_score
from sklearn.feature_extraction import DictVectorizer # New import for OHE
import tabulate

# --- 0. Data Loading and Initial Setup ---

# Read the comma-separated lead-scoring file into a DataFrame.
df = pd.read_csv('course_lead_scoring.txt', sep=',')

# Separate the label column from the predictor columns.
target = 'converted'
X = df.drop(columns=[target])
y = df[target]

# Partition the predictor columns by dtype: numeric vs. everything else.
numerical_cols = list(X.select_dtypes(include=np.number).columns)
categorical_cols = list(X.select_dtypes(exclude=np.number).columns)

# --- 1. Data Preparation: Missing Values ---

# Build one fill value per column: the 'NA' label for categorical features
# and 0.0 for numerical features, then apply them in a single pass.
fill_values = {name: 'NA' for name in categorical_cols}
fill_values.update({name: 0.0 for name in numerical_cols})
X = X.fillna(value=fill_values)

print("--- Data Preparation Complete ---")
remaining_missing = X.isna().sum().sum()
print(f"Total missing values after imputation: {remaining_missing}")


# --- 2. Data Split (60%/20%/20%, Seed 1) ---
# NOTE: random_state=1 is required for this section of the analysis.
# Stage 1: hold out 20% of all rows as the test set.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)
# Stage 2: carve the validation set out of the remaining 80%. A 0.25 share
# of that remainder is 20% of the original data, leaving 60% for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.25, random_state=1
)

print(f"\n--- Data Split (Seed 1) ---")
n_train, n_val, n_test = len(X_train), len(X_val), len(X_test)
print(f"Train size: {n_train} | Validation size: {n_val} | Test size: {n_test}")


# --- 3. Question 1: ROC AUC Feature Importance ---
print("\n--- Question 1: ROC AUC Feature Importance ---")


def _oriented_auc(labels, scores):
    """Return the ROC AUC of `scores`; if it is below 0.5, re-score with the sign flipped."""
    value = roc_auc_score(labels, scores)
    return roc_auc_score(labels, -scores) if value < 0.5 else value


# Each raw numerical column is used directly as a prediction score on the
# training split.
auc_scores = {
    feature: _oriented_auc(y_train, X_train[feature]) for feature in numerical_cols
}

best_auc_feature = max(auc_scores, key=auc_scores.get)

print("AUC scores for numerical features (Training Set, inverted if AUC < 0.5):")
for feature, feature_auc in auc_scores.items():
    print(f"  {feature}: {feature_auc:.4f}")

print(f"The numerical variable with the highest AUC is: {best_auc_feature}")


# --- 4. Question 2: Training the model and Validation AUC ---
print("\n--- Question 2: Training the Model and Validation AUC ---")

# DictVectorizer (used here for one-hot encoding) takes one dict per row,
# so both splits are converted to lists of records first.
X_train_dict = X_train.to_dict(orient='records')
X_val_dict = X_val.to_dict(orient='records')

# The encoder is fitted on the training rows only; validation rows are
# transformed with that same fitted encoder.
dv = DictVectorizer(sparse=False)
X_train_processed = dv.fit_transform(X_train_dict)
X_val_processed = dv.transform(X_val_dict)

# Fit the logistic regression classifier on the encoded training data.
model = LogisticRegression(
    solver='liblinear', C=1.0, max_iter=1000, random_state=42
).fit(X_train_processed, y_train)

# Column 1 of predict_proba is taken as the score for the validation AUC.
val_class_probs = model.predict_proba(X_val_processed)
y_pred_proba_val = val_class_probs[:, 1]
auc_val = roc_auc_score(y_val, y_pred_proba_val)
auc_val_rounded = round(auc_val, 3)

print(f"AUC of the model on the validation dataset: {auc_val:.4f}")
print(f"AUC rounded to 3 digits: {auc_val_rounded}")


# --- 5. Question 3: Precision and Recall Intersection ---
print("\n--- Question 3: Precision and Recall Intersection ---")

# Precision/recall curve of the validation predictions.
precision, recall, thresholds = precision_recall_curve(y_val, y_pred_proba_val)

# Absolute gap between the two curves, with the final curve point dropped
# so that position i in the gap lines up with thresholds[i].
curve_gap = np.abs(precision - recall)[:-1]

# The threshold with the smallest gap approximates the crossing point.
intersect_index = curve_gap.argmin()
intersect_threshold = thresholds[intersect_index]

print(f"Threshold where precision and recall curves approximately intersect: {intersect_threshold:.3f}")


# --- 6. Question 4: F1 score ---
print("\n--- Question 4: Max F1 Score ---")

# `precision` and `recall` are one element longer than `thresholds`: the
# final curve point has no associated threshold. Drop it so that index i of
# the F1 array always maps to thresholds[i]; indexing `thresholds` with an
# argmax taken over the unsliced arrays could run off the end.
precision_at_thr = precision[:-1]
recall_at_thr = recall[:-1]

# F1 = 2PR / (P + R). Where P + R == 0 the score is defined as 0 instead of
# nudging every denominator with an epsilon, which slightly biases all values.
pr_sum = precision_at_thr + recall_at_thr
F1_scores = np.divide(
    2 * precision_at_thr * recall_at_thr,
    pr_sum,
    out=np.zeros_like(pr_sum, dtype=float),
    where=pr_sum > 0,
)

# Position of the best F1 and the threshold that produced it.
max_f1_index = np.argmax(F1_scores)
max_f1_threshold = thresholds[max_f1_index]

print(f"Maximum F1 Score: {F1_scores[max_f1_index]:.4f}")
print(f"Threshold where F1 is maximal: {max_f1_threshold:.3f}")


# --- 7. Question 5: 5-Fold CV Standard Deviation (C=1.0) ---
print("\n--- Question 5: 5-Fold CV Standard Deviation (C=1.0) ---")

# Cross-validation runs on the train and validation rows stacked together.
X_full_train = pd.concat([X_train, X_val])
y_full_train = pd.concat([y_train, y_val])
X_full_train_dict = X_full_train.to_dict(orient='records')

# One encoder is fitted once on the whole stacked set; the folds below
# slice rows out of the resulting matrix.
dv_cv = DictVectorizer(sparse=False)
X_cv_processed = dv_cv.fit_transform(X_full_train_dict)

# Shuffled 5-fold splitter with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=1)


def _fold_auc(fit_rows, eval_rows):
    """Fit a C=1.0 logistic regression on `fit_rows` and return its AUC on `eval_rows`."""
    fold_model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    fold_model.fit(X_cv_processed[fit_rows], y_full_train.iloc[fit_rows])
    fold_scores = fold_model.predict_proba(X_cv_processed[eval_rows])[:, 1]
    return roc_auc_score(y_full_train.iloc[eval_rows], fold_scores)


# One AUC per fold, in the order the splitter yields the folds.
cv_auc_scores = [
    _fold_auc(fit_rows, eval_rows) for fit_rows, eval_rows in kf.split(X_full_train)
]

std_auc = np.std(cv_auc_scores)
std_auc_rounded = round(std_auc, 4)

print(f"Standard deviation of AUC scores across 5 folds: {std_auc:.6f}")
print(f"Standard deviation rounded to 4 decimals: {std_auc_rounded}")


# --- 8. Question 6: Hyperparameter Tuning (C) ---
print("\n--- Question 6: Hyperparameter Tuning (C) ---")

# Candidate regularization settings to compare.
C_values = [0.000001, 0.001, 1]
results = []

for C in C_values:
    cv_auc_scores = []

    # 5-fold CV for the current C, reusing the splitter and the encoded
    # matrix prepared for Question 5.
    for train_idx, val_idx in kf.split(X_full_train):

        X_train_fold, X_val_fold = X_cv_processed[train_idx], X_cv_processed[val_idx]
        y_train_fold, y_val_fold = y_full_train.iloc[train_idx], y_full_train.iloc[val_idx]

        model_cv = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
        model_cv.fit(X_train_fold, y_train_fold)

        y_pred_proba_val_fold = model_cv.predict_proba(X_val_fold)[:, 1]
        auc_fold = roc_auc_score(y_val_fold, y_pred_proba_val_fold)
        cv_auc_scores.append(auc_fold)

    mean_auc = np.mean(cv_auc_scores)
    std_auc = np.std(cv_auc_scores)

    # Scores are rounded to 3 decimals before comparison, so candidates that
    # tie at that precision are separated by the tie-break columns below.
    results.append({
        'C': C,
        'mean_auc': round(mean_auc, 3),
        'std_auc': round(std_auc, 3)
    })

print("CV Results:")
df_results = pd.DataFrame(results)
# DataFrame.to_markdown depends on the optional 'tabulate' backend; when
# pandas cannot load it, fall back to the plain-text table so the section
# still completes and reports the best C.
try:
    results_table = df_results.to_markdown(index=False)
except ImportError:
    results_table = df_results.to_string(index=False)
print(results_table)

# Rank by highest mean AUC, then lowest std, then smallest C; take the top row.
best_c_result = df_results.sort_values(by=['mean_auc', 'std_auc', 'C'], ascending=[False, True, True]).iloc[0]

print(f"\nBest C (Highest Mean AUC, then Lowest STD, then Smallest C): {best_c_result['C']}")


--- Data Preparation Complete ---
Total missing values after imputation: 0

--- Data Split (Seed 1) ---
Train size: 876 | Validation size: 293 | Test size: 293

--- Question 1: ROC AUC Feature Importance ---
AUC scores for numerical features (Training Set, inverted if AUC < 0.5):
  number_of_courses_viewed: 0.7636
  annual_income: 0.5520
  interaction_count: 0.7383
  lead_score: 0.6145
The numerical variable with the highest AUC is: number_of_courses_viewed

--- Question 2: Training the Model and Validation AUC ---
AUC of the model on the validation dataset: 0.8171
AUC rounded to 3 digits: 0.817

--- Question 3: Precision and Recall Intersection ---
Threshold where precision and recall curves approximately intersect: 0.644

--- Question 4: Max F1 Score ---
Maximum F1 Score: 0.8154
Threshold where F1 is maximal: 0.554

--- Question 5: 5-Fold CV Standard Deviation (C=1.0) ---
Standard deviation of AUC scores across 5 folds: 0.018346
Standard deviation rounded to 4 decimals: 0.0183

--- Q

ImportError: Missing optional dependency 'tabulate'.  Use pip or conda to install tabulate.