Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from itertools import product
from sklearn.preprocessing import LabelEncoder
# Do not truncate long cell values when DataFrames are printed.
pd.set_option('display.max_colwidth', None)

# Read the training set from its CSV file.
train_path = '/Users/yingyunqian/Desktop/BT4240 ML/train_data.csv'
train_df = pd.read_csv(train_path)

# Split the frame: the 'Target' column is the label, every other column is a feature.
y = train_df['Target']
X = train_df.drop('Target', axis=1)

# Extended hyperparameter search space
from itertools import product
import random

max_depth = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, None]
min_samples_split = [2, 5, 10, 20, 25, 30, 35, 40]
min_samples_leaf = [1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12]
criterion = ['gini', 'entropy']
max_features = [None, 'sqrt', 'log2']
splitter = ['best', 'random']

# Full Cartesian product of the search space:
# 11 * 8 * 11 * 2 * 3 * 2 = 11,616 combinations, each a tuple in the order
# (max_depth, min_samples_split, min_samples_leaf, criterion, max_features, splitter).
param_grid = list(product(max_depth, min_samples_split, min_samples_leaf, criterion, max_features, splitter))

# Shuffle with a fixed seed and keep at most MAX_RANDOM_CONFIGS combinations.
# The cap is currently larger than the grid, so every combination is kept
# (in shuffled order); lower it to evaluate only a random sample.
MAX_RANDOM_CONFIGS = 13000
random.seed(42)
random.shuffle(param_grid)
param_grid = param_grid[:MAX_RANDOM_CONFIGS]


# Hand-picked parameter sets that must be evaluated even if the random
# sample above happens to leave them out.
new_param_suggestions = [
    {'max_depth': 5, 'min_samples_split': 20, 'min_samples_leaf': 5, 'criterion': 'entropy', 'max_features': None, 'splitter': 'best'},

    # Add more custom combinations as you like
]

# Field order of the tuples produced by itertools.product above.
PARAM_ORDER = (
    'max_depth',
    'min_samples_split',
    'min_samples_leaf',
    'criterion',
    'max_features',
    'splitter',
)

# Convert the suggestion dicts to tuples so they match the grid entries.
new_param_grid = [
    tuple(suggestion[name] for name in PARAM_ORDER)
    for suggestion in new_param_suggestions
]

# Queue each suggestion exactly once, and only if it is not already in the
# grid: re-evaluating an identical combination costs a full cross-validation
# run and only adds a duplicate row to the ranking.
already_queued = set(param_grid)
for combo in new_param_grid:
    if combo not in already_queued:
        param_grid.append(combo)
        already_queued.add(combo)
# Cross-validate one DecisionTreeClassifier per parameter combination and
# collect the mean accuracy / precision / recall / F1 over the folds.
results = []
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# The splitter is seeded, so every call to split() yields the same folds.
# Slice the fold data once here instead of re-splitting and re-slicing the
# frame for every parameter combination.
cv_folds = [
    (X.iloc[train_idx], X.iloc[val_idx], y.iloc[train_idx], y.iloc[val_idx])
    for train_idx, val_idx in skf.split(X, y)
]

# DecisionTreeClassifier keyword names, in the tuple order used by param_grid.
param_names = (
    'max_depth',
    'min_samples_split',
    'min_samples_leaf',
    'criterion',
    'max_features',
    'splitter',
)

for combo in param_grid:
    params = dict(zip(param_names, combo))
    accuracies, precisions, recalls, f1s = [], [], [], []

    for X_train, X_val, y_train, y_val in cv_folds:
        # Fixed random_state keeps 'random' splitter / max_features runs reproducible.
        clf = DecisionTreeClassifier(**params, random_state=42)

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_val)

        # NOTE(review): precision/recall/F1 are called with their default
        # averaging, which presumes a two-class Target -- confirm.
        accuracies.append(accuracy_score(y_val, y_pred))
        precisions.append(precision_score(y_val, y_pred))
        recalls.append(recall_score(y_val, y_pred))
        f1s.append(f1_score(y_val, y_pred))

    results.append({
        'params': params,
        'accuracy': np.mean(accuracies),
        'precision': np.mean(precisions),
        'recall': np.mean(recalls),
        'f1_score': np.mean(f1s)
    })

# Tabulate the cross-validated scores, one row per parameter combination.
results_df = pd.DataFrame(results)

# Rank by mean F1 score (best first) and keep the five leading rows.
ranked_by_f1 = results_df.sort_values(by='f1_score', ascending=False)
top_5_models = ranked_by_f1.head(5)

# Columns shown in the printed report, in display order.
report_columns = ['params', 'accuracy', 'precision', 'recall', 'f1_score']

print("Top 5 Decision Tree Models Based on F1 Score:\n")
print(top_5_models[report_columns])
from sklearn.linear_model import LogisticRegression

# Logistic Regression Baseline
# One list of per-fold scores for each metric.
log_acc, log_prec, log_rec, log_f1 = [], [], [], []

# Pair every score list with the metric function that fills it.
fold_metrics = (
    (log_acc, accuracy_score),
    (log_prec, precision_score),
    (log_rec, recall_score),
    (log_f1, f1_score),
)

log_model = LogisticRegression(max_iter=1000, random_state=42)

for train_idx, val_idx in skf.split(X, y):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # Refit the same estimator on this fold's training rows, then predict
    # the held-out rows.
    y_pred = log_model.fit(X_train, y_train).predict(X_val)

    for scores, metric in fold_metrics:
        scores.append(metric(y_val, y_pred))

# Report the mean of each metric over the folds.
print("📊 Logistic Regression Baseline Performance (5-Fold CV):")
for label, scores in (
    ("Accuracy:  ", log_acc),
    ("Precision: ", log_prec),
    ("Recall:    ", log_rec),
    ("F1 Score:  ", log_f1),
):
    print(f"{label}{np.mean(scores):.4f}")

Fold 1


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


20/20 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step
Accuracy: 0.8484
Precision: 0.7975
Recall: 0.6702
F1 Score: 0.7283
----------------------------------------
Fold 2


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


20/20 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step
Accuracy: 0.8708
Precision: 0.8973
Recall: 0.7313
F1 Score: 0.8058
----------------------------------------
Fold 3


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


20/20 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step
Accuracy: 0.8368
Precision: 0.7565
Recall: 0.7300
F1 Score: 0.7430
----------------------------------------
Fold 4


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


20/20 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step
Accuracy: 0.8611
Precision: 0.7660
Recall: 0.7742
F1 Score: 0.7701
----------------------------------------
Fold 5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


20/20 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step
Accuracy: 0.8498
Precision: 0.8571
Recall: 0.6218
F1 Score: 0.7207
----------------------------------------
Average metrics across all folds:
Accuracy: 0.8534
Precision: 0.8149
Recall: 0.7055
F1 Score: 0.7536
