In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, hamming_loss, f1_score, matthews_corrcoef, classification_report
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import PowerTransformer, StandardScaler
import joblib
from scipy.stats import randint

# Load the dataset: a tab-separated file with one row per sample, containing
# the feature columns and the four label columns listed below.
df = pd.read_csv('final_dataset.tsv', sep='\t')
feature_cols = ['mH2', 'mHD', 'mAD', 'mHDp', 'alpha', 'L2', 'L8', 'vs', 'm22sq']
label_cols = ['valid_BFB', 'valid_Uni', 'valid_STU', 'valid_Higgs']

X = df[feature_cols].copy()
y = df[label_cols]

# Split BEFORE fitting any preprocessing so that no statistics from the
# validation/test rows leak into the transformers. With a fixed random_state
# the row partition depends only on the number of samples, so the same rows
# end up in each split as when splitting already-transformed data.

# Step 1: Split into 70% training and 30% validation+test
X_train, X_val_test, y_train, y_val_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 2: Split the 30% into 15% validation and 15% test
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.5, random_state=42)

# Preprocess the data: fit the transformers on the training split only, then
# apply the fitted transformers to the validation and test splits.
# NOTE(review): PowerTransformer standardizes its output by default, so the
# StandardScaler is likely close to a no-op; it is kept so that both saved
# artifacts remain available to downstream consumers.
pt = PowerTransformer(method='yeo-johnson')
scaler = StandardScaler()
X_train = scaler.fit_transform(pt.fit_transform(X_train))
X_val = scaler.transform(pt.transform(X_val))
X_test = scaler.transform(pt.transform(X_test))

# Save the (training-only) fitted transformers so inference code can apply
# exactly the same preprocessing.
output_dir = 'baseline'
os.makedirs(output_dir, exist_ok=True)
joblib.dump(pt, os.path.join(output_dir, 'power_transformer.pkl'))
joblib.dump(scaler, os.path.join(output_dir, 'scaler.pkl'))

# Search space sampled by RandomizedSearchCV.
# scipy's randint(low, high) draws integers from the half-open range
# [low, high), so the upper bound itself is never sampled. Lists are sampled
# uniformly from their elements.
param_dist = dict(
    # Number of trees: 100..499
    n_estimators=randint(100, 500),
    # Maximum tree depth; None leaves the depth unrestricted
    max_depth=[None, 10, 30],
    # Minimum samples required to split an internal node: 2..19
    min_samples_split=randint(2, 20),
    # Minimum samples required at a leaf: 1..9
    min_samples_leaf=randint(1, 10),
    # Number of features considered per split; None means all features
    max_features=['sqrt', 'log2', None],
    # Split quality measure
    criterion=['gini', 'entropy'],
    # Whether each tree is trained on a bootstrap sample
    bootstrap=[True, False],
)

# Base Random Forest whose hyperparameters are tuned below; the fixed seed
# keeps the individual fits reproducible.
rf_model = RandomForestClassifier(random_state=42)

# Settings of the randomized search: 10 sampled configurations, each scored
# with 3-fold cross-validation on all available cores.
# NOTE(review): y has several label columns; assuming they are binary, the
# 'accuracy' scorer is then the exact-match (subset) accuracy -- confirm.
search_settings = {
    'n_iter': 10,
    'cv': 3,
    'scoring': 'accuracy',
    'n_jobs': -1,
    'random_state': 42,
}
random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist,
    **search_settings,
)

# Run the search on the training split only.
random_search.fit(X_train, y_train)

# Report the winning configuration.
best_params = random_search.best_params_
print(f"Best Hyperparameters: {best_params}")

# Refit the model using the best parameters on the combined training + validation data.
# random_search.best_estimator_ is fitted on X_train only, so taking it as-is
# would leave the validation split unused. Build a fresh, unfitted forest with
# exactly the same parameters (including random_state) and train it on both
# splits; the test split stays untouched for the final evaluation.
best_rf_model = RandomForestClassifier(**random_search.best_estimator_.get_params())
X_train_val = np.vstack([X_train, X_val])
y_train_val = pd.concat([y_train, y_val])
best_rf_model.fit(X_train_val, y_train_val)

# Use the test set to evaluate the final model
y_pred = best_rf_model.predict(X_test)

# Compute subset accuracy on the test set: a row counts as correct only when
# every label in that row is predicted correctly.
correct_rows = np.all(y_pred == y_test.values, axis=1)
subset_accuracy = np.mean(correct_rows)

# Print the subset accuracy
print(f"Subset Accuracy on Test Set: {subset_accuracy}")

# Compute and print additional metrics
hamming = hamming_loss(y_test, y_pred)
# zero_division=0 matches the classification report below and avoids warnings
# for labels that are never predicted; the resulting score is unchanged.
f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)
# Pooled MCC: all (sample, label) decisions are flattened into one binary
# problem. Per-label values are computed separately further down.
mcc = matthews_corrcoef(y_test.values.ravel(), y_pred.ravel())  # Flattened for multi-label
report = classification_report(y_test, y_pred, target_names=label_cols, zero_division=0)

print(f"Hamming Loss: {hamming}")
print(f"Macro F1 Score: {f1}")
print(f"Matthews Correlation Coefficient: {mcc}")
print(f"Classification Report:\n{report}")

# MCC per label: column i of y_pred corresponds to label_cols[i]
mcc_per_label = [matthews_corrcoef(y_test[col], y_pred[:, i]) for i, col in enumerate(label_cols)]
print(f"MCC per label: {mcc_per_label}")

# Persist the fitted model.
model_path = os.path.join(output_dir, 'best_random_forest_model.pkl')
joblib.dump(best_rf_model, model_path)

# Persist the evaluation summary as plain text, one metric per line followed
# by the multi-line classification report.
results_path = os.path.join(output_dir, 'best_model_results.txt')
summary_lines = [
    f"Best Hyperparameters: {best_params}\n",
    f"Subset Accuracy: {subset_accuracy}\n",
    f"Hamming Loss: {hamming}\n",
    f"Macro F1 Score: {f1}\n",
    f"Matthews Correlation Coefficient: {mcc}\n",
    f"MCC per label: {mcc_per_label}\n",
    f"Classification Report:\n{report}\n",
]
with open(results_path, 'w') as f:
    f.writelines(summary_lines)

# Test-set predictions, one column per label.
predictions_path = os.path.join(output_dir, 'best_test_predictions.csv')
predictions_df = pd.DataFrame(y_pred, columns=label_cols)
predictions_df.to_csv(predictions_path, index=False)

# Preprocessed test features, one column per input feature, so predictions
# can be inspected next to the inputs the model actually saw.
features_path = os.path.join(output_dir, 'processed_test_features.csv')
X_test_df = pd.DataFrame(X_test, columns=feature_cols)
X_test_df.to_csv(features_path, index=False)
