# Baseline model (Bag of Words approach)

Naive model using a non-contextual model

In [1]:
import sys
import os
import random
import numpy as np

# Get the current directory
current_dir = os.getcwd()
print(f"Current working directory: {current_dir}")

# Add the parent directory to the path if needed
parent_dir = os.path.abspath(os.path.join(current_dir, ".."))
sys.path.append(parent_dir)
print(f"Added to path: {parent_dir}")

# set global seeds for reproducability
random.seed(161)
np.random.seed(161)
os.environ['PYTHONHASHSEED'] = '161'

Current working directory: c:\Users\felix\Documents\GIT\Hertie\PACT-ML\modules
Added to path: c:\Users\felix\Documents\GIT\Hertie\PACT-ML


## Data preparation

In [2]:
import pandas as pd
from pyreadr import read_r
from modules.helpers.validity_check import fuzzy_match_report_key

para_data = pd.read_csv("..\data\PACT_paragraphs_training.csv")

report_data = pd.read_csv("..\data\paragraphs.csv", sep=';')

report_data["matchingKey"] = report_data["report_namePKO"].str.replace('/', '_')

# reduced data to 7 target categories with most codings
target_categories = [
    "PoliceReform",
    "Operations_PatrolsInterventions",
    "StateAdministration",
    "RefugeeAssistance",
    "ElectionAssistance",
    "LegalReform",
    "CivilSocietyAssistance"
]

report_data = report_data[["matchingKey", "paragraphNumber"] + target_categories]

report_data[target_categories] = report_data[target_categories].map(lambda x: isinstance(x, str))

# check if a paragraph led to two codings
multi_coded = report_data.copy()
multi_coded["num_labels"] = report_data[target_categories].sum(axis=1)

multi_coded = multi_coded[multi_coded["num_labels"] > 1]

print(f"{len(multi_coded)} paragraphs have multiple codings.")


472 paragraphs have multiple codings.


In [3]:

# number of coded paragraphs in PACT2.0 that have text in the training data
merged = pd.merge(
    report_data[["matchingKey", "paragraphNumber"]],
    para_data[["matchingKey", "paragraphNumber"]],
    on=["matchingKey", "paragraphNumber"],
    how="left",
    indicator=True
)

matching_pairs = (merged["_merge"] == "both").sum()
print(f"Rows matching on both matchingKey and paragraphNumber: {matching_pairs}")

# number of multi-label paragraphs in PACT2.0 that have text in the training data
merged_multi = pd.merge(
    multi_coded[["matchingKey", "paragraphNumber"]],
    para_data[["matchingKey", "paragraphNumber"]],
    on=["matchingKey", "paragraphNumber"],
    how="left",
    indicator=True
)

matching_pairs_multi = (merged_multi["_merge"] == "both").sum()
print(f"Rows matching on both matchingKey and paragraphNumber for multi-coded paragraphs: {matching_pairs_multi}")

# number of paragraphs for which parsing failed
failed_parsing = report_data["matchingKey"].isin(para_data["matchingKey"]).sum() - matching_pairs
print(f"Number of coded paragraphs for which text parsing failed: {failed_parsing}")

Rows matching on both matchingKey and paragraphNumber: 1819
Rows matching on both matchingKey and paragraphNumber for multi-coded paragraphs: 256
Number of coded paragraphs for which text parsing failed: 29


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

merged_data = pd.merge(
    para_data,
    report_data,
    on=["matchingKey", "paragraphNumber"],
    how="left"
)

merged_data.to_csv("../data/merged_data.csv", index=False)

# Check the shape of the merged data
print(f"Shape of merged data: {merged_data.shape}")

vectorizer = CountVectorizer(stop_words='english', min_df=5, max_df=0.95)
X = vectorizer.fit_transform(merged_data["paragraph"])

# tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=5, max_df=0.95)
# X_tfidf = tfidf_vectorizer.fit_transform(merged_data['paragraph'])

df_bow = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
# df_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
print(f"Number of features for Count Vectorizer: {df_bow.shape[1]}")
# print(f"Number of features for TF-IDF: {df_tfidf.shape[1]}")

Shape of merged data: (6029, 12)
Number of features for Count Vectorizer: 6144


## Iterative Stratification K-fold CV

To make the most of our limited multi-label data, we use Iterative Stratification K-fold CV across all our models.

In [5]:

from skmultilearn.model_selection import IterativeStratification

Y = merged_data[target_categories].fillna(False).astype(int).values

# Set up Iterative Stratification
n_splits = 5
stratifier = IterativeStratification(n_splits=n_splits, order=1)



## Logistic Model + Random Forest using BoW
Now, we can finally calculate our first two models.

In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report, multilabel_confusion_matrix, precision_score, recall_score

best_params_rf = {'estimator__max_depth': None, 'estimator__min_samples_split': 2, 'estimator__n_estimators': 75}

rf_params = {k.replace('estimator__', ''): v for k, v in best_params_rf.items()}

models = {
    "Logistic Regression": OneVsRestClassifier(LogisticRegression(max_iter=1000, C= 1.0, penalty='l1', solver='liblinear')),
    "Balanced Logistic Regression": OneVsRestClassifier(LogisticRegression(class_weight='balanced', max_iter=1000, C= 0.1, penalty='l1', solver='liblinear')),
    "Random Forest": RandomForestClassifier(**rf_params)
}

label_names = merged_data[target_categories].columns.tolist()

results = []

for model_name, model in models.items():
    print(f"\n--- {model_name} ---")

    # Store metrics per fold, per label
    fold_metrics = []

    f1_scores = []

    for train_idx, test_idx in stratifier.split(X, Y):
        X_train, X_test = X[train_idx], X[test_idx]
        Y_train, Y_test = Y[train_idx], Y[test_idx]

        model.fit(X_train, Y_train)
        Y_pred = model.predict(X_test)

        Y_pred = Y_pred.toarray() if hasattr(Y_pred, 'toarray') else Y_pred

        f1_micro = f1_score(Y_test, Y_pred, average='micro')  # or 'macro'
        f1_macro = f1_score(Y_test, Y_pred, average='macro')
        # save to display later
        f1_scores.append({'f1_micro': f1_micro, 'f1_macro': f1_macro})
        f1_scores_df = pd.DataFrame(f1_scores)

        # Compute confusion matrices per label
        cm = multilabel_confusion_matrix(Y_test, Y_pred)
        precision_per_label = precision_score(Y_test, Y_pred, average=None, zero_division=0)
        recall_per_label = recall_score(Y_test, Y_pred, average=None, zero_division=0)

        # Show false posotives and false negatives per label
        for i, label in enumerate(label_names):
            tn, fp, fn, tp = cm[i].ravel()

            # Use sklearn's calculated metrics
            precision = precision_per_label[i]
            recall = recall_per_label[i]

            fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
            fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
            tpr = tp / (tp + fn) if (tp + fn) > 0 else 0
            tnr = tn / (tn + fp) if (tn + fp) > 0 else 0

            fold_metrics.append({
                'Model': model_name,
                'Fold': len(fold_metrics) // len(label_names) + 1,
                'Label': label,
                'F1_micro': f1_micro,
                'F1_macro': f1_macro,
                'Precision': precision,
                'Recall': recall,
                'FPR': fpr,
                'FNR': fnr,
                'TPR': tpr,
                'TNR': tnr,
                'TP': tp,
                'FP': fp,
                'FN': fn,
                'TN': tn
            })

        # Convert per-fold metrics to DataFrame
        fold_metrics_df = pd.DataFrame(fold_metrics)

        # Append summary to global results list
        results.append(fold_metrics_df)

        print(f"Average F1 (micro): {np.mean(f1_scores_df.f1_micro):.4f}")
        print(f"Average F1 (macro): {np.mean(f1_scores_df.f1_macro):.4f}")
        print(classification_report(Y_test, Y_pred, target_names=label_names))


    # Combine all models into one DataFrame
    final_results_df = pd.concat(results, ignore_index=True)

    # Save to CSV
    final_results_df.to_csv('../out/model_performance_summary.csv', index=False)

    ## somehow still duplicates in the .csv, I will just filter when I read them in for the report



--- Logistic Regression ---
Average F1 (micro): 0.5042
Average F1 (macro): 0.4956
                                 precision    recall  f1-score   support

                   PoliceReform       0.61      0.46      0.52       105
Operations_PatrolsInterventions       0.69      0.60      0.64        30
            StateAdministration       0.53      0.43      0.47        44
              RefugeeAssistance       0.44      0.35      0.39        20
             ElectionAssistance       0.92      0.50      0.65        22
                    LegalReform       0.64      0.27      0.38        26
         CivilSocietyAssistance       0.52      0.34      0.42        32

                      micro avg       0.60      0.43      0.50       279
                      macro avg       0.62      0.42      0.50       279
                   weighted avg       0.61      0.43      0.50       279
                    samples avg       0.08      0.08      0.08       279



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Average F1 (micro): 0.4887
Average F1 (macro): 0.4735
                                 precision    recall  f1-score   support

                   PoliceReform       0.60      0.43      0.50       105
Operations_PatrolsInterventions       0.59      0.53      0.56        30
            StateAdministration       0.55      0.40      0.46        43
              RefugeeAssistance       0.50      0.20      0.29        20
             ElectionAssistance       0.53      0.43      0.48        23
                    LegalReform       0.50      0.40      0.44        25
         CivilSocietyAssistance       0.46      0.41      0.43        32

                      micro avg       0.55      0.41      0.47       278
                      macro avg       0.53      0.40      0.45       278
                   weighted avg       0.55      0.41      0.47       278
                    samples avg       0.08      0.07      0.07       278



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Average F1 (micro): 0.4894
Average F1 (macro): 0.4635
                                 precision    recall  f1-score   support

                   PoliceReform       0.58      0.51      0.55       105
Operations_PatrolsInterventions       0.64      0.47      0.54        30
            StateAdministration       0.73      0.50      0.59        44
              RefugeeAssistance       0.50      0.38      0.43        21
             ElectionAssistance       0.60      0.27      0.38        22
                    LegalReform       0.33      0.15      0.21        26
         CivilSocietyAssistance       0.50      0.34      0.41        32

                      micro avg       0.58      0.42      0.49       280
                      macro avg       0.55      0.38      0.44       280
                   weighted avg       0.57      0.42      0.48       280
                    samples avg       0.08      0.08      0.08       280



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Average F1 (micro): 0.4856
Average F1 (macro): 0.4561
                                 precision    recall  f1-score   support

                   PoliceReform       0.65      0.50      0.57       105
Operations_PatrolsInterventions       0.54      0.47      0.50        30
            StateAdministration       0.45      0.32      0.37        44
              RefugeeAssistance       0.50      0.33      0.40        21
             ElectionAssistance       0.33      0.23      0.27        22
                    LegalReform       0.62      0.38      0.48        26
         CivilSocietyAssistance       0.52      0.39      0.45        33

                      micro avg       0.56      0.41      0.47       281
                      macro avg       0.52      0.38      0.43       281
                   weighted avg       0.56      0.41      0.47       281
                    samples avg       0.07      0.07      0.07       281



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Average F1 (micro): 0.4903
Average F1 (macro): 0.4643
                                 precision    recall  f1-score   support

                   PoliceReform       0.56      0.46      0.51       105
Operations_PatrolsInterventions       0.89      0.55      0.68        31
            StateAdministration       0.64      0.48      0.55        44
              RefugeeAssistance       0.50      0.29      0.36        21
             ElectionAssistance       0.55      0.52      0.53        23
                    LegalReform       0.58      0.27      0.37        26
         CivilSocietyAssistance       0.54      0.44      0.48        32

                      micro avg       0.60      0.44      0.51       282
                      macro avg       0.61      0.43      0.50       282
                   weighted avg       0.60      0.44      0.51       282
                    samples avg       0.09      0.08      0.08       282


--- Balanced Logistic Regression ---


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Average F1 (micro): 0.4519
Average F1 (macro): 0.4273
                                 precision    recall  f1-score   support

                   PoliceReform       0.40      0.76      0.53       105
Operations_PatrolsInterventions       0.37      0.74      0.49        31
            StateAdministration       0.28      0.68      0.39        44
              RefugeeAssistance       0.27      0.57      0.37        21
             ElectionAssistance       0.30      0.68      0.42        22
                    LegalReform       0.23      0.58      0.33        26
         CivilSocietyAssistance       0.32      0.84      0.46        32

                      micro avg       0.33      0.72      0.45       281
                      macro avg       0.31      0.69      0.43       281
                   weighted avg       0.34      0.72      0.46       281
                    samples avg       0.12      0.14      0.12       281



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Average F1 (micro): 0.4723
Average F1 (macro): 0.4504
                                 precision    recall  f1-score   support

                   PoliceReform       0.46      0.80      0.58       105
Operations_PatrolsInterventions       0.48      0.83      0.61        30
            StateAdministration       0.30      0.66      0.41        44
              RefugeeAssistance       0.30      0.90      0.44        20
             ElectionAssistance       0.26      0.77      0.39        22
                    LegalReform       0.28      0.81      0.42        26
         CivilSocietyAssistance       0.31      0.81      0.45        32

                      micro avg       0.36      0.79      0.49       279
                      macro avg       0.34      0.80      0.47       279
                   weighted avg       0.38      0.79      0.50       279
                    samples avg       0.12      0.14      0.12       279



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Average F1 (micro): 0.4613
Average F1 (macro): 0.4443
                                 precision    recall  f1-score   support

                   PoliceReform       0.39      0.73      0.51       105
Operations_PatrolsInterventions       0.38      0.80      0.52        30
            StateAdministration       0.21      0.55      0.30        44
              RefugeeAssistance       0.34      0.81      0.48        21
             ElectionAssistance       0.28      0.65      0.39        23
                    LegalReform       0.29      0.76      0.42        25
         CivilSocietyAssistance       0.27      0.81      0.41        32

                      micro avg       0.32      0.72      0.44       280
                      macro avg       0.31      0.73      0.43       280
                   weighted avg       0.33      0.72      0.45       280
                    samples avg       0.11      0.13      0.12       280



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Average F1 (micro): 0.4648
Average F1 (macro): 0.4458
                                 precision    recall  f1-score   support

                   PoliceReform       0.47      0.80      0.59       105
Operations_PatrolsInterventions       0.42      0.80      0.55        30
            StateAdministration       0.27      0.79      0.40        43
              RefugeeAssistance       0.26      0.67      0.38        21
             ElectionAssistance       0.28      0.83      0.42        23
                    LegalReform       0.28      0.73      0.40        26
         CivilSocietyAssistance       0.27      0.75      0.40        32

                      micro avg       0.34      0.78      0.48       280
                      macro avg       0.32      0.77      0.45       280
                   weighted avg       0.36      0.78      0.49       280
                    samples avg       0.13      0.15      0.13       280



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Average F1 (micro): 0.4655
Average F1 (macro): 0.4455
                                 precision    recall  f1-score   support

                   PoliceReform       0.43      0.74      0.54       105
Operations_PatrolsInterventions       0.35      0.67      0.46        30
            StateAdministration       0.36      0.75      0.49        44
              RefugeeAssistance       0.29      0.80      0.43        20
             ElectionAssistance       0.26      0.73      0.38        22
                    LegalReform       0.26      0.69      0.38        26
         CivilSocietyAssistance       0.30      0.79      0.43        33

                      micro avg       0.34      0.74      0.47       280
                      macro avg       0.32      0.74      0.44       280
                   weighted avg       0.35      0.74      0.48       280
                    samples avg       0.12      0.13      0.12       280


--- Random Forest ---


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Average F1 (micro): 0.1772
Average F1 (macro): 0.1295
                                 precision    recall  f1-score   support

                   PoliceReform       0.88      0.13      0.23       105
Operations_PatrolsInterventions       0.80      0.13      0.22        31
            StateAdministration       0.89      0.18      0.30        44
              RefugeeAssistance       1.00      0.05      0.10        20
             ElectionAssistance       0.00      0.00      0.00        22
                    LegalReform       0.00      0.00      0.00        25
         CivilSocietyAssistance       0.33      0.03      0.06        33

                      micro avg       0.78      0.10      0.18       280
                      macro avg       0.56      0.07      0.13       280
                   weighted avg       0.67      0.10      0.17       280
                    samples avg       0.02      0.02      0.02       280



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Average F1 (micro): 0.1819
Average F1 (macro): 0.1365
                                 precision    recall  f1-score   support

                   PoliceReform       0.94      0.14      0.25       105
Operations_PatrolsInterventions       1.00      0.13      0.24        30
            StateAdministration       0.88      0.16      0.27        44
              RefugeeAssistance       1.00      0.05      0.10        20
             ElectionAssistance       1.00      0.04      0.08        23
                    LegalReform       1.00      0.04      0.07        26
         CivilSocietyAssistance       0.00      0.00      0.00        32

                      micro avg       0.94      0.10      0.19       280
                      macro avg       0.83      0.08      0.14       280
                   weighted avg       0.84      0.10      0.18       280
                    samples avg       0.02      0.02      0.02       280



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Average F1 (micro): 0.1773
Average F1 (macro): 0.1293
                                 precision    recall  f1-score   support

                   PoliceReform       0.89      0.15      0.26       105
Operations_PatrolsInterventions       1.00      0.17      0.29        30
            StateAdministration       1.00      0.09      0.17        44
              RefugeeAssistance       1.00      0.05      0.09        21
             ElectionAssistance       0.00      0.00      0.00        23
                    LegalReform       0.00      0.00      0.00        26
         CivilSocietyAssistance       0.00      0.00      0.00        32

                      micro avg       0.93      0.09      0.17       281
                      macro avg       0.56      0.07      0.11       281
                   weighted avg       0.67      0.09      0.16       281
                    samples avg       0.02      0.02      0.02       281



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Average F1 (micro): 0.1855
Average F1 (macro): 0.1308
                                 precision    recall  f1-score   support

                   PoliceReform       0.95      0.17      0.29       105
Operations_PatrolsInterventions       1.00      0.17      0.29        30
            StateAdministration       1.00      0.23      0.37        44
              RefugeeAssistance       0.00      0.00      0.00        21
             ElectionAssistance       0.00      0.00      0.00        22
                    LegalReform       0.00      0.00      0.00        26
         CivilSocietyAssistance       0.00      0.00      0.00        32

                      micro avg       0.97      0.12      0.21       280
                      macro avg       0.42      0.08      0.14       280
                   weighted avg       0.62      0.12      0.20       280
                    samples avg       0.02      0.02      0.02       280



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Average F1 (micro): 0.1834
Average F1 (macro): 0.1308
                                 precision    recall  f1-score   support

                   PoliceReform       0.83      0.14      0.24       105
Operations_PatrolsInterventions       1.00      0.20      0.33        30
            StateAdministration       1.00      0.07      0.13        43
              RefugeeAssistance       1.00      0.05      0.09        21
             ElectionAssistance       0.00      0.00      0.00        22
                    LegalReform       0.00      0.00      0.00        26
         CivilSocietyAssistance       1.00      0.06      0.12        32

                      micro avg       0.90      0.10      0.17       279
                      macro avg       0.69      0.07      0.13       279
                   weighted avg       0.77      0.10      0.17       279
                    samples avg       0.02      0.02      0.02       279



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### Grid Search for best parameters for logistic regression

In [None]:
from sklearn.model_selection import GridSearchCV

# Logistic Regression Hyperparameter Grid
param_grid_lr = {
    'estimator__C': [0.1, 1.0, 10.0],
    'estimator__penalty': ['l1', 'l2'],
    'estimator__solver': ['liblinear']  # Only solver supporting l1
}

# Model wrapper
base_lr = OneVsRestClassifier(LogisticRegression(max_iter=1000))
balanced_lr = OneVsRestClassifier(LogisticRegression(class_weight='balanced', max_iter=1000))

# For storing metrics
# all_f1_scores = []

# # Grid Search for base LR
# for fold_idx, (train_idx, test_idx) in enumerate(stratifier.split(X, Y)):
#     print(f"\n📂 Fold {fold_idx + 1}/{n_splits}")

#     X_train, X_test = X[train_idx], X[test_idx]
#     y_train, y_test = Y[train_idx], Y[test_idx]

#     # Grid search
#     grid_search = GridSearchCV(base_lr, param_grid_lr, cv=3, scoring='f1_micro', verbose=1, n_jobs=-1)
#     grid_search.fit(X_train, y_train)

#     best_model = grid_search.best_estimator_
#     print(f"Best params for Base LR: {grid_search.best_params_}")

#     y_pred = best_model.predict(X_test)
#     y_pred = y_pred.toarray() if hasattr(y_pred, "toarray") else y_pred

#     f1 = f1_score(y_test, y_pred, average="micro")
#     all_f1_scores.append(f1)

#     print(f"F1 Micro: {f1:.4f}")
#     print(classification_report(y_test, y_pred, target_names=label_names))

# print(f"\n✅ Average F1 Micro over all folds: {np.mean(all_f1_scores):.4f}")

# # Grid Search for base LR
# for fold_idx, (train_idx, test_idx) in enumerate(stratifier.split(X, Y)):
#     print(f"\n📂 Fold {fold_idx + 1}/{n_splits}")

#     X_train, X_test = X[train_idx], X[test_idx]
#     y_train, y_test = Y[train_idx], Y[test_idx]

#     # Grid search
#     grid_search = GridSearchCV(balanced_lr, param_grid_lr, cv=3, scoring='f1_micro', verbose=1, n_jobs=-1)
#     grid_search.fit(X_train, y_train)

#     best_model = grid_search.best_estimator_
#     print(f"Best params for Balanced LR: {grid_search.best_params_}")

#     y_pred = best_model.predict(X_test)
#     y_pred = y_pred.toarray() if hasattr(y_pred, "toarray") else y_pred

#     f1 = f1_score(y_test, y_pred, average="micro")
#     all_f1_scores.append(f1)

#     print(f"F1 Micro: {f1:.4f}")
#     print(classification_report(y_test, y_pred, target_names=label_names))

# print(f"\n✅ Average F1 Micro over all folds: {np.mean(all_f1_scores):.4f}")

grid_search_base_lr = GridSearchCV(
    OneVsRestClassifier(LogisticRegression(max_iter=1000)),
    param_grid_lr,
    cv=stratifier,
    scoring='f1_micro',
    n_jobs=-1,
    verbose=1
)

grid_search_base_lr.fit(X, Y)
print("✅ Best parameters (global) for base LR:", grid_search_base_lr.best_params_)

grid_search_balanced_lr = GridSearchCV(
    OneVsRestClassifier(LogisticRegression(class_weight='balanced', max_iter=1000)),
    param_grid_lr,
    cv=stratifier,
    scoring='f1_micro',
    n_jobs=-1,
    verbose=1
)

grid_search_balanced_lr.fit(X, Y)
print("✅ Best parameters (global) for balanced LR:", grid_search_balanced_lr.best_params_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
✅ Best parameters (global) for base LR: {'estimator__C': 1.0, 'estimator__penalty': 'l1', 'estimator__solver': 'liblinear'}
Fitting 5 folds for each of 6 candidates, totalling 30 fits
✅ Best parameters (global) for balanced LR: {'estimator__C': 0.1, 'estimator__penalty': 'l1', 'estimator__solver': 'liblinear'}


#### Grid Search for Random Forest Hyperparameters

In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier

# Random Forest Hyperparameter Grid
param_grid_rf = {
    'estimator__n_estimators': [50, 75, 100, 125, 150, 175, 200],
    'estimator__max_depth': [None, 10, 20, 30, 40, 50],
    'estimator__min_samples_split': [2, 5, 10]
}

# Model wrapper
base_rf = OneVsRestClassifier(RandomForestClassifier())

# # For storing metrics
# all_f1_scores = []

# # Grid Search for base RF
# for fold_idx, (train_idx, test_idx) in enumerate(stratifier.split(X, Y)):
#     print(f"\n📂 Fold {fold_idx + 1}/{n_splits}")

#     X_train, X_test = X[train_idx], X[test_idx]
#     y_train, y_test = Y[train_idx], Y[test_idx]

#     # Grid search
#     grid_search = GridSearchCV(base_rf, param_grid_rf, cv=3, scoring='f1_micro', verbose=1, n_jobs=-1)
#     grid_search.fit(X_train, y_train)

#     best_model = grid_search.best_estimator_
#     print(f"Best params for Random Forest: {grid_search.best_params_}")

#     y_pred = best_model.predict(X_test)
#     y_pred = y_pred.toarray() if hasattr(y_pred, "toarray") else y_pred

#     f1 = f1_score(y_test, y_pred, average="micro")
#     all_f1_scores.append(f1)

#     print(f"F1 Micro: {f1:.4f}")
#     print(classification_report(y_test, y_pred, target_names=label_names))

# print(f"\n✅ Average F1 Micro over all folds: {np.mean(all_f1_scores):.4f}")

grid_search_rf = GridSearchCV(
    OneVsRestClassifier(RandomForestClassifier()),
    param_grid_rf,
    cv=stratifier,
    scoring='f1_micro',
    n_jobs=-1,
    verbose=1
)

grid_search_rf.fit(X, Y)
print("✅ Best parameters (global) for Random Forest:", grid_search_rf.best_params_)


Fitting 5 folds for each of 126 candidates, totalling 630 fits
✅ Best parameters (global) for Random Forest: {'estimator__max_depth': None, 'estimator__min_samples_split': 2, 'estimator__n_estimators': 200}
