## Baseline model (Bag of Words approach)

Naive model using a non-contextual model

In [1]:
import sys
import os

# Get the current directory
current_dir = os.getcwd()
print(f"Current working directory: {current_dir}")

# Add the parent directory to the path if needed
parent_dir = os.path.abspath(os.path.join(current_dir, ".."))
sys.path.append(parent_dir)
print(f"Added to path: {parent_dir}")

Current working directory: c:\Users\felix\Documents\GIT\Hertie\PACT-ML\modules
Added to path: c:\Users\felix\Documents\GIT\Hertie\PACT-ML


## Data preparation

In [2]:
import pandas as pd
from pyreadr import read_r
from modules.helpers.validity_check import fuzzy_match_report_key

para_data = pd.read_csv("..\data\PACT_paragraphs_training.csv")

report_data = pd.read_csv("..\data\paragraphs.csv", sep=';')

report_data["matchingKey"] = report_data["report_namePKO"].str.replace('/', '_')

# reduced data to 7 target categories with most codings
target_categories = [
    "PoliceReform",
    "Operations_PatrolsInterventions",
    "StateAdministration",
    "RefugeeAssistance",
    "ElectionAssistance",
    "LegalReform",
    "CivilSocietyAssistance"
]

report_data = report_data[["matchingKey", "paragraphNumber"] + target_categories]

report_data[target_categories] = report_data[target_categories].map(lambda x: isinstance(x, str))

# check if a paragraph led to two codings
multi_coded = report_data.copy()
multi_coded["num_labels"] = report_data[target_categories].sum(axis=1)

multi_coded = multi_coded[multi_coded["num_labels"] > 1]

print(f"{len(multi_coded)} paragraphs have multiple codings.")


472 paragraphs have multiple codings.


In [None]:

# number of coded paragraphs in PACT2.0 that have text in the training data
merged = pd.merge(
    report_data[["matchingKey", "paragraphNumber"]],
    para_data[["matchingKey", "paragraphNumber"]],
    on=["matchingKey", "paragraphNumber"],
    how="left",
    indicator=True
)

matching_pairs = (merged["_merge"] == "both").sum()
print(f"Rows matching on both matchingKey and paragraphNumber: {matching_pairs}")

# number of multi-label paragraphs in PACT2.0 that have text in the training data
merged_multi = pd.merge(
    multi_coded[["matchingKey", "paragraphNumber"]],
    para_data[["matchingKey", "paragraphNumber"]],
    on=["matchingKey", "paragraphNumber"],
    how="left",
    indicator=True
)

matching_pairs_multi = (merged_multi["_merge"] == "both").sum()
print(f"Rows matching on both matchingKey and paragraphNumber for multi-coded paragraphs: {matching_pairs_multi}")

# number of paragraphs for which parsing failed
failed_parsing = report_data["matchingKey"].isin(para_data["matchingKey"]).sum() - matching_pairs
print(f"Number of coded paragraphs for which text parsing failed: {failed_parsing}")

NameError: name 'merged_data' is not defined

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

merged_data = pd.merge(
    para_data,
    report_data,
    on=["matchingKey", "paragraphNumber"],
    how="inner"
)

merged_data.to_csv("../data/merged_data.csv", index=False)

# Check the shape of the merged data
print(f"Shape of merged data: {merged_data.shape}")

vectorizer = CountVectorizer(stop_words='english', min_df=5, max_df=0.95)
X = vectorizer.fit_transform(merged_data["paragraph"])

tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=5, max_df=0.95)
X_tfidf = tfidf_vectorizer.fit_transform(merged_data['paragraph'])

df_bow = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
df_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
print(f"Number of features for Count Vectorizer: {df_bow.shape[1]}")
print(f"Number of features for TF-IDF: {df_tfidf.shape[1]}")

# MAYBE USE STATIFIED K-FOLD CV
# from sklearn.model_selection import StratifiedKFold
# kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

Shape of merged data: (1819, 12)
Number of features for Count Vectorizer: 2989
Number of features for TF-IDF: 2989


## Iterative Stratification K-fold CV

To make the most of our limited multi-label data, we use Iterative Stratification K-fold CV across all our models.

In [7]:

from skmultilearn.model_selection import IterativeStratification

Y = merged_data[target_categories].values

# Set up Iterative Stratification
n_splits = 5
stratifier = IterativeStratification(n_splits=n_splits, order=1)

In [12]:
y = merged_data[target_categories]
class_counts = y.sum()
total_samples = len(y)
class_distribution = class_counts / total_samples

# Create a markdown table
print("## Class Distribution\n")
print("| Category | Count | Proportion | Percentage |")
print("|----------|-------|------------|------------|")
for category, proportion in class_distribution.items():
    count = class_counts[category]
    print(f"| {category} | {count} | {proportion:.4f} | {proportion*100:.2f}% |")

# Add summary row
print(f"| **Total** | {total_samples} | 1.0000 | 100.00% |")

# Calculate and add imbalance ratio
most_frequent = class_distribution.max()
least_frequent = class_distribution.min()
imbalance_ratio = most_frequent / least_frequent
print(f"\n**Imbalance ratio (most frequent / least frequent)**: {imbalance_ratio:.2f}")

## Class Distribution

| Category | Count | Proportion | Percentage |
|----------|-------|------------|------------|
| PoliceReform | 525 | 0.2886 | 28.86% |
| Operations_PatrolsInterventions | 151 | 0.0830 | 8.30% |
| StateAdministration | 219 | 0.1204 | 12.04% |
| RefugeeAssistance | 103 | 0.0566 | 5.66% |
| ElectionAssistance | 112 | 0.0616 | 6.16% |
| LegalReform | 129 | 0.0709 | 7.09% |
| CivilSocietyAssistance | 161 | 0.0885 | 8.85% |
| **Total** | 1819 | 1.0000 | 100.00% |

**Imbalance ratio (most frequent / least frequent)**: 5.10


In [17]:
full_PACT = pd.read_csv("..\data\paragraphs.csv", sep=';')

exclude_columns = ["report_namePKO", "paragraphNumber", "paragraph_ID"]
potential_categories = [col for col in full_PACT.columns if col not in exclude_columns]

# Check which of these columns contain boolean-like data
all_categories = []
for col in potential_categories:
    # Check if column contains only boolean values, or strings, or NaNs
    unique_values = full_PACT[col].dropna().unique()
    if all(isinstance(x, bool) for x in unique_values) or \
       all(isinstance(x, str) for x in unique_values) or \
       len(unique_values) <= 2:  # Assuming binary categories
        all_categories.append(col)

print(f"Detected {len(all_categories)} category columns")

# Convert string values to boolean if needed
for category in all_categories:
    full_PACT[category] = full_PACT[category].map(lambda x: isinstance(x, str) if pd.notna(x) else False)

# Calculate counts and distribution
class_counts = full_PACT[all_categories].sum()
total_samples = len(full_PACT)
class_distribution = class_counts / total_samples

# Sort categories by frequency for better readability
sorted_categories = class_counts.sort_values(ascending=False).index.tolist()

# Create a markdown table
print("## Class Distribution in paragraphs.csv (All Categories)\n")
print("| Category | Count | Proportion | Percentage |")
print("|----------|-------|------------|------------|")
for category in sorted_categories:
    count = class_counts[category]
    proportion = class_distribution[category]
    print(f"| {category} | {count} | {proportion:.4f} | {proportion*100:.2f}% |")

# Add summary row
print(f"| **Total Documents** | {total_samples} | - | - |")

# Calculate and add imbalance ratio
most_frequent = class_distribution.max()
least_frequent = class_distribution.min()
imbalance_ratio = most_frequent / least_frequent
print(f"\n**Imbalance ratio (most frequent / least frequent)**: {imbalance_ratio:.2f}")

Detected 37 category columns
## Class Distribution in paragraphs.csv (All Categories)

| Category | Count | Proportion | Percentage |
|----------|-------|------------|------------|
| PoliceReform | 1232 | 0.2532 | 25.32% |
| Operations_PatrolsInterventions | 587 | 0.1207 | 12.07% |
| StateAdministration | 581 | 0.1194 | 11.94% |
| HumanRights | 450 | 0.0925 | 9.25% |
| JusticeSectorReform | 435 | 0.0894 | 8.94% |
| Demilitarization | 350 | 0.0719 | 7.19% |
| RefugeeAssistance | 312 | 0.0641 | 6.41% |
| ElectionAssistance | 265 | 0.0545 | 5.45% |
| BorderControl | 251 | 0.0516 | 5.16% |
| MilitaryReform | 247 | 0.0508 | 5.08% |
| LegalReform | 235 | 0.0483 | 4.83% |
| CivilSocietyAssistance | 232 | 0.0477 | 4.77% |
| PrisonReform | 228 | 0.0469 | 4.69% |
| HumanitarianRelief | 222 | 0.0456 | 4.56% |
| Gender | 209 | 0.0430 | 4.30% |
| PartyAssistance | 178 | 0.0366 | 3.66% |
| DemocraticInstitutions | 144 | 0.0296 | 2.96% |
| SexualViolence | 142 | 0.0292 | 2.92% |
| ControlSALW | 133 |



## Logistic Model + Random Forest using BoW
Now, we can finally calculate our first two models.

In [67]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report, multilabel_confusion_matrix

best_params_rf = {'estimator__max_depth': None, 'estimator__min_samples_split': 2, 'estimator__n_estimators': 75}

rf_params = {k.replace('estimator__', ''): v for k, v in best_params_rf.items()}

models = {
    "Logistic Regression": OneVsRestClassifier(LogisticRegression(max_iter=1000)),
    "Balanced Logistic Regression": OneVsRestClassifier(LogisticRegression(class_weight='balanced', max_iter=1000)),
    "Logistic Regression (hyperparameter optimized)": OneVsRestClassifier(LogisticRegression(max_iter=1000, C= 1.0, penalty='l1', solver='liblinear')),
    "Balanced Logistic Regression (hyperparameter optimized)": OneVsRestClassifier(LogisticRegression(class_weight='balanced', max_iter=1000, C= 0.1, penalty='l1', solver='liblinear')),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Random Forest (hyperparameter optimized)": RandomForestClassifier(**rf_params)
}

label_names = merged_data[target_categories].columns.tolist()

results = []

for model_name, model in models.items():
    print(f"\n--- {model_name} ---")

    # Store metrics per fold, per label
    fold_metrics = []

    for train_idx, test_idx in stratifier.split(X, Y):
        X_train, X_test = X[train_idx], X[test_idx]
        Y_train, Y_test = Y[train_idx], Y[test_idx]

        model.fit(X_train, Y_train)
        Y_pred = model.predict(X_test)

        Y_pred = Y_pred.toarray() if hasattr(Y_pred, 'toarray') else Y_pred

        f1_micro = f1_score(Y_test, Y_pred, average='micro')  # or 'macro'
        f1_macro = f1_score(Y_test, Y_pred, average='macro')
        f1_scores.append({'f1_micro': f1_micro, 'f1_macro': f1_macro})
        f1_scores_df = pd.DataFrame(f1_scores)

        # Compute confusion matrices per label
        cm = multilabel_confusion_matrix(Y_test, Y_pred)

        rows = []

        # Show false posotives and false negatives per label
        for i, label in enumerate(label_names):
            tn, fp, fn, tp = cm[i].ravel()
            fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
            fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
            tpr = tp / (tp + fn) if (tp + fn) > 0 else 0
            tnr = tn / (tn + fp) if (tn + fp) > 0 else 0

            fold_metrics.append({
                'Model': model_name,
                'Fold': len(fold_metrics) // len(label_names) + 1,
                'Label': label,
                'F1_micro': f1_micro,
                'F1_macro': f1_macro,
                'FPR': fpr,
                'FNR': fnr,
                'TPR': tpr,
                'TNR': tnr,
                'TP': tp,
                'FP': fp,
                'FN': fn,
                'TN': tn
            })

        # Convert per-fold metrics to DataFrame
        fold_metrics_df = pd.DataFrame(fold_metrics)

        # Aggregate over folds: mean per model and label
        summary_df = fold_metrics_df.groupby(['Model', 'Label']).agg({
            'F1_micro': 'mean',
            'F1_macro': 'mean',
            'FPR': 'mean',
            'FNR': 'mean',
            'TPR': 'mean',
            'TNR': 'mean',
        }).reset_index()

        # Append summary to global results list
        results.append(summary_df)

        print(f"Average F1 (micro): {np.mean(f1_scores_df.f1_micro):.4f}")
        print(f"Average F1 (macro): {np.mean(f1_scores_df.f1_macro):.4f}")
        print(classification_report(Y_test, Y_pred, target_names=label_names))


    # Combine all models into one DataFrame
    final_results_df = pd.concat(results, ignore_index=True)

    # Save to CSV
    final_results_df.to_csv('../out/model_performance_summary.csv', index=False)



--- Logistic Regression ---
Average F1 (micro): 0.6265
Average F1 (macro): 0.5678


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                                 precision    recall  f1-score   support

                   PoliceReform       0.82      0.70      0.76       105
Operations_PatrolsInterventions       0.87      0.43      0.58        30
            StateAdministration       0.68      0.59      0.63        44
              RefugeeAssistance       0.92      0.52      0.67        21
             ElectionAssistance       0.83      0.43      0.57        23
                    LegalReform       0.58      0.27      0.37        26
         CivilSocietyAssistance       0.62      0.64      0.63        33

                      micro avg       0.76      0.57      0.65       282
                      macro avg       0.76      0.51      0.60       282
                   weighted avg       0.77      0.57      0.65       282
                    samples avg       0.38      0.34      0.35       282

Average F1 (micro): 0.6269
Average F1 (macro): 0.5685
                                 precision    recall  f1-score   su

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Average F1 (micro): 0.6279
Average F1 (macro): 0.5699
                                 precision    recall  f1-score   support

                   PoliceReform       0.84      0.71      0.77       105
Operations_PatrolsInterventions       0.89      0.53      0.67        30
            StateAdministration       0.78      0.57      0.66        44
              RefugeeAssistance       1.00      0.43      0.60        21
             ElectionAssistance       0.92      0.55      0.69        22
                    LegalReform       0.86      0.24      0.38        25
         CivilSocietyAssistance       0.78      0.56      0.65        32

                      micro avg       0.84      0.58      0.69       279
                      macro avg       0.87      0.51      0.63       279
                   weighted avg       0.85      0.58      0.67       279
                    samples avg       0.38      0.36      0.37       279


--- Balanced Logistic Regression ---
Average F1 (micro): 0.6283
Av

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Average F1 (micro): 0.6289
Average F1 (macro): 0.5715
                                 precision    recall  f1-score   support

                   PoliceReform       0.73      0.75      0.74       105
Operations_PatrolsInterventions       0.71      0.80      0.75        30
            StateAdministration       0.56      0.64      0.60        44
              RefugeeAssistance       0.79      0.52      0.63        21
             ElectionAssistance       0.87      0.57      0.68        23
                    LegalReform       0.59      0.62      0.60        26
         CivilSocietyAssistance       0.79      0.67      0.72        33

                      micro avg       0.70      0.68      0.69       282
                      macro avg       0.72      0.65      0.68       282
                   weighted avg       0.71      0.68      0.69       282
                    samples avg       0.43      0.42      0.42       282

Average F1 (micro): 0.6293
Average F1 (macro): 0.5723
             

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Average F1 (micro): 0.6299
Average F1 (macro): 0.5733
                                 precision    recall  f1-score   support

                   PoliceReform       0.78      0.70      0.74       105
Operations_PatrolsInterventions       0.67      0.73      0.70        30
            StateAdministration       0.63      0.66      0.64        44
              RefugeeAssistance       0.81      0.62      0.70        21
             ElectionAssistance       0.81      0.95      0.88        22
                    LegalReform       0.62      0.38      0.48        26
         CivilSocietyAssistance       0.62      0.81      0.70        32

                      micro avg       0.71      0.69      0.70       280
                      macro avg       0.71      0.69      0.69       280
                   weighted avg       0.72      0.69      0.70       280
                    samples avg       0.42      0.42      0.41       280

Average F1 (micro): 0.6304
Average F1 (macro): 0.5741
             

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Average F1 (micro): 0.6309
Average F1 (macro): 0.5749
                                 precision    recall  f1-score   support

                   PoliceReform       0.76      0.70      0.73       105
Operations_PatrolsInterventions       0.79      0.63      0.70        30
            StateAdministration       0.64      0.63      0.64        43
              RefugeeAssistance       0.92      0.57      0.71        21
             ElectionAssistance       0.94      0.73      0.82        22
                    LegalReform       0.69      0.42      0.52        26
         CivilSocietyAssistance       0.65      0.67      0.66        33

                      micro avg       0.74      0.64      0.69       280
                      macro avg       0.77      0.62      0.68       280
                   weighted avg       0.75      0.64      0.69       280
                    samples avg       0.40      0.37      0.38       280

Average F1 (micro): 0.6312
Average F1 (macro): 0.5755
             

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                                 precision    recall  f1-score   support

                   PoliceReform       0.73      0.70      0.71       105
Operations_PatrolsInterventions       0.75      0.58      0.65        31
            StateAdministration       0.68      0.59      0.63        44
              RefugeeAssistance       0.83      0.90      0.86        21
             ElectionAssistance       0.76      0.57      0.65        23
                    LegalReform       0.71      0.40      0.51        25
         CivilSocietyAssistance       0.73      0.50      0.59        32

                      micro avg       0.74      0.62      0.67       281
                      macro avg       0.74      0.61      0.66       281
                   weighted avg       0.73      0.62      0.67       281
                    samples avg       0.41      0.40      0.40       281

Average F1 (micro): 0.6319
Average F1 (macro): 0.5770
                                 precision    recall  f1-score   su

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Average F1 (micro): 0.6327
Average F1 (macro): 0.5786
                                 precision    recall  f1-score   support

                   PoliceReform       0.74      0.85      0.79       105
Operations_PatrolsInterventions       0.61      0.81      0.69        31
            StateAdministration       0.52      0.82      0.64        44
              RefugeeAssistance       0.75      0.86      0.80        21
             ElectionAssistance       0.55      0.78      0.64        23
                    LegalReform       0.53      0.92      0.68        26
         CivilSocietyAssistance       0.51      0.88      0.64        32

                      micro avg       0.61      0.84      0.71       282
                      macro avg       0.60      0.84      0.70       282
                   weighted avg       0.63      0.84      0.72       282
                    samples avg       0.47      0.51      0.47       282

Average F1 (micro): 0.6329
Average F1 (macro): 0.5792
             

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                                 precision    recall  f1-score   support

                   PoliceReform       0.72      0.72      0.72       105
Operations_PatrolsInterventions       0.58      0.87      0.69        30
            StateAdministration       0.46      0.70      0.56        44
              RefugeeAssistance       0.76      0.80      0.78        20
             ElectionAssistance       0.56      0.86      0.68        22
                    LegalReform       0.57      0.81      0.67        26
         CivilSocietyAssistance       0.52      0.88      0.65        32

                      micro avg       0.60      0.78      0.68       279
                      macro avg       0.60      0.81      0.68       279
                   weighted avg       0.62      0.78      0.68       279
                    samples avg       0.44      0.47      0.44       279

Average F1 (micro): 0.6336
Average F1 (macro): 0.5808
                                 precision    recall  f1-score   su

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Average F1 (micro): 0.6314
Average F1 (macro): 0.5774
                                 precision    recall  f1-score   support

                   PoliceReform       0.82      0.63      0.71       105
Operations_PatrolsInterventions       0.91      0.32      0.48        31
            StateAdministration       1.00      0.20      0.34        44
              RefugeeAssistance       0.80      0.20      0.32        20
             ElectionAssistance       0.78      0.32      0.45        22
                    LegalReform       1.00      0.04      0.07        26
         CivilSocietyAssistance       0.75      0.09      0.17        32

                      micro avg       0.84      0.36      0.50       280
                      macro avg       0.87      0.26      0.36       280
                   weighted avg       0.86      0.36      0.46       280
                    samples avg       0.25      0.23      0.24       280



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Average F1 (micro): 0.6307
Average F1 (macro): 0.5763
                                 precision    recall  f1-score   support

                   PoliceReform       0.91      0.59      0.72       105
Operations_PatrolsInterventions       0.91      0.33      0.49        30
            StateAdministration       0.85      0.25      0.39        44
              RefugeeAssistance       0.75      0.30      0.43        20
             ElectionAssistance       1.00      0.52      0.69        23
                    LegalReform       1.00      0.08      0.14        26
         CivilSocietyAssistance       0.43      0.09      0.15        33

                      micro avg       0.88      0.38      0.53       281
                      macro avg       0.84      0.31      0.43       281
                   weighted avg       0.85      0.38      0.50       281
                    samples avg       0.28      0.26      0.26       281



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Average F1 (micro): 0.6296
Average F1 (macro): 0.5746
                                 precision    recall  f1-score   support

                   PoliceReform       0.82      0.58      0.68       105
Operations_PatrolsInterventions       0.85      0.37      0.51        30
            StateAdministration       1.00      0.30      0.46        43
              RefugeeAssistance       1.00      0.19      0.32        21
             ElectionAssistance       1.00      0.17      0.30        23
                    LegalReform       0.00      0.00      0.00        25
         CivilSocietyAssistance       1.00      0.09      0.17        32

                      micro avg       0.86      0.34      0.49       279
                      macro avg       0.81      0.24      0.35       279
                   weighted avg       0.83      0.34      0.45       279
                    samples avg       0.25      0.23      0.24       279



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Average F1 (micro): 0.6289
Average F1 (macro): 0.5731
                                 precision    recall  f1-score   support

                   PoliceReform       0.89      0.69      0.77       105
Operations_PatrolsInterventions       0.92      0.37      0.52        30
            StateAdministration       0.80      0.18      0.30        44
              RefugeeAssistance       0.75      0.14      0.24        21
             ElectionAssistance       1.00      0.23      0.37        22
                    LegalReform       1.00      0.15      0.27        26
         CivilSocietyAssistance       1.00      0.09      0.17        32

                      micro avg       0.89      0.38      0.53       280
                      macro avg       0.91      0.26      0.38       280
                   weighted avg       0.90      0.38      0.48       280
                    samples avg       0.27      0.24      0.25       280


--- Random Forest (hyperparameter optimized) ---


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Average F1 (micro): 0.6281
Average F1 (macro): 0.5719
                                 precision    recall  f1-score   support

                   PoliceReform       0.82      0.61      0.70       105
Operations_PatrolsInterventions       1.00      0.39      0.56        31
            StateAdministration       1.00      0.16      0.27        44
              RefugeeAssistance       0.78      0.33      0.47        21
             ElectionAssistance       0.80      0.35      0.48        23
                    LegalReform       1.00      0.12      0.21        26
         CivilSocietyAssistance       1.00      0.09      0.17        32

                      micro avg       0.85      0.37      0.51       282
                      macro avg       0.91      0.29      0.41       282
                   weighted avg       0.90      0.37      0.48       282
                    samples avg       0.26      0.23      0.24       282



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Average F1 (micro): 0.6275
Average F1 (macro): 0.5709
                                 precision    recall  f1-score   support

                   PoliceReform       0.87      0.69      0.77       105
Operations_PatrolsInterventions       0.93      0.43      0.59        30
            StateAdministration       0.82      0.20      0.33        44
              RefugeeAssistance       0.88      0.35      0.50        20
             ElectionAssistance       0.80      0.36      0.50        22
                    LegalReform       0.50      0.04      0.07        26
         CivilSocietyAssistance       0.71      0.15      0.25        33

                      micro avg       0.85      0.41      0.55       280
                      macro avg       0.79      0.32      0.43       280
                   weighted avg       0.81      0.41      0.51       280
                    samples avg       0.29      0.27      0.27       280



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Average F1 (micro): 0.6266
Average F1 (macro): 0.5694
                                 precision    recall  f1-score   support

                   PoliceReform       0.97      0.57      0.72       105
Operations_PatrolsInterventions       1.00      0.43      0.60        30
            StateAdministration       0.92      0.26      0.40        43
              RefugeeAssistance       0.70      0.33      0.45        21
             ElectionAssistance       1.00      0.18      0.31        22
                    LegalReform       0.00      0.00      0.00        26
         CivilSocietyAssistance       1.00      0.03      0.06        32

                      micro avg       0.94      0.34      0.50       279
                      macro avg       0.80      0.26      0.36       279
                   weighted avg       0.86      0.34      0.46       279
                    samples avg       0.25      0.23      0.24       279



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Average F1 (micro): 0.6260
Average F1 (macro): 0.5682
                                 precision    recall  f1-score   support

                   PoliceReform       0.85      0.66      0.74       105
Operations_PatrolsInterventions       0.84      0.53      0.65        30
            StateAdministration       1.00      0.25      0.40        44
              RefugeeAssistance       0.80      0.38      0.52        21
             ElectionAssistance       1.00      0.13      0.23        23
                    LegalReform       1.00      0.12      0.21        26
         CivilSocietyAssistance       1.00      0.03      0.06        32

                      micro avg       0.87      0.40      0.54       281
                      macro avg       0.93      0.30      0.40       281
                   weighted avg       0.91      0.40      0.49       281
                    samples avg       0.27      0.25      0.26       281



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Average F1 (micro): 0.6252
Average F1 (macro): 0.5666
                                 precision    recall  f1-score   support

                   PoliceReform       0.85      0.64      0.73       105
Operations_PatrolsInterventions       0.90      0.30      0.45        30
            StateAdministration       0.88      0.32      0.47        44
              RefugeeAssistance       1.00      0.20      0.33        20
             ElectionAssistance       1.00      0.14      0.24        22
                    LegalReform       1.00      0.04      0.08        25
         CivilSocietyAssistance       0.75      0.09      0.17        32

                      micro avg       0.86      0.36      0.51       278
                      macro avg       0.91      0.25      0.35       278
                   weighted avg       0.88      0.36      0.47       278
                    samples avg       0.25      0.22      0.23       278



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### Grid Search for best parameters for logistic regression

In [None]:
from sklearn.model_selection import GridSearchCV

# Logistic Regression Hyperparameter Grid
param_grid_lr = {
    'estimator__C': [0.1, 1.0, 10.0],
    'estimator__penalty': ['l1', 'l2'],
    'estimator__solver': ['liblinear']  # Only solver supporting l1
}

# Model wrapper
base_lr = OneVsRestClassifier(LogisticRegression(max_iter=1000))
balanced_lr = OneVsRestClassifier(LogisticRegression(class_weight='balanced', max_iter=1000))

# For storing metrics
# all_f1_scores = []

# # Grid Search for base LR
# for fold_idx, (train_idx, test_idx) in enumerate(stratifier.split(X, Y)):
#     print(f"\n📂 Fold {fold_idx + 1}/{n_splits}")

#     X_train, X_test = X[train_idx], X[test_idx]
#     y_train, y_test = Y[train_idx], Y[test_idx]

#     # Grid search
#     grid_search = GridSearchCV(base_lr, param_grid_lr, cv=3, scoring='f1_micro', verbose=1, n_jobs=-1)
#     grid_search.fit(X_train, y_train)

#     best_model = grid_search.best_estimator_
#     print(f"Best params for Base LR: {grid_search.best_params_}")

#     y_pred = best_model.predict(X_test)
#     y_pred = y_pred.toarray() if hasattr(y_pred, "toarray") else y_pred

#     f1 = f1_score(y_test, y_pred, average="micro")
#     all_f1_scores.append(f1)

#     print(f"F1 Micro: {f1:.4f}")
#     print(classification_report(y_test, y_pred, target_names=label_names))

# print(f"\n✅ Average F1 Micro over all folds: {np.mean(all_f1_scores):.4f}")

# # Grid Search for base LR
# for fold_idx, (train_idx, test_idx) in enumerate(stratifier.split(X, Y)):
#     print(f"\n📂 Fold {fold_idx + 1}/{n_splits}")

#     X_train, X_test = X[train_idx], X[test_idx]
#     y_train, y_test = Y[train_idx], Y[test_idx]

#     # Grid search
#     grid_search = GridSearchCV(balanced_lr, param_grid_lr, cv=3, scoring='f1_micro', verbose=1, n_jobs=-1)
#     grid_search.fit(X_train, y_train)

#     best_model = grid_search.best_estimator_
#     print(f"Best params for Balanced LR: {grid_search.best_params_}")

#     y_pred = best_model.predict(X_test)
#     y_pred = y_pred.toarray() if hasattr(y_pred, "toarray") else y_pred

#     f1 = f1_score(y_test, y_pred, average="micro")
#     all_f1_scores.append(f1)

#     print(f"F1 Micro: {f1:.4f}")
#     print(classification_report(y_test, y_pred, target_names=label_names))

# print(f"\n✅ Average F1 Micro over all folds: {np.mean(all_f1_scores):.4f}")

grid_search_base_lr = GridSearchCV(
    OneVsRestClassifier(LogisticRegression(max_iter=1000)),
    param_grid_lr,
    cv=5,
    scoring='f1_micro',
    n_jobs=-1,
    verbose=1
)

grid_search_base_lr.fit(X, Y)
print("✅ Best parameters (global) for base LR:", grid_search_base_lr.best_params_)

grid_search_balanced_lr = GridSearchCV(
    OneVsRestClassifier(LogisticRegression(class_weight='balanced', max_iter=1000)),
    param_grid_lr,
    cv=5,
    scoring='f1_micro',
    n_jobs=-1,
    verbose=1
)

grid_search_balanced_lr.fit(X, Y)
print("✅ Best parameters (global) for balanced LR:", grid_search_balanced_lr.best_params_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
✅ Best parameters (global) for base LR: {'estimator__C': 1.0, 'estimator__penalty': 'l1', 'estimator__solver': 'liblinear'}
Fitting 5 folds for each of 6 candidates, totalling 30 fits
✅ Best parameters (global) for balanced LR: {'estimator__C': 0.1, 'estimator__penalty': 'l1', 'estimator__solver': 'liblinear'}


#### Grid Search for Random Forest Hyperparameters

In [None]:
from sklearn.model_selection import GridSearchCV

# Random Forest Hyperparameter Grid
param_grid_rf = {
    'estimator__n_estimators': [50, 75, 100, 125, 150, 175, 200],
    'estimator__max_depth': [None, 10, 20, 30, 40, 50],
    'estimator__min_samples_split': [2, 5, 10]
}

# Model wrapper
base_rf = OneVsRestClassifier(RandomForestClassifier())

# # For storing metrics
# all_f1_scores = []

# # Grid Search for base RF
# for fold_idx, (train_idx, test_idx) in enumerate(stratifier.split(X, Y)):
#     print(f"\n📂 Fold {fold_idx + 1}/{n_splits}")

#     X_train, X_test = X[train_idx], X[test_idx]
#     y_train, y_test = Y[train_idx], Y[test_idx]

#     # Grid search
#     grid_search = GridSearchCV(base_rf, param_grid_rf, cv=3, scoring='f1_micro', verbose=1, n_jobs=-1)
#     grid_search.fit(X_train, y_train)

#     best_model = grid_search.best_estimator_
#     print(f"Best params for Random Forest: {grid_search.best_params_}")

#     y_pred = best_model.predict(X_test)
#     y_pred = y_pred.toarray() if hasattr(y_pred, "toarray") else y_pred

#     f1 = f1_score(y_test, y_pred, average="micro")
#     all_f1_scores.append(f1)

#     print(f"F1 Micro: {f1:.4f}")
#     print(classification_report(y_test, y_pred, target_names=label_names))

# print(f"\n✅ Average F1 Micro over all folds: {np.mean(all_f1_scores):.4f}")

grid_search_rf = GridSearchCV(
    OneVsRestClassifier(RandomForestClassifier()),
    param_grid_rf,
    cv=10,
    scoring='f1_micro',
    n_jobs=-1,
    verbose=1
)

grid_search_rf.fit(X, Y)
print("✅ Best parameters (global) for Random Forest:", grid_search_rf.best_params_)


Fitting 10 folds for each of 126 candidates, totalling 1260 fits
✅ Best parameters (global) for Random Forest: {'estimator__max_depth': None, 'estimator__min_samples_split': 2, 'estimator__n_estimators': 75}


#### Check class distribution for eventual additional parameters

Since label distribution is unbalanced, the balanced_lr is included above.

In [None]:
# Check class distribution
class_distribution = y.sum() / len(y)
print("Class distribution:")
for i, category in enumerate(target_categories):
    print(f"{category}: {class_distribution[i]:.4f}")

# Use class_weight='balanced' for imbalanced data

Class distribution:
PoliceReform: 0.2886
Operations_PatrolsInterventions: 0.0830
StateAdministration: 0.1204
RefugeeAssistance: 0.0566
ElectionAssistance: 0.0616
LegalReform: 0.0709
CivilSocietyAssistance: 0.0885


