## Baseline model (Bag of Words approach)

A naive baseline that uses a non-contextual (bag-of-words) representation of the text.

In [2]:
import sys
import os

# Resolve the directory the notebook is executed from.
current_dir = os.getcwd()
print(f"Current working directory: {current_dir}")

# Make the parent directory importable (the notebook later imports from the
# `modules` package). The membership check keeps the cell idempotent:
# re-running it no longer appends a duplicate entry to sys.path.
parent_dir = os.path.abspath(os.path.join(current_dir, ".."))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
print(f"Added to path: {parent_dir}")

Current working directory: c:\Users\felix\Documents\GIT\Hertie\PACT-ML\modules
Added to path: c:\Users\felix\Documents\GIT\Hertie\PACT-ML


## Data preparation

In [5]:
import pandas as pd
from pyreadr import read_r
from modules.helpers.validity_check import fuzzy_match_report_key

# Load the paragraph texts and the coded report data.
# Forward slashes replace the original "..\data\..." literals: "\d", "\P" and
# "\p" are invalid string escapes (SyntaxWarning on recent Python versions)
# and backslash-separated paths only resolve on Windows.
para_data = pd.read_csv("../data/PACT_paragraphs_training.csv")

report_data = pd.read_csv("../data/paragraphs.csv", sep=';')

# Derive the join key from the report name by replacing '/' with '_'
# (presumably to match the key format used in para_data - TODO confirm).
report_data["matchingKey"] = report_data["report_namePKO"].str.replace('/', '_')

# reduced data to 7 target categories with most codings
target_categories = [
    "PoliceReform",
    "Operations_PatrolsInterventions",
    "StateAdministration",
    "RefugeeAssistance",
    "ElectionAssistance",
    "LegalReform",
    "CivilSocietyAssistance"
]

# Take an explicit copy of the column subset so the column assignment below
# operates on an independent frame (avoids SettingWithCopyWarning).
report_data = report_data[["matchingKey", "paragraphNumber"] + target_categories].copy()

# A category counts as coded when its cell holds a string; every other value
# (e.g. NaN) becomes False.
report_data[target_categories] = report_data[target_categories].map(lambda x: isinstance(x, str))

# check if a paragraph led to two codings
multi_coded = report_data.copy()
multi_coded["num_labels"] = report_data[target_categories].sum(axis=1)

# Keep only paragraphs that carry more than one of the target labels.
multi_coded = multi_coded[multi_coded["num_labels"] > 1]

print(f"{len(multi_coded)} paragraphs have multiple codings.")


472 paragraphs have multiple codings.


In [10]:

# Columns that together identify one paragraph in both tables.
key_cols = ["matchingKey", "paragraphNumber"]

# Coded paragraphs (PACT2.0) that also have text in the training data:
# left-join the coded keys onto the text keys and count rows found in both.
merged = report_data[key_cols].merge(
    para_data[key_cols],
    on=key_cols,
    how="left",
    indicator=True,
)
matching_pairs = (merged["_merge"] == "both").sum()
print(f"Rows matching on both matchingKey and paragraphNumber: {matching_pairs}")

# Same check restricted to the multi-label paragraphs.
merged_multi = multi_coded[key_cols].merge(
    para_data[key_cols],
    on=key_cols,
    how="left",
    indicator=True,
)
matching_pairs_multi = (merged_multi["_merge"] == "both").sum()
print(f"Rows matching on both matchingKey and paragraphNumber for multi-coded paragraphs: {matching_pairs_multi}")

# Coded paragraphs whose report key occurs in the text data but whose
# (report, paragraph number) pair was not matched above.
report_has_text = report_data["matchingKey"].isin(para_data["matchingKey"])
failed_parsing = report_has_text.sum() - matching_pairs
print(f"Number of coded paragraphs for which text parsing failed: {failed_parsing}")

Rows matching on both matchingKey and paragraphNumber: 1819
Rows matching on both matchingKey and paragraphNumber for multi-coded paragraphs: 256
Number of coded paragraphs for which text parsing failed: 29


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Keep only paragraphs that have both a text and a coding (inner join).
merged_data = para_data.merge(
    report_data,
    on=["matchingKey", "paragraphNumber"],
    how="inner",
)

# Check the shape of the merged data
print(f"Shape of merged data: {merged_data.shape}")

# Bag-of-words counts over the paragraph text, with English stop words removed.
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(merged_data["paragraph"])

# Dense view of the sparse count matrix, one column per vocabulary term.
feature_names = vectorizer.get_feature_names_out()
df_bow = pd.DataFrame(X.toarray(), columns=feature_names)
print(f"Number of features: {len(df_bow.columns)}")

# No train/test split is made here: splitting is handled by the
# cross-validation set up in the "Iterative Stratification K-fold CV" section.

Shape of merged data: (1819, 12)
Number of features: 8241


## Iterative Stratification K-fold CV

To make the most of our limited multi-label data, we use Iterative Stratification K-fold CV across all our models.

In [13]:

from skmultilearn.model_selection import IterativeStratification

# Label matrix: one row per paragraph, one column per target category.
Y = merged_data[target_categories].to_numpy()

# Configure the multi-label stratified splitter (5 folds, order 1).
n_splits = 5
stratifier = IterativeStratification(order=1, n_splits=n_splits)

## Logistic Model + Random Forest using BoW
Now we can finally train and evaluate our first two models.

In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report, multilabel_confusion_matrix

# Candidate models. The Random Forest gets a fixed seed so that its scores are
# reproducible between runs.
models = {
    "Logistic Regression": OneVsRestClassifier(LogisticRegression(max_iter=1000)),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
}

label_names = merged_data[target_categories].columns.tolist()

# Materialise the folds once. The stratifier is created without a seed, so
# calling split() separately per model may yield different partitions; reusing
# one list guarantees that every model is scored on identical folds.
folds = list(stratifier.split(X, Y))

for model_name, model in models.items():
    print(f"\n--- {model_name} ---")
    f1_scores = []
    # Out-of-fold ground truth / predictions, collected so the detailed report
    # below covers every fold instead of only the last one.
    fold_truths = []
    fold_preds = []

    for train_idx, test_idx in folds:
        X_train, X_test = X[train_idx], X[test_idx]
        Y_train, Y_test = Y[train_idx], Y[test_idx]

        # fit() retrains the estimator from scratch on each fold.
        model.fit(X_train, Y_train)
        Y_pred = model.predict(X_test)

        # Some multi-label estimators return sparse predictions; densify them.
        Y_pred = Y_pred.toarray() if hasattr(Y_pred, 'toarray') else Y_pred

        f1 = f1_score(Y_test, Y_pred, average='micro')  # or 'macro'
        f1_scores.append(f1)

        fold_truths.append(Y_test)
        fold_preds.append(Y_pred)

    # Pool the held-out predictions of all folds.
    Y_true_all = np.vstack(fold_truths)
    Y_pred_all = np.vstack(fold_preds)

    print(f"Average F1 (micro): {np.mean(f1_scores):.4f}")
    print(classification_report(Y_true_all, Y_pred_all, target_names=label_names))
    # Compute confusion matrices per label over all out-of-fold predictions
    cm = multilabel_confusion_matrix(Y_true_all, Y_pred_all)

    # Show false positives and false negatives per label
    for i, label in enumerate(label_names):
        tn, fp, fn, tp = cm[i].ravel()

        # Guard against division by zero when a label has no negatives/positives.
        fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
        fnr = fn / (fn + tp) if (fn + tp) > 0 else 0

        print(f"{label} — FPR: {fpr:.2f}, FNR: {fnr:.2f}, TP: {tp}, FP: {fp}, FN: {fn}, TN: {tn}")


--- Logistic Regression ---
Average F1 (micro): 0.6543

--- Random Forest ---
Average F1 (micro): 0.4704
