# Baseline model (Bag of Words approach)

A naive baseline that represents each paragraph with non-contextual bag-of-words (TF-IDF) features.

In [None]:
import sys
import os

# Make the repository root importable from this notebook.
# The notebook is expected to run from a sub-directory of the project, so the
# parent of the current working directory is put on the module search path.
current_dir = os.getcwd()
print(f"Current working directory: {current_dir}")

# Add the parent directory to the path only if it is not already there, so
# re-running this cell does not keep growing sys.path with duplicates.
parent_dir = os.path.abspath(os.path.join(current_dir, ".."))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
    print(f"Added to path: {parent_dir}")

Current working directory: c:\Users\felix\Documents\GIT\Hertie\PACT-ML\modules
Added to path: c:\Users\felix\Documents\GIT\Hertie\PACT-ML


## Data preparation

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE(review): `para_data` and `report_data` are not defined in any visible
# cell; they must be loaded before this cell runs - confirm where they come from.

# Left-join so that every row of `para_data` is kept, even without a match
# in `report_data`.
merged_data = pd.merge(
    para_data,
    report_data,
    on=["matchingKey", "paragraphNumber"],
    how="left"
)

# Persist the merged table so it can be reused outside this notebook.
merged_data.to_csv("../data/merged_data.csv", index=False)

# Check the shape of the merged data
print(f"Shape of merged data: {merged_data.shape}")

# TF-IDF features: English stop words are removed, and terms that occur in
# fewer than 5 paragraphs or in more than 95% of paragraphs are dropped.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=5, max_df=0.95)
X_tfidf = tfidf_vectorizer.fit_transform(merged_data['paragraph'])

# The modelling and grid-search cells below read the feature matrix as `X`.
X = X_tfidf

# Dense copy with one named column per vocabulary term (for inspection only;
# the models are trained on the sparse matrix `X`).
df_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
print(f"Number of features for TF-IDF: {df_tfidf.shape[1]}")

Shape of merged data: (6029, 12)
Number of features for Count Vectorizer: 6144
Number of features for TF-IDF: 6144


## Iterative Stratification K-fold CV

To make the most of our limited multi-label data, we use Iterative Stratification K-fold CV across all our models.

In [22]:

from skmultilearn.model_selection import IterativeStratification

# Binary label-indicator matrix with one column per target category.
# Missing labels are treated as "label not assigned" before casting to 0/1.
Y = (
    merged_data[target_categories]
    .fillna(False)
    .astype(int)
    .to_numpy()
)

# Cross-validation splitter: 5 folds, first-order iterative stratification.
n_splits = 5
stratifier = IterativeStratification(order=1, n_splits=n_splits)



## Logistic Model + Random Forest using BoW
Now we can finally fit our first models: a plain and a class-balanced logistic regression, and a random forest.

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report, multilabel_confusion_matrix

# Fixed seed so that repeated runs of this cell give the same models.
RANDOM_STATE = 42

# Hyperparameters reported by the random-forest grid search; the
# 'estimator__' prefix (needed for the wrapped estimator there) is stripped
# so the values can be passed straight to RandomForestClassifier.
best_params_rf = {'estimator__max_depth': None, 'estimator__min_samples_split': 2, 'estimator__n_estimators': 75}

rf_params = {k.replace('estimator__', ''): v for k, v in best_params_rf.items()}

# NOTE(review): the random-forest parameters were tuned on
# OneVsRestClassifier(RandomForestClassifier()), but are applied here to a
# plain RandomForestClassifier fitted on the full label matrix - confirm
# that this difference is intended.
models = {
    "Logistic Regression": OneVsRestClassifier(
        LogisticRegression(max_iter=1000, C=1.0, penalty='l1', solver='liblinear',
                           random_state=RANDOM_STATE)
    ),
    "Balanced Logistic Regression": OneVsRestClassifier(
        LogisticRegression(class_weight='balanced', max_iter=1000, C=0.1, penalty='l1',
                           solver='liblinear', random_state=RANDOM_STATE)
    ),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE, **rf_params)
}

label_names = merged_data[target_categories].columns.tolist()


def _rate(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is 0."""
    return numerator / denominator if denominator > 0 else 0


# Materialise the folds once so that every model is trained and evaluated on
# exactly the same train/test splits.
cv_folds = list(stratifier.split(X, Y))

# One summary DataFrame (mean over folds, per label) per model.
results = []

for model_name, model in models.items():
    print(f"\n--- {model_name} ---")

    # One row per (fold, label) with the confusion-matrix based rates.
    fold_metrics = []

    # One entry per fold with the overall micro/macro F1.
    f1_scores = []

    for fold_number, (train_idx, test_idx) in enumerate(cv_folds, start=1):
        X_train, X_test = X[train_idx], X[test_idx]
        Y_train, Y_test = Y[train_idx], Y[test_idx]

        model.fit(X_train, Y_train)
        Y_pred = model.predict(X_test)

        # Some multi-label estimators return a sparse matrix; densify it.
        Y_pred = Y_pred.toarray() if hasattr(Y_pred, 'toarray') else Y_pred

        f1_micro = f1_score(Y_test, Y_pred, average='micro')
        f1_macro = f1_score(Y_test, Y_pred, average='macro')
        f1_scores.append({'f1_micro': f1_micro, 'f1_macro': f1_macro})

        # One 2x2 confusion matrix per label, laid out as [[tn, fp], [fn, tp]].
        cm = multilabel_confusion_matrix(Y_test, Y_pred)

        # Record false positives and false negatives per label
        for i, label in enumerate(label_names):
            tn, fp, fn, tp = cm[i].ravel()

            fold_metrics.append({
                'Model': model_name,
                'Fold': fold_number,
                'Label': label,
                'F1_micro': f1_micro,
                'F1_macro': f1_macro,
                'FPR': _rate(fp, fp + tn),
                'FNR': _rate(fn, fn + tp),
                'TPR': _rate(tp, tp + fn),
                'TNR': _rate(tn, tn + fp),
                'TP': tp,
                'FP': fp,
                'FN': fn,
                'TN': tn
            })

        print(f"Fold {fold_number}/{len(cv_folds)}")
        print(classification_report(Y_test, Y_pred, target_names=label_names))

    # Aggregate only after all folds are done, so each model contributes
    # exactly one row per label to the final table.
    fold_metrics_df = pd.DataFrame(fold_metrics)

    # Aggregate over folds: mean per model and label
    summary_df = fold_metrics_df.groupby(['Model', 'Label']).agg({
        'F1_micro': 'mean',
        'F1_macro': 'mean',
        'FPR': 'mean',
        'FNR': 'mean',
        'TPR': 'mean',
        'TNR': 'mean',
    }).reset_index()

    # Append summary to global results list
    results.append(summary_df)

    f1_scores_df = pd.DataFrame(f1_scores)
    print(f"Average F1 (micro): {np.mean(f1_scores_df.f1_micro):.4f}")
    print(f"Average F1 (macro): {np.mean(f1_scores_df.f1_macro):.4f}")

# Combine all models into one DataFrame
final_results_df = pd.concat(results, ignore_index=True)

# Save to CSV (written once, after every model has been evaluated)
final_results_df.to_csv('../out/model_performance_summary_tf_idf.csv', index=False)


#### Grid Search for best parameters for logistic regression

In [None]:
from sklearn.model_selection import GridSearchCV

# Logistic Regression Hyperparameter Grid
param_grid_lr = {
    'estimator__C': [0.1, 1.0, 10.0],
    'estimator__penalty': ['l1', 'l2'],
    'estimator__solver': ['liblinear']  # supports both the l1 and l2 penalties in the grid
}

# One-vs-rest wrappers: one binary logistic regression per label.
base_lr = OneVsRestClassifier(LogisticRegression(max_iter=1000))
balanced_lr = OneVsRestClassifier(LogisticRegression(class_weight='balanced', max_iter=1000))

# Use the notebook's iterative stratification for the search as well.
# With multi-label targets an integer `cv` falls back to plain, unshuffled
# K-fold, which does not balance the label distribution across folds.
# The folds are materialised once so both searches see identical splits.
cv_folds_lr = list(stratifier.split(X, Y))

# Settings shared by both searches; candidates are ranked by micro-averaged F1.
grid_search_kwargs_lr = dict(
    param_grid=param_grid_lr,
    cv=cv_folds_lr,
    scoring='f1_micro',
    n_jobs=-1,
    verbose=1
)

# An earlier per-fold variant of this search existed only as commented-out
# code and was removed; the global search below is the one actually run.
grid_search_base_lr = GridSearchCV(base_lr, **grid_search_kwargs_lr)

grid_search_base_lr.fit(X, Y)
print("✅ Best parameters (global) for base LR:", grid_search_base_lr.best_params_)

grid_search_balanced_lr = GridSearchCV(balanced_lr, **grid_search_kwargs_lr)

grid_search_balanced_lr.fit(X, Y)
print("✅ Best parameters (global) for balanced LR:", grid_search_balanced_lr.best_params_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
✅ Best parameters (global) for base LR: {'estimator__C': 1.0, 'estimator__penalty': 'l1', 'estimator__solver': 'liblinear'}
Fitting 5 folds for each of 6 candidates, totalling 30 fits
✅ Best parameters (global) for balanced LR: {'estimator__C': 0.1, 'estimator__penalty': 'l1', 'estimator__solver': 'liblinear'}


#### Grid Search for Random Forest Hyperparameters

In [None]:
from sklearn.model_selection import GridSearchCV
from skmultilearn.model_selection import IterativeStratification

# Random Forest Hyperparameter Grid
param_grid_rf = {
    'estimator__n_estimators': [50, 75, 100, 125, 150, 175, 200],
    'estimator__max_depth': [None, 10, 20, 30, 40, 50],
    'estimator__min_samples_split': [2, 5, 10]
}

# One-vs-rest wrapper: one random forest per label. The forest is seeded so
# that score differences between candidates come from the hyperparameters,
# not from run-to-run randomness.
base_rf = OneVsRestClassifier(RandomForestClassifier(random_state=42))

# 10-fold iterative stratification keeps the label distribution comparable
# across folds. With multi-label targets an integer `cv` would fall back to
# plain, unshuffled K-fold instead.
rf_stratifier = IterativeStratification(n_splits=10, order=1)
cv_folds_rf = list(rf_stratifier.split(X, Y))

# An earlier per-fold variant of this search existed only as commented-out
# code and was removed; the global search below is the one actually run.
grid_search_rf = GridSearchCV(
    base_rf,
    param_grid_rf,
    cv=cv_folds_rf,
    scoring='f1_micro',
    n_jobs=-1,
    verbose=1
)

grid_search_rf.fit(X, Y)
print("✅ Best parameters (global) for Random Forest:", grid_search_rf.best_params_)


Fitting 10 folds for each of 126 candidates, totalling 1260 fits
✅ Best parameters (global) for Random Forest: {'estimator__max_depth': None, 'estimator__min_samples_split': 2, 'estimator__n_estimators': 75}
