In [1]:
import sys
from pathlib import Path

# Expose the parent of the current working directory on the import path so the
# project-local `src.*` imports used later in this notebook can be resolved.
# NOTE(review): assumes the kernel is started from a directory directly under
# the project root — confirm for other launch locations.
project_root = str(Path.cwd().parent)
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)



In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import confusion_matrix, classification_report
from src.training_pipeline.train import train_and_evaluate, load_processed_data, load_model_config, get_model, evaluate_model


In [3]:
# Load the processed data
(
    train_features,
    train_labels,
    val_features,
    val_labels,
) = load_processed_data()

# Preview the first rows of the training features and labels.
for preview in (train_features, train_labels):
    print(preview.head())

# Report the dimensions of every split, one line per object.
for caption, split in (
    ("Training data shape:", train_features),
    ("Training data labels:", train_labels),
    ("Validation data shape:", val_features),
    ("Validation data labels:", val_labels),
):
    print(caption, split.shape)


   tfidf_aaa  tfidf_aarp  tfidf_abbott  tfidf_abele  tfidf_ability  \
0        0.0         0.0           0.0          0.0            0.0   
1        0.0         0.0           0.0          0.0            0.0   
2        0.0         0.0           0.0          0.0            0.0   
3        0.0         0.0           0.0          0.0            0.0   
4        0.0         0.0           0.0          0.0            0.0   

   tfidf_able  tfidf_abolish  tfidf_abolishing  tfidf_abortion  \
0         0.0            0.0               0.0             0.0   
1         0.0            0.0               0.0             0.0   
2         0.0            0.0               0.0             0.0   
3         0.0            0.0               0.0             0.0   
4         0.0            0.0               0.0             0.0   

   tfidf_abortions  ...  unions  urban  veterans  voting-record  water  \
0              0.0  ...       0      0         0              0      0   
1              0.0  ...       0   

In [5]:
# Train and evaluate the model

# Model settings are read from a YAML file located under the project root.
config_path = Path(project_root, "src/training_pipeline/configs/model_configs.yaml")

configs = load_model_config(config_path)
model = get_model("random_forest", configs["random_forest"])
model.train(train_features, train_labels)
metrics = evaluate_model(model, val_features, val_labels)

# Display metrics
print("Model Metrics:")

# Each entry of `metrics` is either a single number (printed on one line) or a
# dict of named scores (printed as an indented group under its label).
for label, label_scores in metrics.items():
    if not isinstance(label_scores, dict):
        print(f"{label}: {label_scores:.4f}")
        continue

    print(f"{label}:")
    for metric, value in label_scores.items():
        print(f"  {metric}: {value:.4f}")

Model Metrics:
0:
  precision: 0.0000
  recall: 0.0000
  f1-score: 0.0000
  support: 331.0000
1:
  precision: 0.2208
  recall: 0.1328
  f1-score: 0.1659
  support: 399.0000
2:
  precision: 0.2052
  recall: 0.7991
  f1-score: 0.3266
  support: 423.0000
3:
  precision: 0.2857
  recall: 0.1122
  f1-score: 0.1612
  support: 392.0000
4:
  precision: 1.0000
  recall: 0.0060
  f1-score: 0.0118
  support: 168.0000
5:
  precision: 0.2000
  recall: 0.0030
  f1-score: 0.0059
  support: 335.0000
accuracy: 0.2134
macro avg:
  precision: 0.3186
  recall: 0.1755
  f1-score: 0.1119
  support: 2048.0000
weighted avg:
  precision: 0.2548
  recall: 0.2134
  f1-score: 0.1326
  support: 2048.0000
