In [9]:
# -----------------------------
# Imports
# -----------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import resample
import ast

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

# -----------------------------
# 1. Load dataset
# -----------------------------
df = pd.read_csv("pseudo_labels.csv")
df['all_labels'] = df['all_labels'].apply(ast.literal_eval)

# Filter out classes with <2 samples to avoid stratify errors
value_counts = df['most_common_label'].value_counts()
valid_labels = value_counts[value_counts > 1].index
df = df[df['most_common_label'].isin(valid_labels)].reset_index(drop=True)

# Align all_labels with target space (remove noise)
valid_label_list = df['most_common_label'].unique().tolist()
df['all_labels'] = df['all_labels'].apply(lambda lst: [l for l in lst if l in valid_label_list])

# -----------------------------
# 2. Features and target
# -----------------------------
X = df[['review', 'agreement_ratio', 'all_labels']]
y = df['most_common_label']

# -----------------------------
# 3. Balance dataset (oversample)
# -----------------------------
df_balanced = pd.concat([
    resample(group, 
             replace=True, 
             n_samples=value_counts.max(), 
             random_state=42)
    for _, group in df.groupby('most_common_label')
])
X = df_balanced[['review', 'agreement_ratio', 'all_labels']]
y = df_balanced['most_common_label']

# -----------------------------
# 4. Custom Transformer for MultiLabelBinarizer
# -----------------------------
class MultiLabelBinarizerTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.mlb = MultiLabelBinarizer()
    def fit(self, X, y=None):
        self.mlb.fit(X)
        return self
    def transform(self, X):
        return self.mlb.transform(X)

# -----------------------------
# 5. Preprocessing / Feature Engineering
# -----------------------------
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
    ngram_range=(1,2)   # <-- Try unigrams + bigrams
)

# Helper to extract numeric column
def get_column(col_name):
    return FunctionTransformer(lambda x: np.array(x[col_name]).reshape(-1, 1), validate=False)

# Build feature union
combined_features = FeatureUnion([
    ('tfidf', Pipeline([
        ('selector', FunctionTransformer(lambda x: x['review'], validate=False)),
        ('tfidf', tfidf_vectorizer)
    ])),
    ('agreement_ratio', Pipeline([
        ('selector', get_column('agreement_ratio'))
    ])),
    ('all_labels', Pipeline([
        ('selector', FunctionTransformer(lambda x: x['all_labels'], validate=False)),
        ('mlb', MultiLabelBinarizerTransformer())
    ]))
])

# -----------------------------
# 6. Train/test split
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# -----------------------------
# 7A. Logistic Regression Pipeline
# -----------------------------
logreg_pipeline = Pipeline([
    ('features', combined_features),
    ('logreg', LogisticRegression(
        max_iter=3000,
        class_weight='balanced',
        solver='liblinear'
    ))
])

logreg_pipeline.fit(X_train, y_train)
y_pred_log = logreg_pipeline.predict(X_test)

print("=== Logistic Regression ===")
print("Accuracy:", accuracy_score(y_test, y_pred_log))
print("Macro F1:", f1_score(y_test, y_pred_log, average='macro'))
print("\nClassification Report:\n", classification_report(y_test, y_pred_log))

# -----------------------------
# 7B. Random Forest Pipeline
# -----------------------------
rf_pipeline = Pipeline([
    ('features', combined_features),
    ('rf', RandomForestClassifier(
        n_estimators=300,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    ))
])

rf_pipeline.fit(X_train, y_train)
y_pred_rf = rf_pipeline.predict(X_test)

print("\n=== Random Forest ===")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Macro F1:", f1_score(y_test, y_pred_rf, average='macro'))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))



=== Logistic Regression ===
Accuracy: 0.9892241379310345
Macro F1: 0.9892445138346778

Classification Report:
               precision    recall  f1-score   support

        rant       1.00      1.00      1.00       154
 trustworthy       0.97      1.00      0.98       155
   uncertain       1.00      0.97      0.98       155

    accuracy                           0.99       464
   macro avg       0.99      0.99      0.99       464
weighted avg       0.99      0.99      0.99       464


=== Random Forest ===
Accuracy: 1.0
Macro F1: 1.0

Classification Report:
               precision    recall  f1-score   support

        rant       1.00      1.00      1.00       154
 trustworthy       1.00      1.00      1.00       155
   uncertain       1.00      1.00      1.00       155

    accuracy                           1.00       464
   macro avg       1.00      1.00      1.00       464
weighted avg       1.00      1.00      1.00       464



