# 1. Setup and Imports

In [None]:
import os
import warnings
from time import time

import joblib # For saving/loading sklearn models
import lightgbm as lgb # LightGBM
import numpy as np
import pandas as pd

# Scikit-learn imports
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC # Generally preferred for text over SVC

# Ignore warnings for cleaner output.
# NOTE(review): called without a category, this filter silences every warning for the
# rest of the session, which would include any convergence or deprecation warnings
# raised while the models below are fitted - consider narrowing it if those matter.
warnings.filterwarnings('ignore')

# 2. Configuration

In [None]:
# --- Reproducibility ---
# One seed for NumPy and for every estimator below, so changing it here changes it everywhere.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# --- Paths ---
DATA_DIR = "../data/processed"      # expects <DATA_DIR>/<domain>/train.csv and test.csv
MODEL_SAVE_DIR = "../models/ml"     # fitted vectorizers and trained models, one sub-folder per domain
RESULTS_SAVE_DIR = "../results"
RESULTS_CSV_FILE = os.path.join(RESULTS_SAVE_DIR, "ml_results_summary.csv") # Specific filename

# --- Experiment Setup ---
DOMAINS = ["book_reviews", "financial_news"] # Add your domain folder names

# --- ML Models ---
# These instances act as unfitted templates for the experiment.
# Use LinearSVC for SVM with text data - often faster and performs well.
# Increased max_iter for Logistic Regression and LinearSVC for convergence.
ML_MODELS = {
    "NaiveBayes": MultinomialNB(),
    "LogisticRegression": LogisticRegression(random_state=RANDOM_SEED, max_iter=1000),
    # dual="auto" lets scikit-learn pick the primal or dual formulation itself
    "SVM": LinearSVC(random_state=RANDOM_SEED, max_iter=2000, dual="auto"),
    # n_jobs=-1 uses all cores
    "RandomForest": RandomForestClassifier(random_state=RANDOM_SEED, n_estimators=100, n_jobs=-1),
    "LightGBM": lgb.LGBMClassifier(random_state=RANDOM_SEED, n_jobs=-1),
}

# --- Feature Extractor Settings ---
# Each config pairs a vectorizer with the short name used in result rows and file names.
# Primary: TF-IDF with Uni+Bigrams
TFIDF_CONFIG = {
    "vectorizer": TfidfVectorizer(
        max_features=20000, # Adjust based on memory/performance
        ngram_range=(1, 2), # Unigrams and Bigrams
        sublinear_tf=True # Apply sublinear tf scaling (often helps)
    ),
    "name": "TFIDF_UniBiGram"
}

# Secondary: BoW (Counts) with Unigrams (Primarily for Naive Bayes comparison)
BOW_CONFIG = {
    "vectorizer": CountVectorizer(
        max_features=20000, # Match TFIDF for comparison if desired, or adjust
        ngram_range=(1, 1) # Unigrams only for standard BoW/NB
    ),
    "name": "BoW_UniGram"
}

# Create the output directories (and one sub-directory per domain) if they don't exist
for base_dir in (MODEL_SAVE_DIR, RESULTS_SAVE_DIR):
    os.makedirs(base_dir, exist_ok=True)
    for domain in DOMAINS:
        os.makedirs(os.path.join(base_dir, domain), exist_ok=True)


# 3. Helper Functions (Load Data, Extract Features, Train/Evaluate)

In [None]:
def load_data(domain_name, data_dir=None):
    """Load the train and test splits of one domain.

    Reads ``train.csv`` and ``test.csv`` from ``<data_dir>/<domain_name>/``,
    checks that both contain a 'text' and a 'label' column, and replaces
    missing text values with empty strings.

    Args:
        domain_name: Name of the domain sub-folder to read from.
        data_dir: Root folder holding the domain sub-folders. Defaults to the
            module-level ``DATA_DIR`` when omitted.

    Returns:
        A tuple ``(train_texts, train_labels, test_texts, test_labels)`` of
        pandas Series, or ``(None, None, None, None)`` if either CSV file
        does not exist.

    Raises:
        ValueError: If a required column is missing from either split.
    """
    base_dir = DATA_DIR if data_dir is None else data_dir
    print(f"\nLoading data for domain: {domain_name}...")

    # We primarily need train and test for this ML pipeline
    train_path = os.path.join(base_dir, domain_name, "train.csv")
    test_path = os.path.join(base_dir, domain_name, "test.csv")

    # Only the file reads can raise FileNotFoundError, so keep the try body to just those.
    try:
        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)
    except FileNotFoundError as e:
        print(f"Error loading data for {domain_name}: {e}")
        return None, None, None, None

    # Basic validation
    if 'text' not in train_df.columns or 'label' not in train_df.columns:
        raise ValueError("Missing 'text' or 'label' column in train data")
    if 'text' not in test_df.columns or 'label' not in test_df.columns:
        raise ValueError("Missing 'text' or 'label' column in test data")

    # Handle potential NaN values in text (important!).
    # Assign the result back instead of calling fillna(inplace=True) on the
    # selected column: the in-place form acts on a temporary object and does
    # not update the frame under pandas Copy-on-Write.
    train_df['text'] = train_df['text'].fillna('')
    test_df['text'] = test_df['text'].fillna('')

    print(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}")
    return train_df['text'], train_df['label'], test_df['text'], test_df['label']

def extract_features(vectorizer_instance, train_texts, test_texts):
    """Learn the vectorizer on the training texts and vectorize both splits.

    The vectorizer is fitted on ``train_texts`` only; ``test_texts`` is merely
    transformed with the fitted vectorizer. Timing and the resulting matrix
    shapes are printed.

    Returns:
        ``(X_train, X_test, vectorizer_instance)`` where the last element is
        the same object that was passed in, now fitted.
    """
    vectorizer_label = vectorizer_instance.__class__.__name__
    print(f"Extracting features using {vectorizer_label}...")

    started = time()
    train_matrix = vectorizer_instance.fit_transform(train_texts)
    test_matrix = vectorizer_instance.transform(test_texts)
    elapsed = time() - started

    print(f"Feature extraction done in {elapsed:.2f}s")
    print(f"Train features shape: {train_matrix.shape}, Test features shape: {test_matrix.shape}")
    # Hand the fitted vectorizer back so the caller can persist it
    return train_matrix, test_matrix, vectorizer_instance

def train_evaluate_model(model_name, model_instance, X_train, y_train, X_test, y_test):
    """Fit ``model_instance`` on the training split and score it on the test split.

    Progress, timings, accuracy and the textual classification report are
    printed along the way.

    Returns:
        ``(results, model_instance)``: a dict holding the model name, accuracy,
        macro- and weighted-averaged precision/recall/F1, and the training and
        evaluation wall-clock times in seconds, plus the fitted model.
    """
    def _timed(fn, *args):
        # Call fn(*args) and report how long the call took, in seconds
        started = time()
        outcome = fn(*args)
        return outcome, time() - started

    print(f"Training {model_name}...")
    _, train_time = _timed(model_instance.fit, X_train, y_train)
    print(f"Training done in {train_time:.2f}s")

    print(f"Evaluating {model_name}...")
    y_pred, eval_time = _timed(model_instance.predict, X_test)
    print(f"Evaluation done in {eval_time:.2f}s")

    accuracy = accuracy_score(y_test, y_pred)
    # The dict form of the report supplies the averaged metrics stored below
    report_dict = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
    macro = report_dict['macro avg']
    weighted = report_dict['weighted avg']

    results = {
        "Model": model_name,
        "Accuracy": accuracy,
        "Precision (Macro)": macro['precision'],
        "Recall (Macro)": macro['recall'],
        "F1 (Macro)": macro['f1-score'],
        "Precision (Weighted)": weighted['precision'],
        "Recall (Weighted)": weighted['recall'],
        "F1 (Weighted)": weighted['f1-score'],
        "Train Time (s)": train_time,
        "Eval Time (s)": eval_time
    }

    print(f"Accuracy: {accuracy:.4f}")
    # Human-readable per-class report as well
    print(classification_report(y_test, y_pred, zero_division=0))

    return results, model_instance

# 4. Main Experiment Loop

In [None]:
# Collects one metrics dict per (domain, feature extractor, model) run.
# It is initialised apart from the experiment loop, so re-running only the loop
# appends to whatever is already stored here.
all_results_list = [] # Store results dictionaries here

In [None]:
# For every domain: vectorize the texts with TF-IDF and train every model in
# ML_MODELS, then vectorize with BoW counts and train Naive Bayes only. Each
# fitted vectorizer and trained model is saved under MODEL_SAVE_DIR/<domain>/,
# and each run's metrics dict is appended to all_results_list.
for domain in DOMAINS:
    X_train_text, y_train, X_test_text, y_test = load_data(domain)
    if X_train_text is None:
        print(f"Skipping domain {domain} due to data loading error.")
        continue

    # --- Experiment with TF-IDF (Uni+Bigram) ---
    print(f"\n--- Domain: {domain} | Feature Extractor: {TFIDF_CONFIG['name']} ---")
    tfidf_vectorizer_instance = TFIDF_CONFIG["vectorizer"]
    X_train_tfidf, X_test_tfidf, fitted_tfidf_vectorizer = extract_features(
        tfidf_vectorizer_instance, X_train_text, X_test_text
    )

    # Save the fitted TF-IDF vectorizer
    vec_save_path = os.path.join(MODEL_SAVE_DIR, domain, f"vectorizer_{TFIDF_CONFIG['name']}.joblib")
    joblib.dump(fitted_tfidf_vectorizer, vec_save_path)
    print(f"Saved TF-IDF vectorizer to {vec_save_path}")

    # Run every model on TF-IDF. Naive Bayes typically prefers counts, but it
    # is included here too so it can be compared against its BoW run below.
    for model_name, model_instance in ML_MODELS.items():
        print(f"\n--- Running Model: {model_name} (on TF-IDF) ---")

        # Get a fresh, unfitted copy for each run so the templates in ML_MODELS
        # stay untouched. clone() copies the hyper-parameters in memory;
        # joblib.load(joblib.dump(...)) cannot be used for this because dump()
        # returns a list of file names, not a path.
        current_model_instance = clone(model_instance)

        model_results, trained_model = train_evaluate_model(
            model_name, current_model_instance, X_train_tfidf, y_train, X_test_tfidf, y_test
        )

        # Store results
        model_results["Domain"] = domain
        model_results["Feature Extractor"] = TFIDF_CONFIG['name']
        all_results_list.append(model_results)

        # Save the trained model
        model_save_path = os.path.join(MODEL_SAVE_DIR, domain, f"{model_name.lower()}_{TFIDF_CONFIG['name']}.joblib")
        joblib.dump(trained_model, model_save_path)
        print(f"Saved model to {model_save_path}")

    # --- Experiment with BoW (UniGram) - Primarily for Naive Bayes ---
    print(f"\n--- Domain: {domain} | Feature Extractor: {BOW_CONFIG['name']} ---")
    bow_vectorizer_instance = BOW_CONFIG["vectorizer"]
    X_train_bow, X_test_bow, fitted_bow_vectorizer = extract_features(
        bow_vectorizer_instance, X_train_text, X_test_text
    )

    # Save the fitted BoW vectorizer
    vec_save_path = os.path.join(MODEL_SAVE_DIR, domain, f"vectorizer_{BOW_CONFIG['name']}.joblib")
    joblib.dump(fitted_bow_vectorizer, vec_save_path)
    print(f"Saved BoW vectorizer to {vec_save_path}")

    # Run Naive Bayes on BoW
    model_name = "NaiveBayes"
    if model_name in ML_MODELS:
        print(f"\n--- Running Model: {model_name} (on BoW) ---")
        model_instance = ML_MODELS[model_name]
        current_model_instance = clone(model_instance) # Fresh, unfitted copy

        model_results, trained_model = train_evaluate_model(
            model_name, current_model_instance, X_train_bow, y_train, X_test_bow, y_test
        )

        # Store results
        model_results["Domain"] = domain
        model_results["Feature Extractor"] = BOW_CONFIG['name']
        all_results_list.append(model_results)

        # Save the trained model
        model_save_path = os.path.join(MODEL_SAVE_DIR, domain, f"{model_name.lower()}_{BOW_CONFIG['name']}.joblib")
        joblib.dump(trained_model, model_save_path)
        print(f"Saved model to {model_save_path}")
    else:
        print(f"{model_name} not found in ML_MODELS dictionary.")

# 5. Aggregate and Save Results

In [None]:
# Turn the collected per-run metrics into one summary table, print it in full,
# and write it to RESULTS_CSV_FILE. Nothing is written when no run produced results.
print("\n--- Experiment Finished ---")
if not all_results_list:
    print("No results were generated.")
else:
    # Column layout of the summary table
    cols_order = ["Domain", "Feature Extractor", "Model", "Accuracy",
                  "F1 (Macro)", "Precision (Macro)", "Recall (Macro)",
                  "F1 (Weighted)", "Precision (Weighted)", "Recall (Weighted)",
                  "Train Time (s)", "Eval Time (s)"]
    # reindex applies the layout and fills any column that is absent with NaN
    results_df = pd.DataFrame(all_results_list).reindex(columns=cols_order)

    print("\nAggregated Results:")
    print(results_df.to_string()) # Print full dataframe

    # Save to CSV
    results_df.to_csv(RESULTS_CSV_FILE, index=False)
    print(f"\nResults saved to {RESULTS_CSV_FILE}")