In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os

In [3]:
# Load the preprocessed Reddit comments, then keep only rows that still have
# cleaned text to vectorize.
raw_comments = pd.read_csv('./reddit_preprocessing.csv')
df = raw_comments.dropna(subset=['clean_comment'])
df.shape

(36662, 2)

In [5]:

# Step 1: Function to run the experiment
def run_experiment(vectorizer_type, ngram_range, vectorizer_max_features, vectorizer_name):
    """Train and evaluate a Random Forest on one text-vectorization setup.

    Reads the module-level ``df`` (columns ``clean_comment`` and ``category``),
    makes a stratified 80/20 train/test split, fits the chosen vectorizer on
    the training text only, trains a ``RandomForestClassifier`` and prints the
    accuracy and per-class metrics. A confusion-matrix heatmap is written to a
    PNG whose name includes the vectorizer name and n-gram range, so that
    successive experiments do not overwrite each other's figure.

    Parameters
    ----------
    vectorizer_type : str
        ``"BoW"`` selects ``CountVectorizer``; any other value selects
        ``TfidfVectorizer``.
    ngram_range : tuple of (int, int)
        Passed through as the vectorizer's ``ngram_range``.
    vectorizer_max_features : int
        Passed through as the vectorizer's ``max_features``.
    vectorizer_name : str
        Label used in the plot title and in the saved file name.

    Returns
    -------
    dict
        ``"accuracy"`` (float), ``"classification_report"`` (dict as returned
        by ``classification_report(..., output_dict=True)``) and
        ``"confusion_matrix_path"`` (str, the PNG that was written).
    """
    print(vectorizer_type)

    # Step 2: Choose the vectorizer. Anything other than "BoW" falls back to
    # TF-IDF; this fallback is kept so existing callers behave as before.
    if vectorizer_type == "BoW":
        vectorizer = CountVectorizer(ngram_range=ngram_range, max_features=vectorizer_max_features)
    else:
        vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=vectorizer_max_features)

    # Step 3: Split first, then fit the vectorizer on the training text only
    # so no vocabulary information leaks from the test set.
    X_train, X_test, y_train, y_test = train_test_split(
        df['clean_comment'],
        df['category'],
        test_size=0.2,
        random_state=42,
        stratify=df['category'],
    )
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)

    # Step 4: Define and train a Random Forest model
    n_estimators = 200
    max_depth = 15
    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    model.fit(X_train, y_train)

    # Step 5: Make predictions and report metrics
    y_pred = model.predict(X_test)

    # The accuracy used to be computed and then dropped; report it.
    accuracy = accuracy_score(y_test, y_pred)
    print("accuracy", accuracy)

    classification_rep = classification_report(y_test, y_pred, output_dict=True)
    for label, metrics in classification_rep.items():
        # With output_dict=True the top-level "accuracy" entry is a bare
        # float, so only the dict-valued (per-class / averaged) entries are
        # expanded here.
        if isinstance(metrics, dict):
            for metric, value in metrics.items():
                print(f"{label}_{metric}", value)

    # Confusion matrix: pin the label order to the classes the model learned
    # and show those labels on the axes instead of positional indices.
    class_labels = model.classes_
    conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=class_labels,
        yticklabels=class_labels,
        ax=ax,
    )
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_title(f"Confusion Matrix: {vectorizer_name}, {ngram_range}")

    # One file per experiment; a single fixed name would be overwritten by
    # every subsequent run.
    ngram_tag = "-".join(str(n) for n in ngram_range)
    figure_path = f"confusion_matrix_{vectorizer_name}_{ngram_tag}.png"
    fig.savefig(figure_path)
    plt.close(fig)

    # NOTE: the fitted model itself is not persisted anywhere.
    return {
        "accuracy": accuracy,
        "classification_report": classification_rep,
        "confusion_matrix_path": figure_path,
    }

# Step 6: Run experiments for BoW and TF-IDF with different n-grams
# Each range starts at 1, so (1, 2) means unigrams + bigrams and (1, 3)
# means unigrams + bigrams + trigrams.
ngram_ranges = [(1, 1), (1, 2), (1, 3)]
max_features = 5000  # vocabulary cap applied to every vectorizer

for ngram_range in ngram_ranges:
    # For each n-gram range, evaluate BoW first and then TF-IDF.
    for vectorizer_kind in ("BoW", "TF-IDF"):
        run_experiment(vectorizer_kind, ngram_range, max_features, vectorizer_name=vectorizer_kind)


BoW
-1_precision 0.9666666666666667
-1_recall 0.017575757575757574
-1_f1-score 0.034523809523809526
-1_support 1650.0
0_precision 0.6554934823091247
0_recall 0.8351126927639383
0_f1-score 0.7344809598330725
0_support 2529.0
1_precision 0.6329331046312179
1_recall 0.8189600507292327
1_f1-score 0.7140290255701451
1_support 3154.0
macro avg_precision 0.7516977512023364
macro avg_recall 0.5572161670229762
macro avg_f1-score 0.49434459830900906
macro avg_support 7333.0
weighted avg_precision 0.7158071769762223
weighted avg_recall 0.6442111005045684
weighted avg_f1-score 0.5681868511905582
weighted avg_support 7333.0
TF-IDF
-1_precision 0.9705882352941176
-1_recall 0.02
-1_f1-score 0.039192399049881234
-1_support 1650.0
0_precision 0.659238625812442
0_recall 0.8422301304863582
0_f1-score 0.7395833333333334
0_support 2529.0
1_precision 0.6356932153392331
1_recall 0.8199112238427394
1_f1-score 0.7161451121572971
1_support 3154.0
macro avg_precision 0.7551733588152643
macro avg_recall 0.5607137