In [10]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
# import optuna


In [11]:
df = pd.read_csv('./reddit_preprocessing.csv').dropna()
df.shape

(36662, 2)

In [12]:
# Step 1: Remove rows where the target labels (category) are NaN
df = df.dropna(subset=['category'])

# Step 2: TF-IDF vectorizer setup
ngram_range = (1, 3)  # Trigram
max_features = 1000  # Set max_features to 1000
vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)
X = vectorizer.fit_transform(df['clean_comment'])
y = df['category']

# Step 3: Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Step 4: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

# Step 5: Train Multinomial Naïve Bayes model
model = MultinomialNB(alpha=1.0)  # Default smoothing parameter
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Step 6: Log accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Step 7: Log classification report
classification_rep = classification_report(y_test, y_pred, output_dict=True)
for label, metrics in classification_rep.items():
    if isinstance(metrics, dict):
        for metric, value in metrics.items():
            print(f"{label}_{metric}", value)

Accuracy: 0.6644472627351511
-1_precision 0.5958605664488017
-1_recall 0.693722257450856
-1_f1-score 0.6410782302959274
-1_support 3154.0
0_precision 0.6997319034852547
0_recall 0.5792644261255548
0_f1-score 0.6338248048568951
0_support 3154.0
1_precision 0.7146901541365209
1_recall 0.7203551046290425
1_f1-score 0.7175114479709458
1_support 3154.0
macro avg_precision 0.6700942080235258
macro avg_recall 0.6644472627351511
macro avg_f1-score 0.6641381610412561
macro avg_support 9462.0
weighted avg_precision 0.6700942080235258
weighted avg_recall 0.6644472627351511
weighted avg_f1-score 0.664138161041256
weighted avg_support 9462.0
