In [None]:
# Install dependencies (uncomment and run once if they are not already available):
# !pip install pandas
# !pip install tensorflow
# !pip install scikeras

Collecting scikeras
  Downloading scikeras-0.13.0-py3-none-any.whl.metadata (3.1 kB)
Downloading scikeras-0.13.0-py3-none-any.whl (26 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.13.0


In [None]:
import pandas as pd             
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, GlobalAveragePooling1D, Concatenate, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.models import Sequential
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [6]:
# Step 1: Read the posts CSV into a DataFrame
DATA_PATH = "../data/cleaned_reddit_posts.csv"
df = pd.read_csv(DATA_PATH)

In [7]:
# Count the rows in each popularity bucket to see how balanced the classes are
bucket_counts = df["popularity_bucket"].value_counts()
print(bucket_counts)

popularity_bucket
high      3415
low       3316
medium    3316
Name: count, dtype: int64


In [8]:
# Step 2: Remove the columns that are not used as model inputs
unused_columns = ["id", "author", "score", "num_comments", "upvote_ratio"]
df = df.drop(unused_columns, axis=1)

In [9]:
# Step 3: Map popularity_bucket strings to integer ids, then to one-hot rows
label_encoder = LabelEncoder()
label_encoder.fit(df["popularity_bucket"])
df["label"] = label_encoder.transform(df["popularity_bucket"])

# One-hot targets are what the categorical_crossentropy loss used later expects
y = to_categorical(df["label"])

In [10]:
# Step 4: Encode the non-text features as integer columns
cat_features = ["subreddit", "flair", "media_type"]
encoded_features = []

# Categorical columns: missing values become the literal "unknown" category,
# then every distinct string is mapped to an integer id
for col in cat_features:
    df[col] = df[col].fillna("unknown")
    encoded_features.append(LabelEncoder().fit_transform(df[col]))

# Flag columns are cast to 0/1; created_hour is cast to int with missing values set to 0
encoded_features.extend(
    [
        df["is_self"].astype(int),
        df["nsfw"].astype(int),
        df["created_hour"].fillna(0).astype(int),
    ]
)

# Final non-text input: one row per post, one column per feature
X = np.stack(encoded_features, axis=1)

In [11]:
# Step 7: Build the model
def create_model(dropout_rate=0.3, learning_rate=0.001, hidden_units=(256, 128, 64),
                 n_classes=3, input_dim=None):
    """Build and compile a feed-forward softmax classifier.

    With the default arguments the network is identical to the original
    fixed architecture: Dense(256) -> Dense(128) -> Dense(64), each followed
    by Dropout, then a 3-way softmax, compiled with Adam(learning_rate=0.001).

    Args:
        dropout_rate: Dropout fraction applied after every hidden layer.
        learning_rate: Learning rate passed to the Adam optimizer.
        hidden_units: Width of each hidden Dense layer, in order.
        n_classes: Number of output classes (size of the softmax layer).
        input_dim: Number of input features. When None, falls back to the
            column count of the notebook-level feature matrix ``X``.

    Returns:
        A compiled Keras ``Sequential`` model using categorical cross-entropy
        loss and reporting accuracy.
    """
    if input_dim is None:
        # Preserve the original behaviour: size the input from the global X
        input_dim = X.shape[1]

    model = Sequential()
    model.add(Input(shape=(input_dim,)))

    # Each hidden block is Dense(relu) followed by Dropout
    for units in hidden_units:
        model.add(Dense(units, activation='relu'))
        model.add(Dropout(dropout_rate))

    model.add(Dense(n_classes, activation='softmax'))

    optimizer = Adam(learning_rate=learning_rate)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

In [12]:
# Step 8: Compile and train
# Hold out 20% of the rows as a test set; random_state fixes the split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Sanity-check the container type and shapes that will be fed to the model
print("Type of X_train:", type(X_train))
print("X_train shape:", X_train.shape)
print("X_train[0] shape:", np.array(X_train[0]).shape)

# Wrap the Keras builder function so GridSearchCV can treat it as a scikit-learn estimator
model = KerasClassifier(model=create_model, verbose=0)

# Hyperparameter grid: 2 x 2 x 2 = 8 combinations.
# The "model__" prefix is how the wrapper is asked to pass dropout_rate to create_model.
param_grid = {
    'batch_size': [16, 32],
    'epochs': [15, 20],  #10
    "model__dropout_rate": [0.3, 0.5]
}

# Run a 3-fold cross-validated grid search on the training split only
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)
grid_result = grid.fit(X_train, y_train)

# Report the winning combination and its cross-validation score
# (best_score_ comes from the CV folds, not from the held-out test set)
print(f"Best params: {grid_result.best_params_}")
print(f"Best accuracy: {grid_result.best_score_:.4f}")

# Predict on the held-out test data with the best estimator found by the search
y_pred = grid_result.best_estimator_.predict(X_test)

# Collapse each prediction row to a class index (works for one-hot or probability rows)
y_pred_labels = np.argmax(y_pred, axis=1)

# Collapse the one-hot y_test rows to class indices in the same way
y_test_labels = np.argmax(y_test, axis=1)

# Accuracy on the held-out test set
accuracy = accuracy_score(y_test_labels, y_pred_labels)
print(f"Test accuracy: {accuracy:.4f}")

Type of X_train: <class 'numpy.ndarray'>
X_train shape: (8037, 6)
X_train[0] shape: (6,)
Best params: {'batch_size': 16, 'epochs': 20, 'model__dropout_rate': 0.3}
Best accuracy: 0.5738
Test accuracy: 0.5617


In [13]:
# Evaluate
# Pull the fitted Keras model out of the best wrapper found by the grid search
best_model = grid_result.best_estimator_.model_

# Score it on the held-out test split; note this rebinds `accuracy`
test_metrics = best_model.evaluate(X_test, y_test, verbose=0)
loss, accuracy = test_metrics
print(f"Test loss: {loss:.4f}, Test accuracy: {accuracy:.4f}")

Test loss: 0.9331, Test accuracy: 0.5617


In [15]:
from sklearn.metrics import classification_report, confusion_matrix

# Per-class precision / recall / F1, labelled with the original bucket names
report_text = classification_report(
    y_test_labels,
    y_pred_labels,
    target_names=label_encoder.classes_,
)
print("\nClassification Report:")
print(report_text)

# Confusion matrix built from the same true and predicted class indices
confusion = confusion_matrix(y_test_labels, y_pred_labels)
print("\nConfusion Matrix:")
print(confusion)



Classification Report:
              precision    recall  f1-score   support

        high       0.56      0.72      0.63       650
         low       0.85      0.27      0.41       684
      medium       0.50      0.70      0.58       676

    accuracy                           0.56      2010
   macro avg       0.64      0.56      0.54      2010
weighted avg       0.64      0.56      0.54      2010


Confusion Matrix:
[[469  13 168]
 [187 185 312]
 [182  19 475]]


In [None]:
# Evaluate model on test set
# Recompute the predictions and collapse predictions and targets to class indices
y_pred = grid_result.best_estimator_.predict(X_test)
y_pred_labels = np.argmax(y_pred, axis=1)
y_test_labels = np.argmax(y_test, axis=1)

# Arguments shared by every metric call below
label_pair = (y_test_labels, y_pred_labels)
weighted_kwargs = {"average": "weighted", "zero_division": 0}

# Accuracy plus support-weighted precision / recall / F1
model_accuracy = accuracy_score(*label_pair)
model_precision = precision_score(*label_pair, **weighted_kwargs)
model_recall = recall_score(*label_pair, **weighted_kwargs)
model_f1 = f1_score(*label_pair, **weighted_kwargs)

# Print all metrics
print("\n=== Model Performance Metrics ===")
print(f"Accuracy:  {model_accuracy:.4f}")
print(f"Precision: {model_precision:.4f}")
print(f"Recall:    {model_recall:.4f}")
print(f"F1 Score:  {model_f1:.4f}")



=== Model Performance Metrics ===
Accuracy:  0.5617
Precision: 0.6384
Recall:    0.5617
F1 Score:  0.5395


In [None]:
# Naive Baseline
# Always predict the most frequent training class, as a floor that the
# trained model's metrics can be compared against.

# Column sums of the one-hot y_train are per-class counts; argmax picks the largest
most_common_class = np.argmax(np.sum(y_train, axis=0))

# Predict that class for all test samples
naive_predictions = np.full(shape=(y_test.shape[0],), fill_value=most_common_class)

# Convert one-hot y_test to class labels
y_test_labels = np.argmax(y_test, axis=1)

# Compute metrics (zero_division=0 because the never-predicted classes have
# no predicted samples, which would otherwise trigger a warning)
naive_accuracy = accuracy_score(y_test_labels, naive_predictions)
naive_precision = precision_score(y_test_labels, naive_predictions, average='weighted', zero_division=0)
naive_recall = recall_score(y_test_labels, naive_predictions, average='weighted', zero_division=0)
naive_f1 = f1_score(y_test_labels, naive_predictions, average='weighted', zero_division=0)

print("\n=== Naive Baseline Metrics ===")
print(f"Most Frequent Class: {most_common_class} ({label_encoder.inverse_transform([most_common_class])[0]})")
print(f"Accuracy: {naive_accuracy:.4f}")
print(f"Precision: {naive_precision:.4f}")
print(f"Recall: {naive_recall:.4f}")
print(f"F1 Score: {naive_f1:.4f}")

# Print confusion matrix for naive baseline.
# Uses naive_predictions: the previously referenced y_pred_naive is never
# defined in this notebook and raises NameError on a fresh kernel.
print("\nNaive Baseline Confusion Matrix:")
print(confusion_matrix(y_test_labels, naive_predictions))



=== Naive Baseline Metrics ===
Most Frequent Class: 0 (high)
Accuracy: 0.3234
Precision: 0.1046
Recall: 0.3234
F1 Score: 0.1580

Naive Baseline Confusion Matrix:
[[650   0   0]
 [684   0   0]
 [676   0   0]]
