In [3]:
import pandas as pd             
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, GlobalAveragePooling1D, Concatenate, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.models import Sequential
from sklearn.metrics import accuracy_score

In [4]:
# Step 1: Load the dataset
df = pd.read_csv("../data/cleaned_reddit_posts.csv")

In [5]:
# Show how many entries fall into each popularity bucket to understand class balance
print(df["popularity_bucket"].value_counts())

popularity_bucket
high      3415
low       3316
medium    3316
Name: count, dtype: int64


In [6]:
# Step 2: Drop unneeded columns
df = df.drop(columns=["id", "author", "score", "num_comments", "upvote_ratio"])

We combine title and selftext to give the model more text information from the post, improving prediction.

In [7]:
# Step 3: Combine title and selftext
df["text"] = df["title"].fillna('') + " " + df["selftext"].fillna('')

In [8]:
# Step 4: Encode labels (popularity_bucket)
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["popularity_bucket"])
y = to_categorical(df["label"])

In [9]:
# Step 5: Tokenize text
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(df["text"])
sequences = tokenizer.texts_to_sequences(df["text"])
X_text = pad_sequences(sequences, maxlen=100)

In [10]:
# Step 6: Encode categorical features
cat_features = ["subreddit", "flair", "media_type"]
encoded_features = []

for col in cat_features:
    le = LabelEncoder()
    df[col] = df[col].fillna("unknown")
    encoded = le.fit_transform(df[col])
    encoded_features.append(encoded)

# Add binary features
encoded_features.append(df["is_self"].astype(int))
encoded_features.append(df["nsfw"].astype(int))
encoded_features.append(df["created_hour"].fillna(0).astype(int))

# Final non-text input
X_other = np.stack(encoded_features, axis=1)

# horizontally join features into one array
X_combined = np.hstack([X_text, X_other])  

In [11]:
# Step 7: Build the model
def create_model(dropout_rate=0.3):
    model = Sequential()
    model.add(Input(shape=(X_combined.shape[1],)))  # single combined input
    model.add(Dense(256, activation='relu'))  #128
    model.add(Dropout(dropout_rate))
    model.add(Dense(128, activation='relu'))  #64
    model.add(Dropout(dropout_rate))
    
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(dropout_rate))

    model.add(Dense(3, activation='softmax'))
    
    optimizer = Adam(learning_rate=0.001)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

In [14]:
# Compile and train
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

print("Type of X_train:", type(X_train))
print("X_train shape:", X_train.shape)
print("X_train[0] shape:", np.array(X_train[0]).shape)

# Wrap model for GridSearch
model = KerasClassifier(model=create_model, verbose=0)

# Define hyperparameter grid
param_grid = {
    'batch_size': [16, 32],
    'epochs': [20, 30],  #10, 15
    #'dropout_rate': [0.3, 0.5]
    "model__dropout_rate": [0.3, 0.5]
}

# Perform GridSearchCV on training data
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)
grid_result = grid.fit(X_train, y_train)

# Print best hyperparameters
print(f"Best params: {grid_result.best_params_}")
print(f"Best accuracy: {grid_result.best_score_:.4f}")

# Evaluate on unseen test data
y_pred = grid_result.best_estimator_.predict(X_test)

# Convert predictions to class labels if they are probabilities or one-hot
y_pred_labels = np.argmax(y_pred, axis=1)  # add this line

# Convert one-hot y_test to class labels
y_test_labels = np.argmax(y_test, axis=1)

accuracy = accuracy_score(y_test_labels, y_pred_labels)
print(f"Test accuracy: {accuracy:.4f}")

Type of X_train: <class 'numpy.ndarray'>
X_train shape: (8037, 106)
X_train[0] shape: (106,)
Best params: {'batch_size': 32, 'epochs': 20, 'model__dropout_rate': 0.3}
Best accuracy: 0.3705
Test accuracy: 0.3234


In [15]:
# Evaluate
# Get the underlying Keras model from the best estimator
best_model = grid_result.best_estimator_.model_

# Evaluate on test data using combined features
loss, accuracy = best_model.evaluate(X_test, y_test, verbose=0)
print(f"Test loss: {loss:.4f}, Test accuracy: {accuracy:.4f}")

Test loss: 1.0995, Test accuracy: 0.3234
