In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import optuna
import tensorflow as tf

In [3]:
# Load and preprocess data
df = pd.read_csv("cleaned_reddit_posts.csv")
df = df.drop(columns=["id", "author", "score", "num_comments", "upvote_ratio"])

label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["popularity_bucket"])
y = to_categorical(df["label"])

cat_features = ["subreddit", "flair", "media_type"]
encoded_features = []

for col in cat_features:
    le = LabelEncoder()
    df[col] = df[col].fillna("unknown")
    encoded = le.fit_transform(df[col])
    encoded_features.append(encoded)

encoded_features.append(df["is_self"].astype(int))
encoded_features.append(df["nsfw"].astype(int))
encoded_features.append(df["created_hour"].fillna(0).astype(int))

X = np.stack(encoded_features, axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define objective function for Optuna
def objective(trial):
    dropout_rate = trial.suggest_float("dropout_rate", 0.1, 0.6)
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-2)
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64])
    epochs = trial.suggest_int("epochs", 10, 30)

    model = Sequential()
    model.add(Input(shape=(X.shape[1],)))
    model.add(Dense(256, activation="relu"))
    model.add(Dropout(dropout_rate))
    model.add(Dense(128, activation="relu"))
    model.add(Dropout(dropout_rate))
    model.add(Dense(64, activation="relu"))
    model.add(Dropout(dropout_rate))
    model.add(Dense(3, activation="softmax"))

    optimizer = Adam(learning_rate=learning_rate)
    model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])

    history = model.fit(
        X_train,
        y_train,
        validation_split=0.2,
        epochs=epochs,
        batch_size=batch_size,
        verbose=0,
    )

    val_accuracy = history.history["val_accuracy"][-1]
    return val_accuracy

# Run Optuna study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30, timeout=1800)  # 30 trials or 30 minutes max

print("Best trial:")
trial = study.best_trial
print(f"  Value (Val Accuracy): {trial.value:.4f}")
print("  Params:")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

# Train final model with best hyperparameters
best_params = trial.params
final_model = Sequential()
final_model.add(Input(shape=(X.shape[1],)))
final_model.add(Dense(256, activation="relu"))
final_model.add(Dropout(best_params["dropout_rate"]))
final_model.add(Dense(128, activation="relu"))
final_model.add(Dropout(best_params["dropout_rate"]))
final_model.add(Dense(64, activation="relu"))
final_model.add(Dropout(best_params["dropout_rate"]))
final_model.add(Dense(3, activation="softmax"))

optimizer = Adam(learning_rate=best_params["learning_rate"])
final_model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])

final_model.fit(X_train, y_train, epochs=best_params["epochs"], batch_size=best_params["batch_size"], verbose=1)

# Evaluate on test data
loss, accuracy = final_model.evaluate(X_test, y_test, verbose=0)
print(f"Test loss: {loss:.4f}, Test accuracy: {accuracy:.4f}")

y_pred_prob = final_model.predict(X_test)
y_pred = np.argmax(y_pred_prob, axis=1)
y_true = np.argmax(y_test, axis=1)

print("\nClassification Report:")
print(classification_report(y_true, y_pred, target_names=label_encoder.classes_))

print("Confusion Matrix:")
print(confusion_matrix(y_true, y_pred))

[I 2025-07-14 19:50:57,702] A new study created in memory with name: no-name-8bd9e31e-428f-4fc9-b6c3-11b8be87e5a9
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-2)
[I 2025-07-14 19:51:05,767] Trial 0 finished with value: 0.40920397639274597 and parameters: {'dropout_rate': 0.21152494892517817, 'learning_rate': 5.274321028813891e-05, 'batch_size': 32, 'epochs': 22}. Best is trial 0 with value: 0.40920397639274597.
[W 2025-07-14 19:51:08,531] Trial 1 failed with parameters: {'dropout_rate': 0.38105110193805003, 'learning_rate': 0.0008391145076777541, 'batch_size': 64, 'epochs': 22} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\h_ale\anaconda3\envs\cv\Lib\site-packages\optuna\study\_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\h_ale\AppData\Local\Temp\ipykernel_28724\2175779265.py", line 46, in objective
    history = model.fit(
   

KeyboardInterrupt: 