In [8]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score,
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
)
import joblib

In [16]:
# Read the processed feature table and strip stray whitespace from every
# column label in the same expression, so columns can be addressed by their
# bare names.
df = pd.read_csv("data/processed/cleaned_with_features.csv").rename(columns=str.strip)

# Sanity check: overall size and the first rows of the share-related columns.
print("Data shape:", df.shape)
print(df[["shares", "shares_log", "viral"]].head())

Data shape: (39644, 63)
   shares  shares_log  viral
0     593    6.386879      0
1     711    6.568078      0
2    1500    7.313887      0
3    1200    7.090910      0
4     505    6.226537      0


In [17]:
# Targets as plain arrays: `shares_log` for regression, `viral` for
# classification.
y_clf = df["viral"].values
y_reg = df["shares_log"].values

# Columns excluded from the feature matrix: both targets plus url, timedelta
# and shares. The list is filtered against df.columns so that a column missing
# from the file does not make drop() raise.
drop_cols = [
    c
    for c in ("url", "timedelta", "shares", "shares_log", "viral")
    if c in df.columns
]

X = df.drop(columns=drop_cols)
feature_names = list(X.columns)

print("Feature matrix shape:", X.shape)
print("Number of features:", len(feature_names))

Feature matrix shape: (39644, 58)
Number of features: 58


In [18]:
# One 80/20 split shared by both tasks, stratified on the classification
# target; the fixed seed keeps the partition reproducible across runs.
(
    X_train, X_test,
    y_reg_train, y_reg_test,
    y_clf_train, y_clf_test,
) = train_test_split(
    X,
    y_reg,
    y_clf,
    stratify=y_clf,
    test_size=0.2,
    random_state=42,
)

In [None]:
# MLP Regressor for predicting 'shares_log'

# Scaling lives inside the pipeline, so calling fit() on the training split
# fits the scaler on that split only before the network sees the data.
mlp_reg = Pipeline(steps=[
    ("scaler", StandardScaler()),
    (
        "model",
        MLPRegressor(
            hidden_layer_sizes=(128, 64),
            activation="relu",
            solver="adam",
            alpha=1e-4,
            # NOTE(review): per the scikit-learn docs, learning_rate is only
            # used when solver="sgd"; with "adam" it appears to have no effect.
            learning_rate="adaptive",
            max_iter=200,
            early_stopping=True,
            n_iter_no_change=10,
            random_state=42,
            verbose=True,
        ),
    ),
])
mlp_reg.fit(X_train, y_reg_train)

y_reg_pred = mlp_reg.predict(X_test)

# Test-split metrics, computed on the same scale as the `shares_log` target.
r2 = r2_score(y_reg_test, y_reg_pred)
mae = mean_absolute_error(y_reg_test, y_reg_pred)
rmse = mean_squared_error(y_reg_test, y_reg_pred) ** 0.5  # square root of the MSE

print("\n=== MLP Regressor Performance (shares_log) ===")
print(f"RMSE (test): {rmse:.4f}")
print(f"MAE  (test): {mae:.4f}")
print(f"R^2  (test): {r2:.4f}")

Iteration 1, loss = 3.65188435
Validation score: -0.836118
Iteration 2, loss = 0.63546999
Validation score: -0.286957
Iteration 3, loss = 0.51121791
Validation score: -0.119679
Iteration 4, loss = 0.45618310
Validation score: -0.032001
Iteration 5, loss = 0.42698523
Validation score: 0.001137
Iteration 6, loss = 0.40560068
Validation score: 0.051737
Iteration 7, loss = 0.39192522
Validation score: 0.051127
Iteration 8, loss = 0.38552149
Validation score: 0.047157
Iteration 9, loss = 0.37775162
Validation score: 0.069273
Iteration 10, loss = 0.37157448
Validation score: 0.091421
Iteration 11, loss = 0.36398382
Validation score: 0.083414
Iteration 12, loss = 0.36218659
Validation score: 0.093329
Iteration 13, loss = 0.35796768
Validation score: 0.075254
Iteration 14, loss = 0.35509260
Validation score: 0.044093
Iteration 15, loss = 0.35308360
Validation score: 0.076933
Iteration 16, loss = 0.34828984
Validation score: 0.078252
Iteration 17, loss = 0.34522482
Validation score: 0.061526
It

In [21]:
# MLP Classifier for predicting 'viral'

# Same architecture and optimiser settings as the regressor cell; the scaler
# is part of the pipeline, so it is fitted on the training split only.
mlp_clf = Pipeline(steps=[
    ("scaler", StandardScaler()),
    (
        "model",
        MLPClassifier(
            hidden_layer_sizes=(128, 64),
            activation="relu",
            solver="adam",
            alpha=1e-4,
            # NOTE(review): per the scikit-learn docs, learning_rate is only
            # used when solver="sgd"; with "adam" it appears to have no effect.
            learning_rate="adaptive",
            max_iter=200,
            early_stopping=True,
            n_iter_no_change=10,
            random_state=42,
            verbose=True,
        ),
    ),
])
mlp_clf.fit(X_train, y_clf_train)

# Column 1 of predict_proba is taken as the score for the positive class
# (assumes `viral` is encoded 0/1 - TODO confirm).
y_clf_test_proba = mlp_clf.predict_proba(X_test)[:, 1]
y_clf_test_pred = mlp_clf.predict(X_test)
y_clf_train_pred = mlp_clf.predict(X_train)

def summarize_clf(y_true, y_pred, y_proba, split_name):
    """Collect the standard binary-classification metrics for one data split.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Hard class predictions, aligned with ``y_true``.
    y_proba : array-like
        Scores for the positive class, used only for ROC-AUC.
    split_name : str
        Label stored under ``"Split"`` (e.g. ``"Train"`` or ``"Test"``).

    Returns
    -------
    dict
        Keys ``"Split"``, ``"Accuracy"``, ``"Precision"``, ``"Recall"``,
        ``"F1"`` and ``"ROC-AUC"``, ready to become one row of a DataFrame.
    """
    # Precision/recall/F1 are undefined when a split has no predicted or no
    # true positives. scikit-learn's default ("warn") already returns 0.0 in
    # that case but emits UndefinedMetricWarning; passing zero_division=0
    # keeps the same value and makes the fallback explicit and silent.
    return {
        "Split": split_name,
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred, zero_division=0),
        "Recall": recall_score(y_true, y_pred, zero_division=0),
        "F1": f1_score(y_true, y_pred, zero_division=0),
        "ROC-AUC": roc_auc_score(y_true, y_proba),
    }

# Score both splits with the same helper. The train-split probabilities are
# computed inline here; the test-split ones are reused from y_clf_test_proba.
results = [
    summarize_clf(
        y_clf_train,
        y_clf_train_pred,
        mlp_clf.predict_proba(X_train)[:, 1],
        "Train",
    ),
    summarize_clf(y_clf_test, y_clf_test_pred, y_clf_test_proba, "Test"),
]

# One row per split, one column per metric.
results_df = pd.DataFrame(results)
print("\n=== MLP Classifier Performance (viral) ===")
print(results_df)


Iteration 1, loss = 0.32733148
Validation score: 0.897856
Iteration 2, loss = 0.30292431
Validation score: 0.898172
Iteration 3, loss = 0.29768588
Validation score: 0.897856
Iteration 4, loss = 0.29254527
Validation score: 0.898487
Iteration 5, loss = 0.28935218
Validation score: 0.897856
Iteration 6, loss = 0.28510620
Validation score: 0.897856
Iteration 7, loss = 0.28107620
Validation score: 0.897226
Iteration 8, loss = 0.27793372
Validation score: 0.894388
Iteration 9, loss = 0.27356435
Validation score: 0.897226
Iteration 10, loss = 0.27002486
Validation score: 0.896280
Iteration 11, loss = 0.26513003
Validation score: 0.896595
Iteration 12, loss = 0.26148511
Validation score: 0.894073
Iteration 13, loss = 0.25780741
Validation score: 0.892182
Iteration 14, loss = 0.25249803
Validation score: 0.893443
Iteration 15, loss = 0.24820385
Validation score: 0.888398
Validation score did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.

=== MLP Classifier Performance