# Final integrated notebook: Predicting Virality of Online News Articles

This notebook is a single, sequential pipeline that implements Steps 1–8 from project planning: data cleaning, feature engineering, text features, EDA, baseline & ensemble models, clustering, neural models, scenario simulations, and a final predict API. It writes artifacts into `data/processed/`, `models/`, and `figures/` and is intended to be run top-to-bottom. Heavy tasks (Word2Vec, RNN) are optional and disabled by default.

TL;DR: Run the cells in order. Use the `run_all(skip_heavy=True)` cell at the end to run a fast pipeline.

In [None]:
# Imports and environment setup
import os
import sys
import time
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.preprocessing import StandardScaler

# Optional imports (graceful)
try:
    import xgboost as xgb
except Exception:
    xgb = None

try:
    import shap
except Exception:
    shap = None

try:
    import gensim
    from gensim.models import Word2Vec
except Exception:
    gensim = None

try:
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    vader_available = True
except Exception:
    vader_available = False

try:
    import tensorflow as tf
    from tensorflow import keras
except Exception:
    tf = None

print('Loaded core libraries. Optional packages: xgboost={}, shap={}, gensim={}, vader={}'.format(bool(xgb), bool(shap), bool(gensim), vader_available))
# 'seaborn-v0_8' is only registered on matplotlib >= 3.6; older releases ship
# the same style sheet as 'seaborn'. Pick whichever exists instead of raising.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# Basic paths: everything is resolved relative to the current working directory.
PROJECT_ROOT = Path('.').resolve()
DATA_RAW = PROJECT_ROOT / 'data' / 'raw'
DATA_PROCESSED = PROJECT_ROOT / 'data' / 'processed'
FIGURES = PROJECT_ROOT / 'figures'
MODELS_DIR = PROJECT_ROOT / 'models'
# Output folders are created eagerly; DATA_RAW is an input and is left alone.
for p in [DATA_PROCESSED, FIGURES, MODELS_DIR]:
    p.mkdir(parents=True, exist_ok=True)

# Single seed shared by every step that accepts a random_state.
SEED = 42
np.random.seed(SEED)


In [None]:
# Helper utilities
import json

def set_seed(seed=42):
    """Seed the global Python and NumPy random number generators.

    Args:
        seed: Integer seed applied to both ``random`` and ``numpy.random``.
    """
    # Local import: the notebook's import cell does not bring in `random`.
    import random

    random.seed(seed)
    np.random.seed(seed)


def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with normalized column names.

    Each label is converted to ``str``, stripped of surrounding whitespace,
    lower-cased, and has its spaces replaced by underscores. The input frame
    is not modified.

    Args:
        df: Frame whose column labels should be normalized.

    Returns:
        A new DataFrame with the same data and normalized column labels.
    """
    out = df.copy()
    # str() first: non-string labels (e.g. integer headers) have no .strip().
    out.columns = [str(label).strip().lower().replace(' ', '_') for label in out.columns]
    return out


def ensure_dirs():
    """Create the processed-data, figures and models folders if they are missing."""
    for directory in (DATA_PROCESSED, FIGURES, MODELS_DIR):
        directory.mkdir(parents=True, exist_ok=True)


def save_df(df, path: Path):
    """Write ``df`` to ``path`` as CSV without the index, creating parent folders.

    Args:
        df: DataFrame to persist.
        path: Destination file; a ``str`` is accepted and converted to ``Path``.
    """
    target = Path(path)
    os.makedirs(target.parent, exist_ok=True)
    df.to_csv(target, index=False)
    print('Saved:', target)


def load_df_if_exists(path: Path):
    """Read the CSV at ``path`` into a DataFrame, or return None if nothing exists there."""
    candidate = Path(path)
    return pd.read_csv(candidate) if candidate.exists() else None

print('Helper utilities defined')

## Step 1 — Data loading, typing, deduplication, and saving master CSV

This section loads the raw CSV, normalizes the column names, drops duplicate rows, reports dtypes and missing values, and writes `data/processed/cleaned_base.csv`. The second file, `data/processed/cleaned_with_features.csv`, is written by the feature-engineering cell in Step 2 below.

In [None]:
# Step 1: load and clean

def load_and_clean(raw_path=DATA_RAW / 'OnlineNewsPopularity.csv'):
    """Load the raw CSV, normalize its columns, drop duplicates and save the result.

    The cleaned frame is written to ``DATA_PROCESSED / 'cleaned_base.csv'``.

    Args:
        raw_path: Location of the raw CSV (``Path`` or ``str``).

    Returns:
        The cleaned DataFrame with a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: If ``raw_path`` does not exist.
        KeyError: If no ``shares`` column is present after normalization.
    """
    raw_path = Path(raw_path)  # tolerate plain string paths
    if not raw_path.exists():
        raise FileNotFoundError(f"Raw data not found: {raw_path}")

    df = pd.read_csv(raw_path)
    print('Initial shape:', df.shape)

    # normalize_columns strips whitespace, so a header such as ' shares'
    # already arrives here as 'shares'; no separate rename is needed.
    df = normalize_columns(df)
    print('Columns normalized')

    # quick dtype info and missing
    print(df.dtypes.value_counts())
    print('Total missing values:', df.isna().sum().sum())

    before = len(df)
    # Reset the index so that later positional joins line up with the rows
    # (drop_duplicates otherwise leaves gaps in the index).
    df = df.drop_duplicates().reset_index(drop=True)
    after = len(df)
    print(f'Dropped {before-after} duplicate rows')

    # Explicit raise instead of assert: asserts are stripped under `python -O`.
    if 'shares' not in df.columns:
        raise KeyError('shares column not found after normalization')

    # Save cleaned base before feature engineering
    save_df(df, DATA_PROCESSED / 'cleaned_base.csv')
    return df

# Run step 1 load
try:
    # Loads data/raw/OnlineNewsPopularity.csv by default and writes cleaned_base.csv.
    df_raw = load_and_clean()
except FileNotFoundError as e:
    # A missing raw file is not fatal: later cells test `df_raw is None`
    # and skip their work instead of failing.
    print(e)
    df_raw = None


## Step 2 — Core feature engineering (shares_log, viral, lengths, keywords, date parts)

Define deterministic feature transformers and write `cleaned_with_features.csv`. Also include simple unit-style checks (assertions) that can be later converted to pytest tests.

In [None]:
# Feature engineering functions

def add_core_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` enriched with the core engineered features.

    Columns added (each only when its inputs are present):
      * ``shares_log``  - ``log1p`` of ``shares`` (``shares`` is coerced to int,
        unparseable values become 0).
      * ``viral``       - 1 when ``shares`` is at or above its 90th percentile.
      * ``headline_char_len`` / ``headline_word_len`` - from a text column
        (``title`` or ``headline``). When only the numeric ``n_tokens_title``
        count exists, it is used as ``headline_word_len`` and no character
        length is produced.
      * ``article_length``, ``keyword_count``, ``keyword_density``.
      * ``weekday`` / ``is_weekend`` - decoded from ``weekday_is_*`` flags.

    Args:
        df: Input frame; must contain a ``shares`` column.

    Returns:
        A new DataFrame; the input is not modified.

    Raises:
        KeyError: If ``shares`` is missing.
    """
    if 'shares' not in df.columns:
        raise KeyError('shares column missing')

    df = df.copy()
    df['shares'] = pd.to_numeric(df['shares'], errors='coerce').fillna(0).astype(int)
    df['shares_log'] = np.log1p(df['shares'])

    # viral = top 10%
    threshold = df['shares'].quantile(0.90)
    df['viral'] = (df['shares'] >= threshold).astype(int)

    # headline lengths
    text_col = next((c for c in ('title', 'headline') if c in df.columns), None)
    if text_col is not None:
        # fillna before astype(str): otherwise NaN is measured as the text 'nan'.
        headline = df[text_col].fillna('').astype(str)
        df['headline_char_len'] = headline.map(len)
        df['headline_word_len'] = headline.str.split().map(len)
    elif 'n_tokens_title' in df.columns:
        # n_tokens_title is already a token count, not text: measuring the
        # length of its string form (e.g. '12.0') would be meaningless.
        df['headline_word_len'] = pd.to_numeric(df['n_tokens_title'], errors='coerce')

    # article length
    if 'n_tokens_content' in df.columns:
        df['article_length'] = df['n_tokens_content']

    # keywords
    if 'num_keywords' in df.columns:
        df['keyword_count'] = df['num_keywords']
    # keyword density; zero-length articles yield NaN rather than inf
    if 'keyword_count' in df.columns and 'article_length' in df.columns:
        denom = df['article_length'].replace({0: np.nan})
        df['keyword_density'] = df['keyword_count'] / denom

    # weekday flags from possible one-hot columns
    weekday_cols = [c for c in df.columns if str(c).startswith('weekday_is_')]
    if weekday_cols:
        flags = df[weekday_cols]
        weekday = flags.idxmax(axis=1).str.replace('weekday_is_', '', regex=False)
        # idxmax returns the first column for an all-zero row; blank those out
        # so rows without any flag are not mislabelled.
        df['weekday'] = weekday.where(flags.max(axis=1) > 0)
        df['is_weekend'] = df['weekday'].isin(['saturday', 'sunday']).astype(int)

    return df

# Run feature engineering if df_raw is present
if df_raw is None:
    # Step 1 found no raw CSV, so there is nothing to engineer.
    df_feat = None
    print('Skipping feature engineering because raw data not loaded')
else:
    df_feat = add_core_features(df_raw)
    save_df(df_feat, DATA_PROCESSED / 'cleaned_with_features.csv')

## Step 2b — Text processing pipeline: TF-IDF, PCA, sentiment, optional Word2Vec

This cell builds a TF-IDF -> PCA pipeline for headlines, computes sentiment scores (VADER/TextBlob fallback), and optionally trains a Word2Vec model to produce headline-average embeddings. Artifacts (vectorizer, pca, optional w2v) are saved under `models/`.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

# Configurable flags read by build_text_features below.
TRAIN_WORD2VEC = False  # set True to train W2V (slow)
W2V_SIZE = 100  # passed to Word2Vec as vector_size
TFIDF_MAX_FEATURES = 10000  # passed to TfidfVectorizer as max_features
PCA_COMPONENTS = 30  # upper bound on the number of headline_pc_* columns


def _headline_sentiment(texts):
    """Score each headline; return a DataFrame indexed like ``texts`` or None.

    VADER is used when NLTK and its lexicon are usable (columns ``sent_neg``,
    ``sent_neu``, ``sent_pos``, ``sent_compound``); otherwise TextBlob
    (``sent_polarity``, ``sent_subjectivity``). None means neither worked.
    """
    if vader_available:
        try:
            sia = SentimentIntensityAnalyzer()
        except LookupError:
            # NLTK is importable but the vader_lexicon resource is not downloaded.
            sia = None
        if sia is not None:
            scores = pd.DataFrame([sia.polarity_scores(t) for t in texts], index=texts.index)
            return scores.add_prefix('sent_')
    try:
        from textblob import TextBlob
    except ImportError:
        print('No sentiment package available; skipping sentiment features')
        return None
    try:
        rows = [tuple(TextBlob(t).sentiment) for t in texts]
    except Exception as exc:
        # Best effort, as before, but report the cause instead of hiding it.
        print('Sentiment scoring failed; skipping sentiment features:', exc)
        return None
    return pd.DataFrame(rows, columns=['sent_polarity', 'sent_subjectivity'], index=texts.index)


def build_text_features(df, headline_col='title'):
    """Add headline text features to a copy of ``df`` and persist the artifacts.

    Steps: TF-IDF on the headline column -> PCA (``headline_pc_*`` columns) ->
    sentiment scores (``sent_*``) -> optional averaged Word2Vec embeddings
    (``head_emb_*``, only when ``TRAIN_WORD2VEC`` is set and gensim imported).
    The fitted vectorizer, PCA and optional Word2Vec model are dumped under
    ``MODELS_DIR`` and the enriched frame is written to
    ``DATA_PROCESSED / 'features_complete.csv'``.

    Args:
        df: Frame containing a headline text column.
        headline_col: Name of that column; ``'headline'`` is tried as a fallback.

    Returns:
        The enriched copy, or an unchanged copy when no headline column exists
        (in which case nothing is saved).
    """
    import joblib

    df = df.copy()
    if headline_col not in df.columns:
        # try alternate
        if 'headline' in df.columns:
            headline_col = 'headline'
        else:
            print('No headline column found; skipping text features')
            return df

    # fillna must come first: astype(str) would turn NaN into the text 'nan'.
    texts = df[headline_col].fillna('').astype(str)

    # TF-IDF (named `vectorizer`, not `tf`, to avoid shadowing the TensorFlow alias)
    vectorizer = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES, ngram_range=(1,2), stop_words='english')
    X_tfidf = vectorizer.fit_transform(texts)
    joblib.dump(vectorizer, MODELS_DIR / 'tfidf_vectorizer.joblib')
    print('Saved TF-IDF vectorizer')

    # PCA on TF-IDF to reduce dimensionality. The component count may exceed
    # neither the number of rows nor the number of TF-IDF columns.
    n_comp = min(PCA_COMPONENTS, *X_tfidf.shape)
    pca = PCA(n_components=n_comp, random_state=SEED)
    # NOTE(review): toarray() densifies the whole matrix (rows x up to
    # TFIDF_MAX_FEATURES floats); confirm this fits in memory for the full data.
    X_pca = pca.fit_transform(X_tfidf.toarray())
    joblib.dump(pca, MODELS_DIR / 'tfidf_pca.joblib')
    print('Saved TF-IDF PCA')

    # attach PCA columns
    for i in range(X_pca.shape[1]):
        df[f'headline_pc_{i+1}'] = X_pca[:, i]

    # Sentiment: assign by position so a non-default index on df (e.g. after
    # drop_duplicates) cannot misalign scores or append all-NaN rows.
    sent_df = _headline_sentiment(texts)
    if sent_df is not None:
        for col in sent_df.columns:
            df[col] = sent_df[col].to_numpy()

    # Word2Vec optional
    if TRAIN_WORD2VEC and gensim is not None:
        tokenized = [str(t).split() for t in texts]
        w2v = Word2Vec(sentences=tokenized, vector_size=W2V_SIZE, window=5, min_count=1, workers=2, seed=SEED)
        joblib.dump(w2v, MODELS_DIR / 'word2vec_headlines.joblib')
        # average embeddings per headline; empty headlines get a zero vector
        emb_rows = []
        for tokens in tokenized:
            vecs = [w2v.wv[t] for t in tokens if t in w2v.wv]
            emb_rows.append(np.mean(vecs, axis=0) if vecs else np.zeros(W2V_SIZE))
        emb_matrix = np.vstack(emb_rows)
        # attach first 10 embedding dimensions as example
        for i in range(min(10, emb_matrix.shape[1])):
            df[f'head_emb_{i+1}'] = emb_matrix[:, i]

    # Save enriched dataset
    save_df(df, DATA_PROCESSED / 'features_complete.csv')
    print('Text features built and saved to features_complete.csv')
    return df

# Build text features if base features exist
# Text features need the Step 2 output; propagate None when it is absent.
df_text = build_text_features(df_feat) if df_feat is not None else None


## Step 3 — Exploratory Data Analysis (EDA)

Produce quick plots and summaries used for interpretation and reporting. All outputs, including the short text summary `EDA_summary_output.txt`, are saved under `figures/`.

In [None]:
def _save_histogram(series, title, path, bins=80):
    """Plot a histogram of ``series`` with ``title`` and save it to ``path``."""
    plt.figure(figsize=(8,4))
    sns.histplot(series, bins=bins)
    plt.title(title)
    plt.savefig(path)
    plt.close()


def run_eda(df, out_dir=FIGURES):
    """Write the EDA figures and a short text summary into ``out_dir``.

    Outputs: ``shares_raw.png``, ``shares_log.png``, ``top_corr_heatmap.png``
    and ``EDA_summary_output.txt`` (row/column counts, missing values,
    ``shares`` statistics and the strongest correlations with ``shares_log``).

    Args:
        df: Frame containing at least ``shares`` and ``shares_log``.
        out_dir: Destination folder; created if missing.
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Absolute correlation of every numeric column with shares_log
    # (shares_log itself ranks first with 1.0).
    corr = df.corr(numeric_only=True)['shares_log'].abs().sort_values(ascending=False)
    top = corr.head(15).index

    with open(out_dir / 'EDA_summary_output.txt', 'w', encoding='utf-8') as f:
        f.write('EDA Summary\n')
        f.write(f'Rows: {len(df)}, columns: {df.shape[1]}\n')
        f.write(f'Total missing values: {int(df.isna().sum().sum())}\n\n')
        f.write('shares (describe):\n')
        f.write(df['shares'].describe().to_string() + '\n\n')
        f.write('Top absolute correlations with shares_log:\n')
        f.write(corr.head(15).to_string() + '\n')

    # Distribution of shares raw and log
    _save_histogram(df['shares'], 'Distribution of shares (raw)', out_dir / 'shares_raw.png')
    _save_histogram(df['shares_log'], 'Distribution of shares (log)', out_dir / 'shares_log.png')

    # Heatmap of the pairwise correlations among the top columns
    plt.figure(figsize=(10,8))
    sns.heatmap(df[top].corr(), cmap='coolwarm')
    plt.title('Top feature correlation heatmap')
    plt.savefig(out_dir / 'top_corr_heatmap.png')
    plt.close()

    print('EDA completed; figures saved to', out_dir)

# run EDA if features are present
if df_text is None:
    print('Skipping EDA: features not built')
else:
    run_eda(df_text, FIGURES)

## Step 4 — Baseline models with CV (Linear, Ridge, Lasso; optional Logistic for `viral`)

This section contains utilities to train and evaluate baseline regressors/classifiers using 5-fold CV and saves results to `baseline_model_results.csv`.

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, roc_auc_score, accuracy_score

METRIC_RESULTS = []

def rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)


def evaluate_cv_regressors(X, y, cv=5):
    """Cross-validate Linear, Ridge and Lasso regressors and save the scores.

    Each model is scored with a shuffled, seeded K-fold split. The reported
    RMSE is the square root of the mean fold MSE; R2 is the mean fold R2.
    Results are written to ``baseline_model_results.csv`` in the current
    working directory.

    Args:
        X: Feature matrix.
        y: Regression target.
        cv: Number of folds.

    Returns:
        DataFrame with columns ``model``, ``rmse`` and ``r2``.
    """
    candidates = [('Linear', LinearRegression()), ('Ridge', Ridge()), ('Lasso', Lasso())]
    splitter = KFold(n_splits=cv, shuffle=True, random_state=SEED)

    def _score(label, estimator):
        cv_out = cross_validate(estimator, X, y, cv=splitter, scoring=('neg_mean_squared_error', 'r2'))
        return {
            'model': label,
            'rmse': np.sqrt(-cv_out['test_neg_mean_squared_error'].mean()),
            'r2': cv_out['test_r2'].mean(),
        }

    res_df = pd.DataFrame([_score(label, estimator) for label, estimator in candidates])
    res_df.to_csv('baseline_model_results.csv', index=False)
    print('Saved baseline_model_results.csv')
    return res_df

# Quick feature selection: numeric cols except target
if df_text is None:
    baseline_results = None
    print('Skipping baseline models: features not built')
else:
    # Every numeric column except the target and its direct derivatives.
    target_like = ['shares', 'shares_log', 'viral']
    numeric_cols = [
        col
        for col in df_text.select_dtypes(include=[np.number]).columns.tolist()
        if col not in target_like
    ]
    X = df_text[numeric_cols].fillna(0)
    y = df_text['shares_log']
    baseline_results = evaluate_cv_regressors(X, y)

## Step 5 — Ensemble models (RandomForest, optional XGBoost) and explainability

Train RandomForest with a small grid and XGBoost if available. Save best model(s) to `models/` and compute feature importances. If SHAP is available, compute summary plot.

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import joblib

if df_text is not None:
    # Small grid over forest size and depth, scored by negative MSE with 3-fold CV.
    rf = RandomForestRegressor(random_state=SEED)
    param_grid = {'n_estimators': [100, 200], 'max_depth': [6, 12]}
    g = GridSearchCV(rf, param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=2)
    g.fit(X, y)
    print('Best RF params:', g.best_params_)
    best_rf = g.best_estimator_
    joblib.dump(best_rf, MODELS_DIR / 'best_random_forest.joblib')
    print('Saved best_random_forest.joblib')

    # XGBoost optional (default hyper-parameters, fitted on the full data)
    if xgb is not None:
        xgb_model = xgb.XGBRegressor(random_state=SEED, verbosity=0)
        xgb_model.fit(X, y)
        joblib.dump(xgb_model, MODELS_DIR / 'xgb_model.joblib')
        print('Saved xgb_model.joblib')

    # Feature importances
    try:
        importances = pd.Series(best_rf.feature_importances_, index=X.columns).sort_values(ascending=False)
        print(importances.head(20))
    except Exception:
        print('Could not compute feature importances')

    # SHAP explainability (optional)
    if shap is not None:
        # Passing `best_rf.predict` selects a model-agnostic explainer whose
        # cost grows with rows x features x model calls. Explaining every row
        # is impractically slow, so explain a seeded random sample instead.
        X_shap = X.sample(n=min(len(X), 500), random_state=SEED)
        explainer = shap.Explainer(best_rf.predict, X_shap)
        shap_values = explainer(X_shap)
        shap.summary_plot(shap_values, X_shap, show=False)
        # bbox_inches='tight' keeps long feature labels inside the saved image.
        plt.savefig(FIGURES / 'shap_summary.png', bbox_inches='tight')
        plt.close()
        joblib.dump(explainer, MODELS_DIR / 'shap_explainer.joblib')
        print('Saved SHAP explainer and plot')
else:
    print('Skipping ensemble training: features not built')

## Step 6 — Clustering & cluster-feature integration (KMeans + optional advanced)

Compute KMeans over a range of k, choose best by silhouette, save cluster labels as `cluster_kmeans` and write `features_with_clusters.csv` and cluster metrics.

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

if df_text is not None:
    # Use every float/int column (this already covers the headline PCs) except
    # the targets. 'cluster_kmeans' is excluded so that re-running this cell
    # does not cluster on the labels produced by the previous run.
    cluster_features = [c for c in df_text.columns if (c.startswith('headline_pc_') or df_text[c].dtype.kind in 'fi')]
    cluster_features = [c for c in cluster_features if c not in ['shares','shares_log','viral','cluster_kmeans']]
    # KMeans is distance based: standardize so that large-magnitude columns
    # do not dominate the Euclidean distances.
    X_cluster = StandardScaler().fit_transform(df_text[cluster_features].fillna(0))

    # The silhouette needs pairwise distances (quadratic in rows); estimate it
    # on a seeded subsample so the k sweep stays fast.
    sil_sample = min(len(X_cluster), 5000)

    sil_scores = {}
    inertias = {}
    for k in range(2, 11):
        km = KMeans(n_clusters=k, random_state=SEED, n_init=10)
        labels = km.fit_predict(X_cluster)
        sil = silhouette_score(X_cluster, labels, sample_size=sil_sample, random_state=SEED)
        sil_scores[k] = sil
        inertias[k] = km.inertia_
        print(f'k={k}: silhouette={sil:.4f}')

    best_k = max(sil_scores, key=sil_scores.get)
    print('Best k by silhouette:', best_k)
    km_final = KMeans(n_clusters=best_k, random_state=SEED, n_init=10)
    df_text['cluster_kmeans'] = km_final.fit_predict(X_cluster)
    save_df(df_text, DATA_PROCESSED / 'features_with_clusters.csv')
    pd.DataFrame({'k': list(sil_scores.keys()), 'silhouette': list(sil_scores.values()), 'inertia': list(inertias.values())}).to_csv(DATA_PROCESSED / 'cluster_metrics.csv', index=False)
    print('Saved clustering outputs')
else:
    print('Skipping clustering: features not built')

## Step 7 — Neural models (MLP regressor; optional LSTM headline classifier)

Train a small MLP regressor using Keras for `shares_log`. The LSTM section is optional and will be skipped by default.

In [None]:
if tf is not None and df_text is not None:
    from tensorflow.keras import layers, models, callbacks
    # Seed TensorFlow so weight initialization and dropout are repeatable.
    tf.random.set_seed(SEED)

    # Simple MLP on every numeric column except the targets
    features = [c for c in df_text.select_dtypes(include=[np.number]).columns if c not in ['shares','shares_log','viral']]
    # Keep a DataFrame (not .values) so the fitted scaler records
    # feature_names_in_: the saved scaler then documents the exact column
    # order the network expects at prediction time.
    X_mlp = df_text[features].fillna(0)
    y_mlp = df_text['shares_log'].values

    scaler = StandardScaler()
    X_mlp_s = scaler.fit_transform(X_mlp)
    joblib.dump(scaler, MODELS_DIR / 'mlp_scaler.joblib')

    model = models.Sequential([
        layers.Input(shape=(X_mlp_s.shape[1],)),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(64, activation='relu'),
        layers.Dense(1)
    ])
    model.compile(optimizer='adam', loss='mse')
    # Stop once validation loss stalls for 5 epochs and keep the best weights.
    es = callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    history = model.fit(X_mlp_s, y_mlp, validation_split=0.1, epochs=50, batch_size=64, callbacks=[es], verbose=1)
    model.save(MODELS_DIR / 'mlp_regressor.keras')
    print('Saved MLP regressor')
else:
    print('Skipping neural models: tensorflow missing or features not built')

## Step 8 — Final prediction API and scenario simulations

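Define `predict_from_dict()`, which loads saved preprocessors and model artifacts and returns the predicted shares together with `pred_viral_prob`. Note that `pred_viral_prob` is a 0/1 flag rather than a calibrated probability: it is 1.0 when the predicted shares reach the 90th percentile of `shares` in `features_complete.csv`. Also include a few scenario simulations that tweak headline or metadata and show predicted deltas.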

In [None]:
def load_artifacts():
    """Collect whichever saved artifacts exist on disk into a dict.

    Possible keys: ``tfidf``, ``pca``, ``rf``, ``mlp_scaler`` (joblib files
    under ``MODELS_DIR``) and ``features_complete`` (the processed CSV, or
    None when the file exists but cannot be read). Missing files are skipped.
    """
    import joblib

    joblib_files = {
        'tfidf': 'tfidf_vectorizer.joblib',
        'pca': 'tfidf_pca.joblib',
        'rf': 'best_random_forest.joblib',
        'mlp_scaler': 'mlp_scaler.joblib',
    }
    artifacts = {}
    for key, filename in joblib_files.items():
        location = MODELS_DIR / filename
        if location.exists():
            artifacts[key] = joblib.load(location)

    # dataset used for thresholds and population plots (if available)
    features_csv = DATA_PROCESSED / 'features_complete.csv'
    if features_csv.exists():
        try:
            artifacts['features_complete'] = pd.read_csv(features_csv)
        except Exception:
            artifacts['features_complete'] = None
    return artifacts

artifacts = load_artifacts()

def _add_basic_features(df_rec):
    """Return a copy of ``df_rec`` with the basic derived columns added.

    Adds headline character/word lengths from ``title``, aliases
    ``n_tokens_content`` -> ``article_length`` and ``num_keywords`` ->
    ``keyword_count``, and computes ``keyword_density`` (NaN for a zero-length
    article). Each column is added only when its inputs exist.
    """
    out = df_rec.copy()

    def _token_count(tokens):
        return len(tokens) if isinstance(tokens, list) else 0

    if 'title' in out.columns:
        title = out['title'].astype(str)
        out['headline_char_len'] = title.map(len)
        out['headline_word_len'] = title.str.split().map(_token_count)

    for source, alias in (('n_tokens_content', 'article_length'), ('num_keywords', 'keyword_count')):
        if source in out.columns:
            out[alias] = out[source]

    if {'keyword_count', 'article_length'} <= set(out.columns):
        safe_length = out['article_length'].replace({0: np.nan})
        out['keyword_density'] = out['keyword_count'] / safe_length
    return out

def _viral_flag(pred_shares, artifacts):
    """Return 1.0 if ``pred_shares`` reaches the dataset's 90th-percentile shares, else 0.0.

    The threshold comes from ``artifacts['features_complete']['shares']``;
    0.0 is returned when that dataset is unavailable or unusable.
    """
    reference = artifacts.get('features_complete')
    if reference is None:
        return 0.0
    try:
        return float(pred_shares >= reference['shares'].quantile(0.90))
    except (KeyError, TypeError, ValueError):
        return 0.0


def predict_from_dict(record: dict, artifacts=artifacts, return_raw=False):
    """Predict shares and a viral flag for a single record (dict).

    The record is normalized and enriched the same way as the training data
    (basic features plus ``headline_pc_*`` when the TF-IDF/PCA artifacts and a
    ``title`` are available). The random forest is preferred; the saved MLP is
    the fallback. Features the model expects but the record lacks are filled
    with 0.

    Args:
        record: Mapping of raw column name -> value for one article.
        artifacts: Dict as returned by ``load_artifacts()``.
        return_raw: When True, the RF result also includes ``pred_log``
            (the MLP result always includes it).

    Returns:
        Dict with ``pred_shares`` and ``pred_viral_prob`` (a 0.0/1.0 threshold
        flag, not a calibrated probability), optionally ``pred_log``.

    Raises:
        RuntimeError: If no usable model is available, the MLP cannot be
            loaded, or the MLP feature layout cannot be reconstructed.
    """
    df_rec = pd.DataFrame([record])
    df_rec = normalize_columns(df_rec)
    df_rec = _add_basic_features(df_rec)

    # text features: tfidf + pca -> headline_pc_*
    if 'tfidf' in artifacts and 'pca' in artifacts and 'title' in df_rec.columns:
        try:
            X_t = artifacts['tfidf'].transform(df_rec['title'].astype(str))
            X_p = artifacts['pca'].transform(X_t.toarray())
            for i in range(X_p.shape[1]):
                df_rec[f'headline_pc_{i+1}'] = X_p[:, i]
        except Exception as e:
            print('Warning: could not compute tfidf/pca for record:', e)

    # prefer RF model
    if 'rf' in artifacts:
        model = artifacts['rf']
        if hasattr(model, 'feature_names_in_'):
            features = list(model.feature_names_in_)
        else:
            features = list(df_rec.select_dtypes(include=[np.number]).columns)
        # reindex adds every missing feature as 0 in one step and fixes the column order
        X_rec = df_rec.reindex(columns=features, fill_value=0).fillna(0)
        pred_log = model.predict(X_rec)[0]
        pred_shares = float(np.expm1(pred_log))
        out = {'pred_shares': pred_shares, 'pred_viral_prob': _viral_flag(pred_shares, artifacts)}
        if return_raw:
            out['pred_log'] = float(pred_log)
        return out

    # fallback to MLP regressor
    if 'mlp_scaler' in artifacts and (MODELS_DIR / 'mlp_regressor.keras').exists():
        scaler = artifacts['mlp_scaler']
        try:
            model = tf.keras.models.load_model(MODELS_DIR / 'mlp_regressor.keras')
        except Exception as e:
            raise RuntimeError('Could not load MLP model: ' + str(e)) from e
        if hasattr(scaler, 'feature_names_in_'):
            # The scaler was fitted on a DataFrame: rebuild the training layout.
            features = list(scaler.feature_names_in_)
        else:
            # No stored names: the record's numeric columns are only usable
            # if they happen to match the fitted width.
            features = list(df_rec.select_dtypes(include=[np.number]).columns)
            expected = getattr(scaler, 'n_features_in_', len(features))
            if len(features) != expected:
                raise RuntimeError(
                    f'MLP scaler expects {expected} features but the record provides '
                    f'{len(features)}; refit the scaler on a DataFrame so feature names are stored'
                )
        X_rec = df_rec.reindex(columns=features, fill_value=0).fillna(0)
        Xs = scaler.transform(X_rec)
        pred_log = model.predict(Xs)[0,0]
        pred_shares = float(np.expm1(pred_log))
        return {'pred_shares': pred_shares, 'pred_viral_prob': _viral_flag(pred_shares, artifacts), 'pred_log': float(pred_log)}

    raise RuntimeError('No model artifacts available; please train an ensemble or MLP first')

# Dataset-level diagnostics and visualization helpers
def plot_predictions_vs_actual(artifacts=artifacts, sample_n=1000):
    """Plot RF predictions against actual shares for the processed dataset.

    Writes ``pred_vs_actual_log.png`` (scatter, log1p scale) and
    ``residuals_log.png`` (histogram of log-scale residuals) into ``FIGURES``.
    Nothing is plotted when the dataset or the RF model is missing.

    Args:
        artifacts: Dict as returned by ``load_artifacts()``.
        sample_n: Maximum number of points drawn in the scatter plot (a seeded
            random sample); ``None`` plots every row. The residual histogram
            always uses all rows.
    """
    if 'features_complete' not in artifacts or artifacts['features_complete'] is None:
        print('features_complete not available; skipping dataset plots')
        return
    if 'rf' not in artifacts:
        print('RF model not available; skipping dataset-level prediction plots')
        return

    df = artifacts['features_complete'].copy()
    model = artifacts['rf']
    # choose numeric features matching model
    if hasattr(model, 'feature_names_in_'):
        features = list(model.feature_names_in_)
    else:
        features = [c for c in df.select_dtypes(include=[np.number]).columns if c not in ['shares','shares_log','viral']]
    # The model predicts log1p(shares); expm1 maps back to the share scale.
    df['pred_shares'] = np.expm1(model.predict(df[features].fillna(0)))

    # scatter predicted vs actual (log scale), capped at sample_n points
    plot_df = df
    if sample_n is not None and 0 < sample_n < len(df):
        plot_df = df.sample(n=sample_n, random_state=SEED)
    plt.figure(figsize=(7,5))
    sns.scatterplot(x=np.log1p(plot_df['shares']), y=np.log1p(plot_df['pred_shares']), alpha=0.6)
    plt.xlabel('Actual shares (log1p)')
    plt.ylabel('Predicted shares (log1p)')
    plt.title('Predicted vs Actual (log1p)')
    plt.savefig(FIGURES / 'pred_vs_actual_log.png')
    plt.close()

    # residuals histogram
    df['residual'] = np.log1p(df['pred_shares']) - np.log1p(df['shares'])
    plt.figure(figsize=(7,4))
    sns.histplot(df['residual'], bins=60)
    plt.title('Residuals (pred - actual) on log scale')
    plt.savefig(FIGURES / 'residuals_log.png')
    plt.close()
    print('Saved dataset prediction plots to', FIGURES)

def run_scenario_simulations(base_record: dict, artifacts=artifacts, out_fig=FIGURES / 'scenario_simulations.png'):
    """Predict shares for a few deterministic variants of ``base_record`` and plot them.

    Variants: ``base``, ``long_headline`` (ten extra words appended),
    ``more_keywords`` (+10 keywords), ``short_content`` (a tenth of the
    content length, at least 10) and ``all_caps`` (upper-cased title).
    A variant whose prediction fails is kept with ``pred_shares=None`` and the
    error text, and is drawn as 0 in the bar chart.

    Args:
        base_record: Raw record dict accepted by ``predict_from_dict``.
        artifacts: Dict as returned by ``load_artifacts()``.
        out_fig: Path of the bar chart to write.

    Returns:
        DataFrame indexed by scenario name with the predictions.
    """
    base = base_record.copy()
    variants = [('base', base)]

    # longer headline: ten separate extra words, so the word count really grows
    long_h = base.copy()
    if 'title' in long_h:
        long_h['title'] = long_h['title'] + ' ' + ' '.join(['world'] * 10)
    variants.append(('long_headline', long_h))

    # more keywords
    more_k = base.copy()
    more_k['num_keywords'] = int(base.get('num_keywords', 0)) + 10
    variants.append(('more_keywords', more_k))

    # shorter content
    short_c = base.copy()
    short_c['n_tokens_content'] = max(10, int(base.get('n_tokens_content', 100))//10)
    variants.append(('short_content', short_c))

    # all caps
    caps = base.copy()
    if 'title' in caps:
        caps['title'] = caps['title'].upper()
    variants.append(('all_caps', caps))

    results = []
    for name, rec in variants:
        try:
            out = predict_from_dict(rec, artifacts=artifacts, return_raw=True)
            results.append({'scenario': name, 'pred_shares': out.get('pred_shares', None), 'pred_log': out.get('pred_log', None)})
        except Exception as e:
            results.append({'scenario': name, 'pred_shares': None, 'error': str(e)})
    res_df = pd.DataFrame(results).set_index('scenario')

    # plot; coerce to numeric first so failed scenarios (None) become 0
    # instead of leaving an object-dtype column
    heights = pd.to_numeric(res_df['pred_shares'], errors='coerce').fillna(0)
    plt.figure(figsize=(8,5))
    sns.barplot(x=res_df.index, y=heights)
    plt.ylabel('Predicted shares')
    plt.title('Scenario simulation: predicted shares by variant')
    plt.xticks(rotation=30)
    plt.tight_layout()
    plt.savefig(out_fig)
    plt.close()
    print('Saved scenario simulation figure to', out_fig)
    return res_df

# Example simulation (improved) and dataset plots if possible
# Hand-written demo record; keys follow the raw dataset's column names.
example = {
    'title': 'Breaking: Local team wins 3-1 in thrilling match',
    'n_tokens_title': 7,
    'n_tokens_content': 200,
    'num_keywords': 5,
}
try:
    example_prediction = predict_from_dict(example)
    print('Example prediction:', example_prediction)
    run_scenario_simulations(example)
    plot_predictions_vs_actual()
except Exception as demo_error:
    # The demo is best-effort: with no trained model on disk it only reports why.
    print('Prediction or plotting failed:', demo_error)

## Diagnostics: holes and missing artifacts (programmatic)

This cell programmatically checks for files we expect and prints a compact 'holes' list with suggestions.

In [None]:
def check_holes():
    """Print which expected pipeline artifacts are missing from disk.

    Also prints a suggestion to run SHAP when the random forest has been
    saved but no SHAP explainer file exists.
    """
    expected = {
        'raw_csv': DATA_RAW / 'OnlineNewsPopularity.csv',
        'cleaned_with_features': DATA_PROCESSED / 'cleaned_with_features.csv',
        'features_complete': DATA_PROCESSED / 'features_complete.csv',
        'tfidf_vectorizer': MODELS_DIR / 'tfidf_vectorizer.joblib',
        'tfidf_pca': MODELS_DIR / 'tfidf_pca.joblib',
        'best_rf': MODELS_DIR / 'best_random_forest.joblib',
        'mlp_model': MODELS_DIR / 'mlp_regressor.keras'
    }

    missing = {}
    for label, location in expected.items():
        if not location.exists():
            missing[label] = str(location)

    if not missing:
        print('All expected artifacts present')
    else:
        print('Missing artifacts:')
        for label, location in missing.items():
            print(f' - {label}: {location}')

    # quick suggestions
    rf_saved = (MODELS_DIR / 'best_random_forest.joblib').exists()
    if rf_saved and not (MODELS_DIR / 'shap_explainer.joblib').exists():
        print('Suggestion: run SHAP explainability (SHAP not found)')

check_holes()

## Final notes and next steps

- This notebook provides a linear, runnable pipeline. Use `TRAIN_WORD2VEC=True` to enable Word2Vec (slow) and set `TRAIN_RNN=True` if you want to train an LSTM/GRU (requires GPU for speed).
- Missing artifacts are listed in the Diagnostics cell; typical quick fixes: run Step 1 & Step 2, then Step 4–5.
- Recommended next work items: normalize column names in source scripts, pin `requirements.txt`, add unit tests for the transformer functions.

Thank you — run the notebook top-to-bottom in a fresh environment after installing packages in `requirements.txt`. Use `skip_heavy=True` in the `run_all` cell to avoid long-running steps during quick demos.