
# Neural Network Models - Topics 🧠

This notebook trains the topic classifier with simple (non-transformer) neural network models: a bidirectional LSTM and a 1D convolutional network.

#### Notebook Properties
* Upstream Notebook: `src.engineering.topic_processor`
* Compute Resources: `32 GB RAM, 1 GPU` (estimate; not yet verified)
* Last Updated: `Dec 10 2023`

#### Data

| **Name** | **Type** | **Location Type** | **Description** | **Location** | 
| --- | --- | --- | --- | --- | 
| `all_the_news` | `input` | `Delta` | Read full delta dataset of `AllTheNews` | `catalog/simple_topic/all_the_news.delta` | 

In [0]:
import warnings

warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import json
import re
from datetime import datetime
import os
from typing import Any, Callable
from loguru import logger
import random

from deltalake import DeltaTable
from tqdm.autonotebook import tqdm
from src.utils.io import FileSystemHandler
from src.utils.functions import all_stopwords

import nltk

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional,Conv1D,GlobalMaxPooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score

In [0]:
# Check for GPU availability
gpus = tf.config.experimental.list_physical_devices('GPU')

# Start from the default strategy (works on CPU and on a single GPU) so that
# `strategy` is always bound, even when the GPU configuration below fails.
strategy = tf.distribute.get_strategy()

if gpus:
    try:
        # MirroredStrategy uses every visible GPU, and TensorFlow requires the
        # memory-growth setting to be identical across GPUs, so set it on all
        # of them rather than only on the first one.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        strategy = tf.distribute.MirroredStrategy()
        print("Running on GPU:", gpus)
    except RuntimeError as e:
        # Memory growth cannot be changed once the GPUs have been initialised.
        # Report the problem and keep the default strategy chosen above.
        print(f"GPU setup failed, falling back to the default strategy: {e}")
else:
    print("Running on CPU")

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [0]:
# Show every column when a DataFrame is displayed.
pd.set_option("display.max_columns", None)
# Raise the row limit below which DataFrame.info() still computes null counts.
pd.set_option("display.max_info_rows", 10_000_000)

# Route DataFrame/Series .plot() calls through plotly.
pd.options.plotting.backend = "plotly"

# Register tqdm's `progress_apply` on pandas objects (used by the cleaning cells).
tqdm.pandas()
# Download the WordNet corpora; presumably needed by the WordNetLemmatizer
# created later in this notebook — TODO confirm that omw-1.4 is required.
nltk.download("wordnet")
nltk.download('omw-1.4')
# Project I/O helper used to read the Delta table; the "s3" argument presumably
# selects an S3 backend — verify against src.utils.io.
datafs = FileSystemHandler("s3")

In [0]:
# Notebook input parameters. The bare strings below document the constant
# directly above them.
LIMIT_PARTITIONS: int | None = None
"""An input parameter to limit the number of table partitions to read from delta. Useful to perform EDA on a sample of data."""

SHUFFLE_PARTITIONS: bool = False
"""Whether to randomize the partitions before reading"""

# Table name and catalog name passed to `datafs.read_delta` when loading the data.
INPUT_TABLE: str = "all_the_news" 
INPUT_CATALOG: str = "simple_topic"


### Read Data

In [0]:
# Get a handle on the Delta table (not yet materialised as pandas).
atn_delta_table: DeltaTable = datafs.read_delta(
    catalog_name=INPUT_CATALOG,
    table=INPUT_TABLE,
    as_pandas=False,
)

# Read the requested partitions, then parse the dates, order the rows
# chronologically and keep only the columns used by this notebook.
df: pd.DataFrame = datafs.read_delta_partitions(
    delta_table=atn_delta_table,
    shuffle_partitions=SHUFFLE_PARTITIONS,
    N_partitions=LIMIT_PARTITIONS,
)
df = (
    df.assign(date=pd.to_datetime(df["date"]))
    .sort_values(by="date")
    .loc[
        :,
        [
            "date",
            "publication",
            "author",
            "title",
            "article",
            "section",
            "simple_topic",
        ],
    ]
)

print(df.shape)
df.head()

In [0]:
# Summarise column dtypes and non-null counts of the loaded frame.
df.info()


### Basic Preprocessing

* Filtering rows to a single year
* Cleaning articles and titles: lower-casing, replacing non-word characters with spaces, removing stopwords, and lemmatizing

In [0]:
# Keep only the 2019 articles. `.copy()` makes `df_y` an independent frame, so
# the columns added to it in later cells are real assignments instead of
# chained assignments on a slice of `df`.
# To train on every year instead, replace the filter with `df.copy()`.
df_y = df[df.date.dt.year == 2019].copy()
print(df_y.shape)

In [0]:
lemmatizer = nltk.stem.WordNetLemmatizer()


def preprocess_text(x: str) -> str:
    """Normalise a piece of text for tokenisation.

    The text is lower-cased, every non-word character is replaced by a space,
    whitespace runs are collapsed, tokens found in `all_stopwords` are removed
    and the remaining tokens are lemmatised with the module-level `lemmatizer`.

    Args:
        x: Raw text.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    normalised = re.sub(r"\s+", " ", re.sub(r"\W", " ", x.lower()))
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in normalised.split()
        if token not in all_stopwords
    ]
    return " ".join(lemmas)

In [0]:
# Clean only the non-null titles. The assignment aligns on the index, so rows
# whose title is missing end up with NaN in `title_clean`.
df_y["title_clean"] = df_y["title"].dropna().progress_apply(preprocess_text)
# Spot-check a few raw/cleaned pairs.
df_y[["title", "title_clean"]].sample(5)

In [0]:
# Clean only the non-null articles. The assignment aligns on the index, so rows
# whose article is missing end up with NaN in `article_clean`.
df_y["article_clean"] = df_y["article"].dropna().progress_apply(preprocess_text)
# Spot-check a few raw/cleaned pairs.
df_y[["article", "article_clean"]].sample(5)

In [0]:
# Build the topic <-> integer id lookups from the labelled rows only.
# Missing topics are dropped before enumerating so that NaN never receives a
# class id of its own: rows without a topic then map to NaN in
# `simple_topic_id` and can be removed with `dropna(subset=["simple_topic_id"])`.
topic_to_id = {
    topic: topic_id
    for topic_id, topic in enumerate(df["simple_topic"].dropna().unique())
}
id_to_topic = {topic_id: topic for topic, topic_id in topic_to_id.items()}

df_y["simple_topic_id"] = df_y["simple_topic"].map(topic_to_id)
df_y[["simple_topic", "simple_topic_id"]].sample(5)

In [0]:
# Discard rows that lack either the cleaned article text or a topic id.
df_y = df_y.dropna(subset=["article_clean", "simple_topic_id"])
print(df_y.shape)

In [0]:
# Path components of the output directory used by the training functions:
# ./<BASE_DIR_NAME>/<NOTEBOOK_DIR_NAME>/<model name>/<UTC timestamp>
BASE_DIR_NAME: str = "experiment_results"
NOTEBOOK_DIR_NAME: str = "nn_models"

In [0]:
def train_bidirectional_lstm(
    df: pd.DataFrame,
    input_col: str,
    target_col: str,
    sample_size: int | None = None,
    target_col_inverse_mapping: dict | None = None,
    max_len: int = 32,
    num_words: int = 10_000,
    embedding_output_dim: int = 128,
    lstm_allowed_units: list[int] = [32, 64, 128],
    learning_rate: float = 0.001,
    batch_size: int = 32,
    epochs: int = 5,
):
    """Train and evaluate a two-layer bidirectional LSTM text classifier.

    The function samples rows from `df`, splits them 80/20 into train and test
    sets, fits a Keras `Tokenizer` on the training texts, trains the model and
    writes `classification_report.txt` and `hyperparameters.json` into
    `./<BASE_DIR_NAME>/<NOTEBOOK_DIR_NAME>/bid_lstm_article/<UTC timestamp>`.

    Args:
        df: Frame holding the input text and target columns.
        input_col: Name of the text column fed to the tokenizer.
        target_col: Name of the label column. Labels may be arbitrary
            (non-contiguous) values; they are re-encoded internally.
        sample_size: Number of rows to sample; a falsy value uses all rows
            (shuffled).
        target_col_inverse_mapping: Optional mapping from label value to a
            display name for the report. Labels without an entry, or all
            labels when this is `None`, are shown as `str(label)`.
        max_len: Sequence length after padding/truncation.
        num_words: Tokenizer vocabulary size and embedding input dimension.
        embedding_output_dim: Embedding vector size.
        lstm_allowed_units: Candidate sizes from which the two LSTM layers and
            the dense layer each draw their unit count at random. The list is
            only read, never mutated.
        learning_rate: Adam learning rate.
        batch_size: Training batch size.
        epochs: Number of training epochs.

    Returns:
        None. Results are written to disk.
    """
    title_topic_dir_name: str = (
        f"./{BASE_DIR_NAME}"
        + f"/{NOTEBOOK_DIR_NAME}"
        + f"/bid_lstm_article"
        + f"/{datetime.utcnow().strftime('%Y%m%d-%H%M')}"
    )
    os.makedirs(title_topic_dir_name, exist_ok=True)

    sample_size = sample_size if sample_size else len(df)
    sample_df: pd.DataFrame = df.sample(sample_size)

    input_values: np.ndarray = sample_df[input_col].values
    target_values: np.ndarray = sample_df[target_col].values

    # Re-encode the labels as contiguous indices 0..C-1. The softmax layer has
    # one unit per distinct label, and sparse categorical cross-entropy needs
    # every label to be a valid unit index; raw ids can have gaps (for example
    # when a topic is absent from the sample).
    class_labels, encoded_targets = np.unique(target_values, return_inverse=True)
    num_classes: int = len(class_labels)

    # Split the raw texts first so the tokenizer vocabulary is learned from the
    # training portion only and nothing from the test set leaks into it.
    train_texts, test_texts, y_train, y_test = train_test_split(
        input_values,
        encoded_targets,
        test_size=0.2,
        random_state=50,
    )

    tokenizer = Tokenizer(num_words=num_words, oov_token="<OOV>")
    tokenizer.fit_on_texts(train_texts)

    X_train = pad_sequences(
        tokenizer.texts_to_sequences(train_texts), padding="post", maxlen=max_len
    )
    X_test = pad_sequences(
        tokenizer.texts_to_sequences(test_texts), padding="post", maxlen=max_len
    )

    # Layer sizes are drawn at random and recorded in hyperparameters.json.
    lstm_1_units: int = random.choice(lstm_allowed_units)
    lstm_2_units: int = random.choice(lstm_allowed_units)
    dense_units: int = random.choice(lstm_allowed_units)

    model = Sequential(
        [
            Embedding(
                input_dim=num_words,
                output_dim=embedding_output_dim,
                input_length=max_len,
            ),
            Bidirectional(LSTM(units=lstm_1_units, return_sequences=True)),
            Bidirectional(LSTM(units=lstm_2_units)),
            Dense(dense_units, activation="relu"),
            Dropout(0.5),
            Dense(num_classes, activation="softmax"),
        ]
    )

    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        metrics=["accuracy"],
    )

    model.fit(
        X_train,
        y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(X_test, y_test),
    )

    y_pred = np.argmax(model.predict(X_test), axis=1)

    # Report only the classes that occur in the test labels or predictions.
    # Passing `labels` explicitly keeps it the same length and order as
    # `target_names`, even when a label has no display name.
    present_classes = np.union1d(y_test, y_pred)
    inverse_mapping: dict = target_col_inverse_mapping or {}
    target_names = [
        str(inverse_mapping.get(class_labels[i], class_labels[i]))
        for i in present_classes
    ]

    target_classification_report = classification_report(
        y_test,
        y_pred,
        labels=present_classes,
        target_names=target_names,
    )

    with open(f"{title_topic_dir_name}/classification_report.txt", "w") as f:
        f.write(target_classification_report)

    output_params = dict(
        max_len=max_len,
        num_words=num_words,
        embedding_output_dim=embedding_output_dim,
        learning_rate=learning_rate,
        batch_size=batch_size,
        epochs=epochs,
        lstm_1_units=lstm_1_units,
        lstm_2_units=lstm_2_units,
        dense_units=dense_units,
    )

    with open(f"{title_topic_dir_name}/hyperparameters.json", "w") as f:
        json.dump(output_params, f, indent=4)

In [0]:
# Train the bidirectional LSTM on the cleaned article text with the function's
# default hyperparameters; `id_to_topic` supplies the class names for the report.
train_bidirectional_lstm(
    df=df_y,
    input_col="article_clean",
    target_col="simple_topic_id",
    target_col_inverse_mapping=id_to_topic,
)

In [0]:
def train_convolutional_neural_network(
    df: pd.DataFrame,
    input_col: str,
    target_col: str,
    sample_size: int | None = None,
    target_col_inverse_mapping: dict | None = None,
    max_len: int = 20,
    num_words: int = 10_000,
    embedding_output_dim: int = 64,
    filter_sizes: list[int] = [64, 128],
    kernel_size: int = 5,
    learning_rate: float = 0.0001,
    batch_size: int = 64,
    epochs: int = 5,
):
    """Train and evaluate a 1D convolutional text classifier.

    The function samples rows from `df`, splits them 80/20 into train and test
    sets, fits a Keras `Tokenizer` on the training texts, trains the model and
    writes `classification_report.txt` and `hyperparameters.json` into
    `./<BASE_DIR_NAME>/<NOTEBOOK_DIR_NAME>/cnn/<UTC timestamp>`.

    Args:
        df: Frame holding the input text and target columns.
        input_col: Name of the text column fed to the tokenizer.
        target_col: Name of the label column. Labels may be arbitrary
            (non-contiguous) values; they are re-encoded internally.
        sample_size: Number of rows to sample; a falsy value uses all rows
            (shuffled).
        target_col_inverse_mapping: Optional mapping from label value to a
            display name for the report. Labels without an entry, or all
            labels when this is `None`, are shown as `str(label)`.
        max_len: Sequence length after padding/truncation.
        num_words: Tokenizer vocabulary size and embedding input dimension.
        embedding_output_dim: Embedding vector size.
        filter_sizes: Candidate filter counts; one is drawn at random for the
            Conv1D layer. The list is only read, never mutated.
        kernel_size: Conv1D kernel width.
        learning_rate: Adam learning rate.
        batch_size: Training batch size.
        epochs: Number of training epochs.

    Returns:
        None. Results are written to disk.
    """
    title_topic_dir_name: str = (
        f"./{BASE_DIR_NAME}"
        + f"/{NOTEBOOK_DIR_NAME}"
        + f"/cnn"
        + f"/{datetime.utcnow().strftime('%Y%m%d-%H%M')}"
    )
    os.makedirs(title_topic_dir_name, exist_ok=True)

    sample_size = sample_size if sample_size else len(df)
    sample_df: pd.DataFrame = df.sample(sample_size)

    input_values: np.ndarray = sample_df[input_col].values
    target_values: np.ndarray = sample_df[target_col].values

    # Re-encode the labels as contiguous indices 0..C-1. The softmax layer has
    # one unit per distinct label, and sparse categorical cross-entropy needs
    # every label to be a valid unit index; raw ids can have gaps (for example
    # when a topic is absent from the sample).
    class_labels, encoded_targets = np.unique(target_values, return_inverse=True)
    num_classes: int = len(class_labels)

    # Split the raw texts first so the tokenizer vocabulary is learned from the
    # training portion only and nothing from the test set leaks into it.
    train_texts, test_texts, y_train, y_test = train_test_split(
        input_values,
        encoded_targets,
        test_size=0.2,
        random_state=50,
    )

    tokenizer = Tokenizer(num_words=num_words, oov_token="<OOV>")
    tokenizer.fit_on_texts(train_texts)

    X_train = pad_sequences(
        tokenizer.texts_to_sequences(train_texts), padding="post", maxlen=max_len
    )
    X_test = pad_sequences(
        tokenizer.texts_to_sequences(test_texts), padding="post", maxlen=max_len
    )

    # Draw the filter count once so the value actually used can be recorded in
    # hyperparameters.json alongside the candidate list.
    conv_filters: int = random.choice(filter_sizes)

    model = Sequential(
        [
            Embedding(
                input_dim=num_words,
                output_dim=embedding_output_dim,
                input_length=max_len,
            ),
            Conv1D(
                filters=conv_filters,
                kernel_size=kernel_size,
                activation="relu",
            ),
            GlobalMaxPooling1D(),
            Dense(64, activation="relu"),
            Dropout(0.5),
            Dense(num_classes, activation="softmax"),
        ]
    )

    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        metrics=["accuracy"],
    )

    model.fit(
        X_train,
        y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(X_test, y_test),
    )

    y_pred = np.argmax(model.predict(X_test), axis=1)

    # Report only the classes that occur in the test labels or predictions.
    # Passing `labels` explicitly keeps it the same length and order as
    # `target_names`, even when a label has no display name.
    present_classes = np.union1d(y_test, y_pred)
    inverse_mapping: dict = target_col_inverse_mapping or {}
    target_names = [
        str(inverse_mapping.get(class_labels[i], class_labels[i]))
        for i in present_classes
    ]

    target_classification_report = classification_report(
        y_test,
        y_pred,
        labels=present_classes,
        target_names=target_names,
    )

    with open(f"{title_topic_dir_name}/classification_report.txt", "w") as f:
        f.write(target_classification_report)

    output_params = dict(
        max_len=max_len,
        num_words=num_words,
        embedding_output_dim=embedding_output_dim,
        filter_sizes=filter_sizes,
        filters=conv_filters,
        kernel_size=kernel_size,
        learning_rate=learning_rate,
        batch_size=batch_size,
        epochs=epochs,
    )

    with open(f"{title_topic_dir_name}/hyperparameters.json", "w") as f:
        json.dump(output_params, f, indent=4)

In [0]:
# Train the 1D CNN on the cleaned article text with the function's default
# hyperparameters; `id_to_topic` supplies the class names for the report.
train_convolutional_neural_network(
    df=df_y,
    input_col="article_clean",
    target_col="simple_topic_id",
    target_col_inverse_mapping=id_to_topic,
)