
# Latent Dirichlet Allocation (LDA) Modeling 📊

This notebook explores a sample of the data to try different topic modeling approaches.

#### Notebook Properties
* Upstream Notebook: `src.engineering.word_counts_and_sentiments`
* Compute Resources: `32 GB RAM, 4 CPUs`
* Last Updated: `Nov 28 2023`

#### Data

| **Name** | **Type** | **Location Type** | **Description** | **Location** | 
| --- | --- | --- | --- | --- | 
| `all_the_news_wc_sentiment` | `input` | `Delta` | `AllTheNews` articles with word counts (WC) and sentiment scores assigned | `catalog/text_eda/all_the_news.delta` | 

In [0]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import plotly.express as px
from tqdm.autonotebook import tqdm

from gensim import corpora, models
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
import nltk

from deltalake import DeltaTable
from src.utils.io import FileSystemHandler

In [0]:
# Notebook-wide display and plotting configuration.
pd.options.display.max_columns = None  # show every column when a frame is displayed
pd.set_option("plotting.backend", "plotly")

# English stopword list from NLTK, held as a set for membership tests.
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

# Enable the tqdm integration for pandas (used later via `progress_apply`).
tqdm.pandas()

# File-system handler used to read the input tables; constructed with "s3".
datafs = FileSystemHandler("s3")

In [0]:
# Notebook parameters: how much of the input table to read, and which table to read.
# LIMIT_PARTITIONS is passed as `N_partitions` and SHUFFLE_PARTITIONS as
# `shuffle_partitions` to `datafs.read_delta_partitions` in the "Read Data" cell.
LIMIT_PARTITIONS: int | None = None
"""An input parameter to limit the number of table partitions to read from delta. Useful to perform EDA on a sample of data."""

SHUFFLE_PARTITIONS: bool = False
"""Whether to randomize the partitions before reading"""

# Table and catalog names passed to `datafs.read_delta` (`table=` / `catalog_name=`).
INPUT_TABLE: str = "all_the_news" 
INPUT_CATALOG: str = "text_eda"


### Read Data

In [0]:
# Open the Delta table without converting it to pandas (as_pandas=False).
atn_delta_table: DeltaTable = datafs.read_delta(
    catalog_name=INPUT_CATALOG,
    table=INPUT_TABLE,
    as_pandas=False,
)

# Load the requested partitions into a pandas frame.
df: pd.DataFrame = datafs.read_delta_partitions(
    delta_table=atn_delta_table,
    shuffle_partitions=SHUFFLE_PARTITIONS,
    N_partitions=LIMIT_PARTITIONS,
)

# Parse dates, keep only rows strictly before the cutoff, and order chronologically.
df["date"] = pd.to_datetime(df["date"])
date_cutoff = pd.Timestamp("2020-04-01")
df = df.loc[df["date"] < date_cutoff].sort_values(by="date")

print(df.shape)
df.head()

In [0]:
# Work on a one-month sample: June 2019.
# `.copy()` makes `sample_df` an independent frame. New columns are assigned to it
# in later cells; without the copy pandas treats it as derived from `df` and raises
# SettingWithCopyWarning on those assignments (hidden here by the global warnings filter).
sample_df = df[(df.year == 2019) & (df.month == 6)].copy()
print(sample_df.shape)
sample_df.head()

In [0]:
def preprocess_tokenize(text: str) -> list[str]:
    """Tokenize `text` with `simple_preprocess` and drop stopwords.

    Args:
        text: Raw text to tokenize.

    Returns:
        The tokens produced by `simple_preprocess`, in their original order,
        excluding every token present in the module-level `stop_words` set.
    """
    kept_tokens: list[str] = []
    for word in simple_preprocess(text):
        if word in stop_words:
            continue
        kept_tokens.append(word)
    return kept_tokens


# Tokenize titles. Rows with a missing title are skipped; the assignment aligns on
# the index, so those rows end up as NaN in the new column.
titles_present = sample_df["title"].dropna()
sample_df["title_pt"] = titles_present.apply(preprocess_tokenize)

# Same for the article bodies, with a progress bar because this is the slow step.
articles_present = sample_df["article"].dropna()
sample_df["article_pt"] = articles_present.progress_apply(preprocess_tokenize)

In [0]:
# Preview the first five rows of the sample.
sample_df.head(n=5)

In [0]:
# Ten most frequent tokens across all article bodies.
article_token_counts = sample_df["article_pt"].explode().value_counts()
article_token_counts.head(10)

In [0]:
# Ten most frequent tokens across all titles.
title_token_counts = sample_df["title_pt"].explode().value_counts()
title_token_counts.head(10)

In [0]:
# Dictionary and bag-of-words corpus for LDA.

# Tokenized articles with missing rows removed. Computed once and reused so the
# dictionary and the corpus are built from exactly the same documents.
article_docs = sample_df["article_pt"].dropna()

# Create a dictionary representation of the documents (token -> integer id).
dictionary: corpora.Dictionary = corpora.Dictionary(article_docs)

# Prune the vocabulary:
#   no_below: keep tokens that appear in at least this many documents
#   no_above: keep tokens that appear in no more than this fraction of documents
#   keep_n:   after the two filters above, keep only the `keep_n` most frequent tokens
dictionary.filter_extremes(no_below=10, no_above=0.1, keep_n=100_000)

# Create a corpus: one bag of words, i.e. a list of (token_id, count) pairs, per document.
corpus: list[list[tuple[int, int]]] = [
    dictionary.doc2bow(doc) for doc in article_docs
]

In [0]:
# Number of topics the model is asked to find.
num_topics = 50

# Seed for the model's random number generator so repeated runs start from the same
# state. NOTE(review): with several worker processes the result can still vary a
# little between runs because of how the OS schedules the workers.
LDA_RANDOM_STATE = 42

lda_model = models.LdaMulticore(
    corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=2,
    workers=2,
    random_state=LDA_RANDOM_STATE,
)

In [0]:
# Top 50 terms of every topic: one column per topic (labelled with the topic id as a
# string), one row per term rank.
topic_words = pd.DataFrame(
    {
        str(topic_id): [
            dictionary[term_id]
            for term_id, _ in lda_model.get_topic_terms(topic_id, 50)
        ]
        for topic_id in range(num_topics)
    }
)

topic_words.head()

In [0]:
# Function to get the top topic for each document
def format_topics_sentences(
    ldamodel=None, corpus=corpus, texts=sample_df["article"]
) -> pd.DataFrame:
    """Return the dominant topic of every document in `corpus`, next to its text.

    Args:
        ldamodel: Trained topic model. It must support `ldamodel[corpus]` (yielding,
            per document, a list of `(topic_id, weight)` pairs) and
            `ldamodel.show_topic(topic_id)` (yielding `(word, weight)` pairs).
        corpus: Bag-of-words documents to score. The default is captured from the
            notebook-level `corpus` at the moment this cell is executed.
        texts: One text per document, in the same order as `corpus`. The default is
            captured from `sample_df["article"]` at the moment this cell is executed.

    Returns:
        A DataFrame with a 0..n-1 index and the columns `Dominant_Topic`,
        `Perc_Contribution` and `Topic_Keywords`, followed by the `texts` column.
        Rows are matched to `texts` by position, not by index label.

    Raises:
        TypeError: If `ldamodel` is not provided.
    """
    if ldamodel is None:
        raise TypeError("format_topics_sentences() requires a trained `ldamodel`")

    # Keyword string per topic, built once per topic instead of once per document.
    keywords_by_topic: dict[int, str] = {}
    # One (topic, weight, keywords) record per document; the frame is built in a
    # single step afterwards (`DataFrame.append` no longer exists in pandas 2.x and
    # growing a frame row by row is quadratic).
    records = []

    for doc_topics in tqdm(ldamodel[corpus]):
        if not doc_topics:
            # Keep a placeholder row so later documents stay aligned with `texts`.
            records.append((None, None, None))
            continue

        # Dominant topic = highest weight; on ties the first one listed wins.
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        topic_num = int(topic_num)

        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )

        records.append(
            (topic_num, round(prop_topic, 4), keywords_by_topic[topic_num])
        )

    sent_topics_df = pd.DataFrame(
        records, columns=["Dominant_Topic", "Perc_Contribution", "Topic_Keywords"]
    )

    # Add original text to the end of the output. `pd.concat(axis=1)` aligns on index
    # labels, so drop the labels `texts` carries from its source frame and line the
    # texts up with the 0..n-1 document positions instead.
    contents = pd.Series(texts).reset_index(drop=True)
    return pd.concat([sent_topics_df, contents], axis=1)


# `corpus` was built from the non-null rows of `article_pt`, so pass those same
# non-null rows as `texts`: one text per corpus document, in the same order.
df_topic_sents_keywords = format_topics_sentences(
    ldamodel=lda_model, corpus=corpus, texts=sample_df["article_pt"].dropna()
)

# Format: turn the index into a document-number column and give every column a
# presentation name.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = [
    "Document_No",
    "Dominant_Topic",
    "Topic_Perc_Contrib",
    "Keywords",
    "Text",
]

# Show
df_dominant_topic.head()