In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from gensim.models import ldamodel
from tqdm import tqdm
from gensim import corpora

from src.utils.recovery_analysis_utils import str_to_list
from src.utils.find_video_categories_utils import *

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

# Apply LDA to annotate the videos from before and after the start of each decline with topic categories

### Load and preprocess the whole dataset

In [None]:
# Load the decline events and the videos through the project helper.
decline_events, videos = load_data()
print(videos)
# Replace the loaded decline events with their processed version (see process_data).
decline_events = process_data(decline_events)

In [None]:


# Build the tags DataFrame from the decline events and the videos (project helper).
df_tags = create_tags_dataframe(decline_events, videos)

print(df_tags)

print("Tokenizing and lemmatizing tags")
# Preprocess the whole 'Tags_combined' column in a single pass instead of
# iterrows() + .at: this avoids building a Series for every row and does not
# depend on the index labels being unique. tqdm still reports progress.
tokenized_tags = [
    preprocess_str(tags)
    for tags in tqdm(df_tags['Tags_combined'], total=df_tags.shape[0])
]
# The explicit object dtype and the reused index keep exactly one
# preprocess_str() result per cell, aligned positionally with df_tags.
df_tags['Tokens'] = pd.Series(tokenized_tags, index=df_tags.index, dtype=object)


### Train the LDA model to get the topics

In [None]:
# Create a dictionary and a corpus for the LDA model
print("Creating dictionary and corpus")
dictionary, corpus = create_dictionary_and_corpus(df_tags)

# Model hyper-parameters, named so they can be tuned in one place.
NUM_TOPICS = 20
NUM_PASSES = 15
NUM_WORDS_PER_TOPIC = 15
# LDA training is stochastic: a fixed seed makes the topics, and everything
# derived from them later in the notebook, reproducible across runs.
RANDOM_SEED = 42

print("Training LDA model")
lda = ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=NUM_PASSES,
    random_state=RANDOM_SEED,
)

# num_topics=-1 lists every topic; by default print_topics() shows at most 20,
# which would silently hide topics if NUM_TOPICS is raised above 20.
topics = lda.print_topics(num_topics=-1, num_words=NUM_WORDS_PER_TOPIC)
for topic in topics:
    print(topic)

In [None]:
# Save the topics

# Request every topic explicitly (num_topics=-1): print_topics() returns at most
# 20 topics by default, so a model trained with more topics would otherwise be
# written to disk incomplete.
topics = lda.print_topics(num_topics=-1, num_words=15)

# Create a DataFrame from the topics: one record per (topic id, words) pair.
topics_data = [{"Topic": topic_id, "Words": words} for topic_id, words in topics]

topics_df = pd.DataFrame(topics_data)

# Keep the path in one place so the confirmation message always matches the
# file that was actually written.
topics_path = "data/lda_topics.csv"
topics_df.to_csv(topics_path, index=False)
print(f"Topics saved to {topics_path}")

### Assign the topics to the videos before and after the start of the declines

In [None]:
# Run the project helper on every token entry; each result is unpacked below
# into a dominant topic and its probability.
topic_assignments = df_tags['Tokens'].apply(
    lambda row_tokens: assign_dominant_topic(row_tokens, lda, dictionary)
)

# Transpose the per-row results into one sequence per output column.
dominant_topics, topic_probabilities = zip(*topic_assignments)
df_tags['Dominant_Topic'] = dominant_topics
df_tags['Topic_Probability'] = topic_probabilities

# Preview the first rows, then persist the annotated frame.
print(df_tags.head(5))
df_tags.to_csv('data/df_small_sample.csv', index=False)

In [None]:
# Score the trained LDA model with the project's coherence helper and report it.
coherence_lda = calculate_coherence(lda, df_tags, dictionary)
print(f'Coherence Score: {coherence_lda}')
# Scores noted from earlier runs on the whole df: 0.6525 with 55 topics, 0.5991 with 50.