In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from gensim.models import ldamodel
from tqdm import tqdm
from gensim import corpora

from src.utils.recovery_analysis_utils import str_to_list
from src.utils.find_video_categories_utils import *

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Apply LDA to annotate the videos before and after the start of declines with categories

### Load and preprocess the whole dataset

In [6]:
decline_events, videos = load_data()
print(videos)
decline_events = process_data(decline_events)

                          channel  week  \
0        UCzWm1-4XF7AHxVUTkHCM1uw   227   
1        UCzWm1-4XF7AHxVUTkHCM1uw   226   
2        UCzWm1-4XF7AHxVUTkHCM1uw   224   
3        UCzWm1-4XF7AHxVUTkHCM1uw   224   
4        UCzWm1-4XF7AHxVUTkHCM1uw   223   
...                           ...   ...   
1905541  UCrwE8kVqtIUVUzKui2WVpuQ   109   
1905542  UCrwE8kVqtIUVUzKui2WVpuQ   109   
1905543  UCrwE8kVqtIUVUzKui2WVpuQ   109   
1905544  UCrwE8kVqtIUVUzKui2WVpuQ   109   
1905545  UCrwE8kVqtIUVUzKui2WVpuQ   109   

                                                      tags  duration  
0        video,games,retrogamer3,ed,findlay,Scam,Steam,...       384  
1        video,games,retrogamer3,ed,findlay,Trump,Ameri...       270  
2        video,games,retrogamer3,ed,findlay,America's R...       109  
3                               MTG Arena War of the Spark      5154  
4        video,games,retrogamer3,ed,findlay,Mpow,Headph...       475  
...                                                    ..

In [7]:


df_tags = create_tags_dataframe(decline_events, videos)

print(df_tags)

print("Tokenizing and lemmatizing tags")
df_tags['Tokens'] = None
for index, row in tqdm(df_tags.iterrows(), total=df_tags.shape[0]):
    df_tags.at[index, 'Tokens'] = preprocess_str(row['Tags_combined'])


                                                    Tags_combined
Decline Source                                                   
0       After   MsRosieBea,uni,uni life,first year of uni,thir...
        Before  MsRosieBea,red lip,get ready with me\nMsRosieB...
1       After   hollow,generationhollow,the surge,surge,robo s...
        Before  hollow,generationhollow,paragon,gameplay,alpha...
2       After                                                None
...                                                           ...
36595   Before  Despacito accordion cover,Fonsi Despacito acco...
36597   After   #patriotattitude,#whenhellfreezesover,Keith Fe...
        Before  Audi,Audi 2.1,Line Bore,Kenax,Line bore Kenax ...
36598   After   Music,beats,instrumental,right beat radio,mell...
        Before  Music,beats,instrumental,right beat radio,mell...

[61194 rows x 1 columns]
Tokenizing and lemmatizing tags


100%|██████████| 61194/61194 [03:13<00:00, 316.42it/s]


### Train the LDA model to get the topics

In [8]:
# Create a dictionary and a corpus for the LDA model
print("Creating dictionary and corpus")
dictionary, corpus = create_dictionary_and_corpus(df_tags) 

print("Training LDA model")
lda = ldamodel.LdaModel(corpus, num_topics=20, id2word=dictionary, passes=15)

topics = lda.print_topics(num_words=15)
for topic in topics:
    print(topic)

Creating dictionary and corpus
Training LDA model
(0, '0.033*"makeup" + 0.019*"haul" + 0.015*"hair" + 0.013*"beauty" + 0.011*"fashion" + 0.010*"tutorial" + 0.009*"day" + 0.008*"recipe" + 0.008*"food" + 0.007*"review" + 0.007*"style" + 0.007*"routine" + 0.006*"home" + 0.006*"vegan" + 0.006*"tip"')
(1, '0.025*"funny" + 0.023*"family" + 0.022*"vlog" + 0.020*"video" + 0.012*"prank" + 0.011*"vlogs" + 0.010*"life" + 0.010*"challenge" + 0.009*"girl" + 0.009*"daily" + 0.009*"comedy" + 0.009*"youtube" + 0.007*"baby" + 0.007*"reaction" + 0.006*"vlogger"')
(2, '0.050*"music" + 0.042*"beat" + 0.030*"type" + 0.018*"dance" + 0.016*"free" + 0.015*"trap" + 0.015*"song" + 0.013*"2018" + 0.012*"cover" + 0.012*"rap" + 0.010*"remix" + 0.010*"hop" + 0.010*"new" + 0.009*"hip" + 0.009*"lyric"')
(3, '0.061*"game" + 0.030*"gameplay" + 0.026*"play" + 0.021*"let" + 0.019*"minecraft" + 0.018*"\'s" + 0.013*"gaming" + 0.012*"walkthrough" + 0.011*"mod" + 0.010*"part" + 0.010*"pc" + 0.009*"video" + 0.009*"ark" + 0.00

In [9]:
# Save the topics

topics = lda.print_topics(num_words=15)

# Create a DataFrame from the topics
topics_data = []
for topic_id, topic in topics:
    topics_data.append({"Topic": topic_id, "Words": topic})

topics_df = pd.DataFrame(topics_data)

topics_df.to_csv("data/lda_topics.csv", index=False)
print("Topics saved to lda_topics.csv")

Topics saved to lda_topics.csv


### Assign the topics to the videos before and after

In [10]:
df_tags['Dominant_Topic'], df_tags['Topic_Probability'] = zip(
    *df_tags['Tokens'].apply(lambda tokens: assign_dominant_topic(tokens, lda, dictionary))
)

print(df_tags.head(5))
df_tags.to_csv('data/df_small_sample.csv', index=False)

                                                    Tags_combined  \
Decline Source                                                      
0       After   MsRosieBea,uni,uni life,first year of uni,thir...   
        Before  MsRosieBea,red lip,get ready with me\nMsRosieB...   
1       After   hollow,generationhollow,the surge,surge,robo s...   
        Before  hollow,generationhollow,paragon,gameplay,alpha...   
2       After                                                None   

                                                           Tokens  \
Decline Source                                                      
0       After   [msrosiebea, uni, uni, life, first, year, uni,...   
        Before  [msrosiebea, red, lip, get, ready, me\nmsrosie...   
1       After   [hollow, generationhollow, surge, surge, robo,...   
        Before  [hollow, generationhollow, paragon, gameplay, ...   
2       After                                                  []   

                Dominant_Topic  

In [11]:
coherence_lda = calculate_coherence(lda, df_tags, dictionary)
print(f'Coherence Score: {coherence_lda}')
# whole df: 0.6525 with 55 topics, 0.5991 with 50, 

Coherence Score: 0.5815071105454963
