In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from gensim.models import ldamodel
from tqdm import tqdm
from gensim import corpora

from src.utils.recovery_analysis_utils import str_to_list
from src.utils.find_video_categories_utils import *

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

decline_events = pd.read_csv('data/sampled_decline_events_with_videos.csv')
videos = pd.read_csv('data/videos_around_declines.csv')

decline_events['Videos_before'] = decline_events['Videos_before'].apply(str_to_list)
decline_events['Videos_after'] = decline_events['Videos_after'].apply(str_to_list)

In [2]:
df_tags = create_tags_dataframe(decline_events, videos)

## Preprocessing the whole dataset

In [15]:
print("Tokenizing and lemmatizing tags")
df_tags['Tokens'] = None
for index, row in tqdm(df_tags.iterrows(), total=df_tags.shape[0]):
    df_tags.at[index, 'Tokens'] = preprocess_str(row['Tags_combined'])

Tokenizing and lemmatizing tags


100%|██████████| 61194/61194 [03:29<00:00, 292.74it/s]


In [16]:

# Create a dictionary and a corpus for the LDA model
print("Creating dictionary and corpus")
dictionary = corpora.Dictionary(df_tags['Tokens'])
corpus = [dictionary.doc2bow(token_list) for token_list in df_tags['Tokens']]

print("Training LDA model")
lda = ldamodel.LdaModel(corpus, num_topics=20, id2word=dictionary, passes=15)

topics = lda.print_topics(num_words=15)
for topic in topics:
    print(topic)

Creating dictionary and corpus
Training LDA model
(0, '0.023*"car" + 0.013*"golf" + 0.010*"train" + 0.009*"asmr" + 0.009*"travel" + 0.007*"crash" + 0.006*"racing" + 0.006*"dubai" + 0.006*"fishing" + 0.006*"bike" + 0.006*"race" + 0.006*"road" + 0.006*"park" + 0.005*"street" + 0.005*"tour"')
(1, '0.043*"gta" + 0.041*"v" + 0.039*"movie" + 0.038*"5" + 0.026*"online" + 0.022*"season" + 0.020*"trailer" + 0.015*"review" + 0.015*"comic" + 0.014*"4" + 0.014*"episode" + 0.012*"film" + 0.011*"6" + 0.010*"marvel" + 0.010*"money"')
(2, '0.076*"news" + 0.031*"2019" + 0.030*"video" + 0.027*"cricket" + 0.023*"hindi" + 0.022*"latest" + 0.017*"song" + 0.017*"2018" + 0.016*"pgt" + 0.016*"indian" + 0.016*"india" + 0.014*"new" + 0.014*"pakistan" + 0.014*"live" + 0.012*"tv"')
(3, '0.063*"pokemon" + 0.021*"android" + 0.018*"free" + 0.017*"go" + 0.016*"best" + 0.013*"iphone" + 0.013*"review" + 0.011*"card" + 0.011*"new" + 0.010*"moon" + 0.010*"sun" + 0.010*"io" + 0.010*"shiny" + 0.008*"pro" + 0.008*"hack"')
(

In [17]:
topics = lda.print_topics(num_words=15)

# Create a DataFrame from the topics
topics_data = []
for topic_id, topic in topics:
    topics_data.append({"Topic": topic_id, "Words": topic})

topics_df = pd.DataFrame(topics_data)

topics_df.to_csv("data/lda_topics.csv", index=False)
print("Topics saved to lda_topics.csv")

Topics saved to lda_topics.csv


In [18]:
print("Assigning topics to each document")

def assign_dominant_topic(tokens, lda_model, dictionary):
    if not tokens or not isinstance(tokens, list):  # Handle empty or invalid tokens
        return None, None
    bow = dictionary.doc2bow(tokens)  # Convert tokens to bag-of-words format
    topic_probs = lda_model.get_document_topics(bow)  # Get topic distribution
    if topic_probs:
        dominant_topic, prob = max(topic_probs, key=lambda x: x[1])  # Most probable topic
        return dominant_topic, prob
    return None, None

df_tags['Dominant_Topic'], df_tags['Topic_Probability'] = zip(
    *df_tags['Tokens'].apply(lambda tokens: assign_dominant_topic(tokens, lda, dictionary))
)

print(df_tags.head(20))
df_tags.to_csv('data/df_small_sample.csv', index=False)

Assigning topics to each document
                                                    Tags_combined  \
Decline Source                                                      
0       After   MsRosieBea\nMsRosieBea,21st birthday,birthday,...   
        Before  MsRosieBea\nMsRosieBea,OUTFIT DIARIES\nMsRosie...   
1       After   hollow,generationhollow,gameplay,review,guide,...   
        Before  hollow,generationhollow,playthrough,blind play...   
2       After                                                None   
        Before                                               None   
3       After   pubg,killstreak,player unknowns battleground,b...   
        Before  Rust,wipe,wipe day,horrible,a horrible wipe,lu...   
4       After   breyerfest,BreyerFest 2019,2019,2018,30th anni...   
        Before  Breyer,BreyerFest,CollectA,American Alligator,...   
5       After   Triple Entray,Phora,Drake,Eminem,Justin Bieber...   
        Before  Triple Entray,Hip hop,Phora,Eminem,King Lil G,...   


In [19]:
df_tags = df_tags.dropna(subset=['Tokens', 'Dominant_Topic'])

# Pivot the dataset
df_pivot = df_tags.pivot_table(
    index='Decline',  
    columns='Source',  
    values=['Tokens', 'Dominant_Topic'],  
    aggfunc={
        'Tokens': lambda x: ' '.join([item for sublist in x for item in sublist]),  # Flatten and join the tokens
        'Dominant_Topic': lambda x: x.mode()[0]  # Get the most frequent dominant topic
    }
)

df_pivot.head(20)

Unnamed: 0_level_0,Dominant_Topic,Dominant_Topic,Tokens,Tokens
Source,After,Before,After,Before
Decline,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,15.0,15.0,"msrosiebea msrosiebea,21st birthday birthday r...",msrosiebea msrosiebea outfit diary msrosiebea ...
1,12.0,12.0,hollow generationhollow gameplay review guide ...,hollow generationhollow playthrough blind play...
3,5.0,5.0,pubg killstreak player unknown battleground ba...,rust wipe wipe day horrible horrible wipe luck...
4,19.0,19.0,"breyerfest breyerfest 2019,2019,2018,30th anni...",breyer breyerfest collecta american alligator ...
5,7.0,7.0,triple entray phora drake eminem justin bieber...,triple entray hip hop phora eminem king lil g ...
7,16.0,0.0,yasha yasha jeltuhin cyr cyr wheel circus akro...,yasha yasha jeltuhin akrosphere circus jen mac...
8,7.0,7.0,free type beat free untagged beat playboi cart...,impulsebeats yung impulse impulse beat impulse...
10,17.0,17.0,pumpkin patch best pumpkin patch baby toddler ...,step wander stepstowander thewanderfamily wand...
11,12.0,12.0,pharmit pharmit24 malaysia awesome gamer youtu...,pharmit pharmit24 malaysia awesome gamer youtu...
12,17.0,17.0,twintalksballet twin twin talk talk ballet bal...,twintalksballet twin talk ballet ballerina dan...


In [20]:
def token_change(tokens_before, tokens_after):
    # Ensure tokens are lists and not NaN or float
    if not isinstance(tokens_before, list):
        tokens_before = []
    if not isinstance(tokens_after, list):
        tokens_after = []
        
    set_before = set(tokens_before)
    set_after = set(tokens_after)
    return set_before != set_after 

df_pivot['Token_Change'] = df_pivot.apply(
    lambda row: token_change(row[('Tokens', 'Before')], row[('Tokens', 'After')]), axis=1)

df_pivot['Topic_Change'] = df_pivot.apply(
    lambda row: row[('Dominant_Topic', 'Before')] != row[('Dominant_Topic', 'After')], axis=1)

df_pivot.head()

Unnamed: 0_level_0,Dominant_Topic,Dominant_Topic,Tokens,Tokens,Token_Change,Topic_Change
Source,After,Before,After,Before,Unnamed: 5_level_1,Unnamed: 6_level_1
Decline,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
0,15.0,15.0,"msrosiebea msrosiebea,21st birthday birthday r...",msrosiebea msrosiebea outfit diary msrosiebea ...,False,False
1,12.0,12.0,hollow generationhollow gameplay review guide ...,hollow generationhollow playthrough blind play...,False,False
3,5.0,5.0,pubg killstreak player unknown battleground ba...,rust wipe wipe day horrible horrible wipe luck...,False,False
4,19.0,19.0,"breyerfest breyerfest 2019,2019,2018,30th anni...",breyer breyerfest collecta american alligator ...,False,False
5,7.0,7.0,triple entray phora drake eminem justin bieber...,triple entray hip hop phora eminem king lil g ...,False,False


In [21]:
# Creation of a new csv file 
df_topic_change = df_pivot.reset_index()
df_topic_change = df_topic_change[['Decline', 'Topic_Change', 'Dominant_Topic']]
df_topic_change.to_csv('data/df_topic_change_20_15w.csv', index=False)

In [22]:
print(f"{df_topic_change['Topic_Change'].mean() * 100:.2f}% of the channels changed the topic of the videos after the start of the decline.")

34.87% of the channels changed the topic of the videos after the start of the decline.


In [None]:
from gensim.models import CoherenceModel

coherence_model_lda = CoherenceModel(model=lda, texts=df_tags['Tokens'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(f'Coherence Score: {coherence_lda}')
# small df: optimum at 0.7475 with 55 topics, numwords = 9 
# whole df: 0.6525 with 55 topics, 0.5991 with 50, 