In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from gensim.models import ldamodel
from tqdm import tqdm
from gensim import corpora
from src.utils.recovery_analysis_utils import str_to_list

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

decline_events = pd.read_csv('data/sampled_decline_events_with_videos.csv')
videos = pd.read_csv('data/videos_around_declines.csv')

decline_events['Videos_before'] = decline_events['Videos_before'].apply(str_to_list)
decline_events['Videos_after'] = decline_events['Videos_after'].apply(str_to_list)

In [2]:
# Create a data_frame with 2 index: the index of the decline and the source (before and after)

df_before = decline_events[['Videos_before']].explode('Videos_before')
df_before['Source'] = 'Before'
df_before = df_before.rename(columns={'Videos_before': 'Video'})

df_after = decline_events[['Videos_after']].explode('Videos_after')
df_after['Source'] = 'After'
df_after = df_after.rename(columns={'Videos_after': 'Video'})

df_tags = pd.concat([df_before, df_after], axis=0).reset_index().rename(columns={'index': 'Decline'})
df_tags = df_tags.set_index(['Decline', 'Source'])

df_tags.sort_values(by = ['Decline', 'Source'])
df_tags = df_tags.dropna()

In [3]:
# Map to obtain the tags of all videos for each video before and after decline
df_tags['Tags'] = df_tags['Video'].map(lambda video: videos.loc[video, 'tags'] if video in videos.index else None)
df_tags

Unnamed: 0_level_0,Unnamed: 1_level_0,Video,Tags
Decline,Source,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Before,1684989,MsRosieBea
0,Before,1684990,"MsRosieBea,primark haul,primark haul august,pr..."
0,Before,1684991,MsRosieBea
0,Before,1684992,MsRosieBea
0,Before,1684993,"MsRosieBea,red lip,get ready with me"
...,...,...,...
36598,After,1889699,"Music,beats,instrumental,right beat radio,stra..."
36598,After,1889700,"Music,beats,instrumental,right beat radio,late..."
36598,After,1889701,"Music,beats,instrumental,right beat radio,lofi..."
36598,After,1889702,"Music,beats,instrumental,right beat radio,mell..."


In [4]:
# Get for each decline only 2 rows with the tags corresponding to the before and the after, handling NaNs and non-list values
df_tags = df_tags.groupby(['Decline', 'Source'])['Tags'].apply(
    lambda x: list(set([item for sublist in x.dropna() for item in (sublist if isinstance(sublist, list) else [sublist])]))
).reset_index(name='Tags_combined')

df_tags.set_index(['Decline', 'Source'], inplace=True)

# Map the tags to a string, separating them by new lines
df_tags['Tags_combined'] = df_tags['Tags_combined'].map(lambda tags: '\n'.join(tags) if tags else None)

df_tags

Unnamed: 0_level_0,Unnamed: 1_level_0,Tags_combined
Decline,Source,Unnamed: 2_level_1
0,After,"MsRosieBea\nMsRosieBea,21st birthday,birthday,..."
0,Before,"MsRosieBea\nMsRosieBea,OUTFIT DIARIES\nMsRosie..."
1,After,"hollow,generationhollow,gameplay,review,guide,..."
1,Before,"hollow,generationhollow,playthrough,blind play..."
2,After,
...,...,...
36595,Before,"Despacito accordion cover,Fonsi Despacito acco..."
36597,After,"Patriot Attitude,Plasma Art,#whenhellfreezesov..."
36597,Before,"Bridgeport,Stainless Steel Placards,Roller Kit..."
36598,After,"Music,beats,instrumental,right beat radio,rhod..."


In [8]:
import string

CASEFOLD = False

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_str(s):
    if not isinstance(s, str) or not s.strip(): # Cases where s = None
        return []
    s = s.lower()
    tokens = word_tokenize(s.lower() if CASEFOLD else s, preserve_line=True)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and word not in string.punctuation]
    return tokens

## Test with a small dataset

In [6]:
df_small = df_tags.head(100)
print(df_small.shape)

(100, 1)


In [9]:
print("Tokenizing and lemmatizing tags")
df_small['Tokens'] = None
for index, row in tqdm(df_small.iterrows(), total=df_small.shape[0]):
    df_small.at[index, 'Tokens'] = preprocess_str(row['Tags_combined'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small['Tokens'] = None


Tokenizing and lemmatizing tags


100%|██████████| 100/100 [00:00<00:00, 375.81it/s]


In [10]:

# Create a dictionary and a corpus for the LDA model
print("Creating dictionary and corpus")
dictionary = corpora.Dictionary(df_small['Tokens'])
corpus = [dictionary.doc2bow(token_list) for token_list in df_small['Tokens']]

print("Training LDA model")
lda = ldamodel.LdaModel(corpus, num_topics=55, id2word=dictionary, passes=15)

topics = lda.print_topics(num_words=9)
for topic in topics:
    print(topic)

df_small

Creating dictionary and corpus
Training LDA model
(8, '0.072*"gun" + 0.057*"duty" + 0.054*"call" + 0.050*"ww2" + 0.048*"motion" + 0.048*"slow" + 0.035*"weapon" + 0.034*"battlefield" + 0.032*"animation"')
(32, '0.135*"fortnite" + 0.039*"ninja" + 0.029*"voice" + 0.029*"real" + 0.023*"moment" + 0.023*"funny" + 0.017*"\'s" + 0.015*"faze" + 0.015*"montage"')
(17, '0.000*"wander" + 0.000*"corbin" + 0.000*"van" + 0.000*"kelsey" + 0.000*"life" + 0.000*"pregnancy" + 0.000*"house" + 0.000*"new" + 0.000*"rv"')
(22, '0.037*"urdu" + 0.025*"opener" + 0.025*"youtube" + 0.022*"illuminati" + 0.021*"tv" + 0.021*"hindi" + 0.020*"time" + 0.019*"space" + 0.016*"poem"')
(42, '0.032*"fortnitegod" + 0.030*"fortnite" + 0.020*"fortnitemyth" + 0.019*"howtofortnite" + 0.018*"ninjafortnite" + 0.018*"fortniteps4" + 0.016*"bestplayerinconsole" + 0.016*"fortnitebestplayer" + 0.013*"fortniteninja"')
(48, '0.097*"2017" + 0.089*"hees" + 0.071*"cusub" + 0.056*"lafoole" + 0.050*"somali" + 0.040*"indho" + 0.040*"nasteexo" 

Unnamed: 0_level_0,Unnamed: 1_level_0,Tags_combined,Tokens
Decline,Source,Unnamed: 2_level_1,Unnamed: 3_level_1
0,After,"MsRosieBea\nMsRosieBea,21st birthday,birthday,...","[msrosiebea, msrosiebea,21st, birthday, birthd..."
0,Before,"MsRosieBea\nMsRosieBea,OUTFIT DIARIES\nMsRosie...","[msrosiebea, msrosiebea, outfit, diary, msrosi..."
1,After,"hollow,generationhollow,gameplay,review,guide,...","[hollow, generationhollow, gameplay, review, g..."
1,Before,"hollow,generationhollow,playthrough,blind play...","[hollow, generationhollow, playthrough, blind,..."
2,After,,[]
...,...,...,...
55,After,"boris Johnson,Brexit,new pm boris Johnson,Brex...","[boris, johnson, brexit, new, pm, boris, johns..."
55,Before,"james o'brien,james o'brien brexit,brexit disa...","[james, o'brien, james, o'brien, brexit, brexi..."
57,After,"suicide boys type beat,ghostemane type beat,xx...","[suicide, boy, type, beat, ghostemane, type, b..."
57,Before,"beat,jazz,sax,saxofon,rhodes,chill,dark,experi...","[beat, jazz, sax, saxofon, rhodes, chill, dark..."


In [11]:
print("Assigning topics to each document")

# Assign the dominant topic to each document
def assign_dominant_topic(tokens, lda_model, dictionary):
    if not tokens or not isinstance(tokens, list):  # Handle empty or invalid tokens
        return None, None
    bow = dictionary.doc2bow(tokens)  # Convert tokens to bag-of-words format
    topic_probs = lda_model.get_document_topics(bow)  # Get topic distribution
    if topic_probs:
        dominant_topic, prob = max(topic_probs, key=lambda x: x[1])  # Most probable topic
        return dominant_topic, prob
    return None, None

df_small['Dominant_Topic'], df_small['Topic_Probability'] = zip(
    *df_small['Tokens'].apply(lambda tokens: assign_dominant_topic(tokens, lda, dictionary))
)

print(df_small.head(20))
df_small.to_csv('data/df_small_sample.csv', index=False)

Assigning topics to each document
                                                    Tags_combined  \
Decline Source                                                      
0       After   MsRosieBea\nMsRosieBea,21st birthday,birthday,...   
        Before  MsRosieBea\nMsRosieBea,OUTFIT DIARIES\nMsRosie...   
1       After   hollow,generationhollow,gameplay,review,guide,...   
        Before  hollow,generationhollow,playthrough,blind play...   
2       After                                                None   
        Before                                               None   
3       After   pubg,killstreak,player unknowns battleground,b...   
        Before  Rust,wipe,wipe day,horrible,a horrible wipe,lu...   
4       After   breyerfest,BreyerFest 2019,2019,2018,30th anni...   
        Before  Breyer,BreyerFest,CollectA,American Alligator,...   
5       After   Triple Entray,Phora,Drake,Eminem,Justin Bieber...   
        Before  Triple Entray,Hip hop,Phora,Eminem,King Lil G,...   


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small['Dominant_Topic'], df_small['Topic_Probability'] = zip(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small['Dominant_Topic'], df_small['Topic_Probability'] = zip(


We want to create two columns [Topic_change] and [Tokens_change] to determine if there is a difference between the tags before and after a decline. A change in tokens is used for granular analysis while a change is topics is more appropriate for detecting higher-level patterns.

In [12]:
df_small = df_small.dropna(subset=['Tokens', 'Dominant_Topic'])

# Pivot the dataset, keeping 'Dominant_topic' in a separate column
df_pivot = df_small.pivot_table(
    index='Decline',  # The index will be based on the 'Decline'
    columns='Source',  # We are splitting by 'Source' (Before and After)
    values=['Tokens', 'Dominant_Topic'],  # We want both Tokens and Dominant_topic in the pivoted table
    aggfunc={
        'Tokens': lambda x: ' '.join([item for sublist in x for item in sublist]),  # Flatten and join the tokens
        'Dominant_Topic': lambda x: x.mode()[0]  # Get the most frequent dominant topic (mode)
    }
)

df_pivot.head()

Unnamed: 0_level_0,Dominant_Topic,Dominant_Topic,Tokens,Tokens
Source,After,Before,After,Before
Decline,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,35.0,1.0,"msrosiebea msrosiebea,21st birthday birthday r...",msrosiebea msrosiebea outfit diary msrosiebea ...
1,25.0,25.0,hollow generationhollow gameplay review guide ...,hollow generationhollow playthrough blind play...
3,31.0,44.0,pubg killstreak player unknown battleground ba...,rust wipe wipe day horrible horrible wipe luck...
4,49.0,49.0,"breyerfest breyerfest 2019,2019,2018,30th anni...",breyer breyerfest collecta american alligator ...
5,45.0,45.0,triple entray phora drake eminem justin bieber...,triple entray hip hop phora eminem king lil g ...


In [13]:
def token_change(tokens_before, tokens_after):
    # Ensure tokens are lists and not NaN or float
    if not isinstance(tokens_before, list):
        tokens_before = []
    if not isinstance(tokens_after, list):
        tokens_after = []
        
    # Compare sets of tokens
    set_before = set(tokens_before)
    set_after = set(tokens_after)
    return set_before != set_after  # Change if the sets are not identical

# Apply the token change function to compare the tokens before and after for each decline
df_pivot['Token_Change'] = df_pivot.apply(
    lambda row: token_change(row[('Tokens', 'Before')], row[('Tokens', 'After')]), axis=1)

# Assuming 'Dominant_topic' columns are available for 'Before' and 'After'
df_pivot['Topic_Change'] = df_pivot.apply(
    lambda row: row[('Dominant_Topic', 'Before')] != row[('Dominant_Topic', 'After')], axis=1)

# Verify the results
df_pivot.head()

Unnamed: 0_level_0,Dominant_Topic,Dominant_Topic,Tokens,Tokens,Token_Change,Topic_Change
Source,After,Before,After,Before,Unnamed: 5_level_1,Unnamed: 6_level_1
Decline,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
0,35.0,1.0,"msrosiebea msrosiebea,21st birthday birthday r...",msrosiebea msrosiebea outfit diary msrosiebea ...,False,True
1,25.0,25.0,hollow generationhollow gameplay review guide ...,hollow generationhollow playthrough blind play...,False,False
3,31.0,44.0,pubg killstreak player unknown battleground ba...,rust wipe wipe day horrible horrible wipe luck...,False,True
4,49.0,49.0,"breyerfest breyerfest 2019,2019,2018,30th anni...",breyer breyerfest collecta american alligator ...,False,False
5,45.0,45.0,triple entray phora drake eminem justin bieber...,triple entray hip hop phora eminem king lil g ...,False,False


In [14]:
from gensim.models import CoherenceModel

coherence_model_lda = CoherenceModel(model=lda, texts=df_small['Tokens'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(f'Coherence Score: {coherence_lda}') # 0.7475 with 55 topics, numwords = 9


Coherence Score: 0.6784999505432454


## Preprocessing the whole dataset

In [15]:
print("Tokenizing and lemmatizing tags")
df_tags['Tokens'] = None
for index, row in tqdm(df_tags.iterrows(), total=df_tags.shape[0]):
    df_tags.at[index, 'Tokens'] = preprocess_str(row['Tags_combined'])

Tokenizing and lemmatizing tags


100%|██████████| 61194/61194 [03:29<00:00, 292.74it/s]


In [16]:

# Create a dictionary and a corpus for the LDA model
print("Creating dictionary and corpus")
dictionary = corpora.Dictionary(df_tags['Tokens'])
corpus = [dictionary.doc2bow(token_list) for token_list in df_tags['Tokens']]

print("Training LDA model")
lda = ldamodel.LdaModel(corpus, num_topics=20, id2word=dictionary, passes=15)

topics = lda.print_topics(num_words=15)
for topic in topics:
    print(topic)

Creating dictionary and corpus
Training LDA model
(0, '0.023*"car" + 0.013*"golf" + 0.010*"train" + 0.009*"asmr" + 0.009*"travel" + 0.007*"crash" + 0.006*"racing" + 0.006*"dubai" + 0.006*"fishing" + 0.006*"bike" + 0.006*"race" + 0.006*"road" + 0.006*"park" + 0.005*"street" + 0.005*"tour"')
(1, '0.043*"gta" + 0.041*"v" + 0.039*"movie" + 0.038*"5" + 0.026*"online" + 0.022*"season" + 0.020*"trailer" + 0.015*"review" + 0.015*"comic" + 0.014*"4" + 0.014*"episode" + 0.012*"film" + 0.011*"6" + 0.010*"marvel" + 0.010*"money"')
(2, '0.076*"news" + 0.031*"2019" + 0.030*"video" + 0.027*"cricket" + 0.023*"hindi" + 0.022*"latest" + 0.017*"song" + 0.017*"2018" + 0.016*"pgt" + 0.016*"indian" + 0.016*"india" + 0.014*"new" + 0.014*"pakistan" + 0.014*"live" + 0.012*"tv"')
(3, '0.063*"pokemon" + 0.021*"android" + 0.018*"free" + 0.017*"go" + 0.016*"best" + 0.013*"iphone" + 0.013*"review" + 0.011*"card" + 0.011*"new" + 0.010*"moon" + 0.010*"sun" + 0.010*"io" + 0.010*"shiny" + 0.008*"pro" + 0.008*"hack"')
(

In [17]:
topics = lda.print_topics(num_words=15)

# Create a DataFrame from the topics
topics_data = []
for topic_id, topic in topics:
    topics_data.append({"Topic": topic_id, "Words": topic})

topics_df = pd.DataFrame(topics_data)

topics_df.to_csv("data/lda_topics.csv", index=False)
print("Topics saved to lda_topics.csv")

Topics saved to lda_topics.csv


In [18]:
print("Assigning topics to each document")

def assign_dominant_topic(tokens, lda_model, dictionary):
    if not tokens or not isinstance(tokens, list):  # Handle empty or invalid tokens
        return None, None
    bow = dictionary.doc2bow(tokens)  # Convert tokens to bag-of-words format
    topic_probs = lda_model.get_document_topics(bow)  # Get topic distribution
    if topic_probs:
        dominant_topic, prob = max(topic_probs, key=lambda x: x[1])  # Most probable topic
        return dominant_topic, prob
    return None, None

df_tags['Dominant_Topic'], df_tags['Topic_Probability'] = zip(
    *df_tags['Tokens'].apply(lambda tokens: assign_dominant_topic(tokens, lda, dictionary))
)

print(df_tags.head(20))
df_tags.to_csv('data/df_small_sample.csv', index=False)

Assigning topics to each document
                                                    Tags_combined  \
Decline Source                                                      
0       After   MsRosieBea\nMsRosieBea,21st birthday,birthday,...   
        Before  MsRosieBea\nMsRosieBea,OUTFIT DIARIES\nMsRosie...   
1       After   hollow,generationhollow,gameplay,review,guide,...   
        Before  hollow,generationhollow,playthrough,blind play...   
2       After                                                None   
        Before                                               None   
3       After   pubg,killstreak,player unknowns battleground,b...   
        Before  Rust,wipe,wipe day,horrible,a horrible wipe,lu...   
4       After   breyerfest,BreyerFest 2019,2019,2018,30th anni...   
        Before  Breyer,BreyerFest,CollectA,American Alligator,...   
5       After   Triple Entray,Phora,Drake,Eminem,Justin Bieber...   
        Before  Triple Entray,Hip hop,Phora,Eminem,King Lil G,...   


In [19]:
df_tags = df_tags.dropna(subset=['Tokens', 'Dominant_Topic'])

# Pivot the dataset
df_pivot = df_tags.pivot_table(
    index='Decline',  
    columns='Source',  
    values=['Tokens', 'Dominant_Topic'],  
    aggfunc={
        'Tokens': lambda x: ' '.join([item for sublist in x for item in sublist]),  # Flatten and join the tokens
        'Dominant_Topic': lambda x: x.mode()[0]  # Get the most frequent dominant topic
    }
)

df_pivot.head(20)

Unnamed: 0_level_0,Dominant_Topic,Dominant_Topic,Tokens,Tokens
Source,After,Before,After,Before
Decline,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,15.0,15.0,"msrosiebea msrosiebea,21st birthday birthday r...",msrosiebea msrosiebea outfit diary msrosiebea ...
1,12.0,12.0,hollow generationhollow gameplay review guide ...,hollow generationhollow playthrough blind play...
3,5.0,5.0,pubg killstreak player unknown battleground ba...,rust wipe wipe day horrible horrible wipe luck...
4,19.0,19.0,"breyerfest breyerfest 2019,2019,2018,30th anni...",breyer breyerfest collecta american alligator ...
5,7.0,7.0,triple entray phora drake eminem justin bieber...,triple entray hip hop phora eminem king lil g ...
7,16.0,0.0,yasha yasha jeltuhin cyr cyr wheel circus akro...,yasha yasha jeltuhin akrosphere circus jen mac...
8,7.0,7.0,free type beat free untagged beat playboi cart...,impulsebeats yung impulse impulse beat impulse...
10,17.0,17.0,pumpkin patch best pumpkin patch baby toddler ...,step wander stepstowander thewanderfamily wand...
11,12.0,12.0,pharmit pharmit24 malaysia awesome gamer youtu...,pharmit pharmit24 malaysia awesome gamer youtu...
12,17.0,17.0,twintalksballet twin twin talk talk ballet bal...,twintalksballet twin talk ballet ballerina dan...


In [20]:
def token_change(tokens_before, tokens_after):
    # Ensure tokens are lists and not NaN or float
    if not isinstance(tokens_before, list):
        tokens_before = []
    if not isinstance(tokens_after, list):
        tokens_after = []
        
    set_before = set(tokens_before)
    set_after = set(tokens_after)
    return set_before != set_after 

df_pivot['Token_Change'] = df_pivot.apply(
    lambda row: token_change(row[('Tokens', 'Before')], row[('Tokens', 'After')]), axis=1)

df_pivot['Topic_Change'] = df_pivot.apply(
    lambda row: row[('Dominant_Topic', 'Before')] != row[('Dominant_Topic', 'After')], axis=1)

df_pivot.head()

Unnamed: 0_level_0,Dominant_Topic,Dominant_Topic,Tokens,Tokens,Token_Change,Topic_Change
Source,After,Before,After,Before,Unnamed: 5_level_1,Unnamed: 6_level_1
Decline,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
0,15.0,15.0,"msrosiebea msrosiebea,21st birthday birthday r...",msrosiebea msrosiebea outfit diary msrosiebea ...,False,False
1,12.0,12.0,hollow generationhollow gameplay review guide ...,hollow generationhollow playthrough blind play...,False,False
3,5.0,5.0,pubg killstreak player unknown battleground ba...,rust wipe wipe day horrible horrible wipe luck...,False,False
4,19.0,19.0,"breyerfest breyerfest 2019,2019,2018,30th anni...",breyer breyerfest collecta american alligator ...,False,False
5,7.0,7.0,triple entray phora drake eminem justin bieber...,triple entray hip hop phora eminem king lil g ...,False,False


In [21]:
# Creation of a new csv file 
df_topic_change = df_pivot.reset_index()
df_topic_change = df_topic_change[['Decline', 'Topic_Change', 'Dominant_Topic']]
df_topic_change.to_csv('data/df_topic_change_20_15w.csv', index=False)

In [22]:
print(f"{df_topic_change['Topic_Change'].mean() * 100:.2f}% of the channels changed the topic of the videos after the start of the decline.")

34.87% of the channels changed the topic of the videos after the start of the decline.


In [None]:
from gensim.models import CoherenceModel

coherence_model_lda = CoherenceModel(model=lda, texts=df_tags['Tokens'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(f'Coherence Score: {coherence_lda}')
# small df: optimum at 0.7475 with 55 topics, numwords = 9 
# whole df: 0.6525 with 55 topics, 0.5991 with 50, 