In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from gensim.models import ldamodel
from tqdm import tqdm
from gensim import corpora
from src.utils.recovery_analysis_utils import str_to_list

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

decline_events = pd.read_csv('data/sampled_decline_events_with_videos.csv')
videos = pd.read_csv('data/videos_around_declines.csv')

decline_events['Videos_before'] = decline_events['Videos_before'].apply(str_to_list)
decline_events['Videos_after'] = decline_events['Videos_after'].apply(str_to_list)

In [3]:
# Create a data_frame with 2 index: the index of the decline and the source (before and after)

df_before = decline_events[['Videos_before']].explode('Videos_before')
df_before['Source'] = 'Before'
df_before = df_before.rename(columns={'Videos_before': 'Video'})

df_after = decline_events[['Videos_after']].explode('Videos_after')
df_after['Source'] = 'After'
df_after = df_after.rename(columns={'Videos_after': 'Video'})

df_tags = pd.concat([df_before, df_after], axis=0).reset_index().rename(columns={'index': 'Decline'})
df_tags = df_tags.set_index(['Decline', 'Source'])

df_tags.sort_values(by = ['Decline', 'Source'])
df_tags = df_tags.dropna()

In [4]:
# Map to obtain the tags of all videos for each video before and after decline
df_tags['Tags'] = df_tags['Video'].map(lambda video: videos.loc[video, 'tags'] if video in videos.index else None)
df_tags

Unnamed: 0_level_0,Unnamed: 1_level_0,Video,Tags
Decline,Source,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Before,1684989,MsRosieBea
0,Before,1684990,"MsRosieBea,primark haul,primark haul august,pr..."
0,Before,1684991,MsRosieBea
0,Before,1684992,MsRosieBea
0,Before,1684993,"MsRosieBea,red lip,get ready with me"
...,...,...,...
36598,After,1889699,"Music,beats,instrumental,right beat radio,stra..."
36598,After,1889700,"Music,beats,instrumental,right beat radio,late..."
36598,After,1889701,"Music,beats,instrumental,right beat radio,lofi..."
36598,After,1889702,"Music,beats,instrumental,right beat radio,mell..."


In [5]:
# Get for each decline only 2 rows with the tags corresponding to the before and the after, handling NaNs and non-list values
df_tags = df_tags.groupby(['Decline', 'Source'])['Tags'].apply(
    lambda x: list(set([item for sublist in x.dropna() for item in (sublist if isinstance(sublist, list) else [sublist])]))
).reset_index(name='Tags_combined')

df_tags.set_index(['Decline', 'Source'], inplace=True)

# Map the tags to a string, separating them by new lines
df_tags['Tags_combined'] = df_tags['Tags_combined'].map(lambda tags: '\n'.join(tags) if tags else None)

df_tags

Unnamed: 0_level_0,Unnamed: 1_level_0,Tags_combined
Decline,Source,Unnamed: 2_level_1
0,After,"MsRosieBea,uni,uni life,first year of uni,thir..."
0,Before,"MsRosieBea,red lip,get ready with me\nMsRosieB..."
1,After,"hollow,generationhollow,tea,questions,qna,answ..."
1,Before,"hollow,generationhollow,playthrough,blind play..."
2,After,
...,...,...
36595,Before,"Despacito accordion cover,Fonsi Despacito acco..."
36597,After,"Babbitt,Babbitt pouring,Keith Fenner,Fenner,ma..."
36597,Before,"Boat Lift,fork lift,Keith Fenner,Fenner,machin..."
36598,After,"Music,beats,instrumental,right beat radio,acou..."


In [25]:
import string

CASEFOLD = False

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_str(s):
    s = s.lower()
    if not isinstance(s, str) or not s.strip(): # Cases where s = None
        return []
    tokens = word_tokenize(s.lower() if CASEFOLD else s, preserve_line=True)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and word not in string.punctuation]
    return tokens

## Test with a small dataset

In [7]:
df_small = df_tags.head(100)
print(df_small.shape)

(100, 1)


In [26]:
print("Tokenizing and lemmatizing tags")
df_small['Tokens'] = None
for index, row in tqdm(df_small.iterrows(), total=df_small.shape[0]):
    df_small.at[index, 'Tokens'] = preprocess_str(row['Tags_combined'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small['Tokens'] = None


Tokenizing and lemmatizing tags


100%|██████████| 86/86 [00:01<00:00, 53.03it/s] 


In [27]:

# Create a dictionary and a corpus for the LDA model
print("Creating dictionary and corpus")
dictionary = corpora.Dictionary(df_small['Tokens'])
corpus = [dictionary.doc2bow(token_list) for token_list in df_small['Tokens']]

print("Training LDA model")
lda = ldamodel.LdaModel(corpus, num_topics=55, id2word=dictionary, passes=15)

topics = lda.print_topics(num_words=9)
for topic in topics:
    print(topic)

df_small

Creating dictionary and corpus
Training LDA model
(14, '0.001*"fortnite" + 0.001*"game" + 0.001*"wander" + 0.001*"video" + 0.001*"continue" + 0.001*"let" + 0.000*"kelsey" + 0.000*"van" + 0.000*"new"')
(32, '0.050*"best" + 0.046*"moment" + 0.045*"funny" + 0.023*"base" + 0.023*"survival" + 0.022*"series" + 0.021*"solo" + 0.021*"loot" + 0.019*"rust"')
(33, '0.023*"v" + 0.019*"flash" + 0.015*"superman" + 0.015*"new" + 0.014*"quicksilver" + 0.012*"holiday" + 0.012*"goku" + 0.012*"52" + 0.012*"mcu"')
(45, '0.016*"zoe" + 0.009*"raven" + 0.008*"mythology" + 0.008*"dead" + 0.008*"complete" + 0.001*"horse" + 0.000*"stormy" + 0.000*"toy" + 0.000*"fortnite"')
(27, '0.216*"fortnite" + 0.098*"skin" + 0.043*"free" + 0.038*"new" + 0.034*"season" + 0.027*"battle" + 0.021*"gifting" + 0.017*"trooper" + 0.016*"7"')
(22, '0.001*"wander" + 0.001*"kelsey" + 0.001*"brexit" + 0.000*"corbin" + 0.000*"o\'brien" + 0.000*"van" + 0.000*"life" + 0.000*"james" + 0.000*"step"')
(18, '0.045*"make-up" + 0.033*"skin" + 0

Unnamed: 0_level_0,Unnamed: 1_level_0,Tags_combined,Tokens,Dominant_Topic,Topic_Probability
Decline,Source,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,After,"MsRosieBea,uni,uni life,first year of uni,thir...","[msrosiebea, uni, uni, life, first, year, uni,...",42.0,0.960726
0,Before,"MsRosieBea,red lip,get ready with me\nMsRosieB...","[msrosiebea, red, lip, get, ready, msrosiebea,...",23.0,0.986549
1,After,"hollow,generationhollow,tea,questions,qna,answ...","[hollow, generationhollow, tea, question, qna,...",36.0,0.987197
1,Before,"hollow,generationhollow,playthrough,blind play...","[hollow, generationhollow, playthrough, blind,...",36.0,0.971890
3,After,"dayz,loot,betrayal,solo,friends,survive,surviv...","[dayz, loot, betrayal, solo, friend, survive, ...",23.0,0.574609
...,...,...,...,...,...
55,After,"dhs,thomas homan,donald Trump,president trump,...","[dhs, thomas, homan, donald, trump, president,...",33.0,0.998405
55,Before,"james o'brien,theresa may,pm may,james o'brien...","[james, o'brien, theresa, may, pm, may, james,...",33.0,0.999397
57,After,"SUICIDE BOYS TYPE BEAT,$uicide boy$ type,night...","[suicide, boy, type, beat, uicide, boy, type, ...",31.0,0.637635
57,Before,"sad trap beat,sad trap instrumental,goodbye,sa...","[sad, trap, beat, sad, trap, instrumental, goo...",31.0,0.993759


In [28]:
print("Assigning topics to each document")

# Assign the dominant topic to each document
def assign_dominant_topic(tokens, lda_model, dictionary):
    if not tokens or not isinstance(tokens, list):  # Handle empty or invalid tokens
        return None, None
    bow = dictionary.doc2bow(tokens)  # Convert tokens to bag-of-words format
    topic_probs = lda_model.get_document_topics(bow)  # Get topic distribution
    if topic_probs:
        dominant_topic, prob = max(topic_probs, key=lambda x: x[1])  # Most probable topic
        return dominant_topic, prob
    return None, None

df_small['Dominant_Topic'], df_small['Topic_Probability'] = zip(
    *df_small['Tokens'].apply(lambda tokens: assign_dominant_topic(tokens, lda, dictionary))
)

print(df_small.head(20))
df_small.to_csv('data/df_small_sample.csv', index=False)

Assigning topics to each document
                                                    Tags_combined  \
Decline Source                                                      
0       After   MsRosieBea,uni,uni life,first year of uni,thir...   
        Before  MsRosieBea,red lip,get ready with me\nMsRosieB...   
1       After   hollow,generationhollow,tea,questions,qna,answ...   
        Before  hollow,generationhollow,playthrough,blind play...   
3       After   dayz,loot,betrayal,solo,friends,survive,surviv...   
        Before  dayz,dayz standalone,.62,update,map,loot,inter...   
4       After   stormystrike,stormy strikes channel,stormystik...   
        Before  Spirit,Spirit Riding Free,Spirit Stallion of t...   
5       After   Triple Entray,Phora,Drake,Eminem,Justin Bieber...   
        Before  Triple Entray,Hip Hop,Eminem,Phora,Drake,Logic...   
7       After   Yasha,Yasha Jeltuhin,Akrosphere,Trapeze,Circus...   
        Before  Yasha,Yasha Jeltuhin,Akrosphere,Circus,Jen Mac...   


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small['Dominant_Topic'], df_small['Topic_Probability'] = zip(


We want to create two columns [Topic_change] and [Tokens_change] to determine if there is a difference between the tags before and after a decline. A change in tokens is used for granular analysis while a change is topics is more appropriate for detecting higher-level patterns.

In [29]:
df_small = df_small.dropna(subset=['Tokens', 'Dominant_Topic'])

# Pivot the dataset, keeping 'Dominant_topic' in a separate column
df_pivot = df_small.pivot_table(
    index='Decline',  # The index will be based on the 'Decline'
    columns='Source',  # We are splitting by 'Source' (Before and After)
    values=['Tokens', 'Dominant_Topic'],  # We want both Tokens and Dominant_topic in the pivoted table
    aggfunc={
        'Tokens': lambda x: ' '.join([item for sublist in x for item in sublist]),  # Flatten and join the tokens
        'Dominant_Topic': lambda x: x.mode()[0]  # Get the most frequent dominant topic (mode)
    }
)

df_pivot.head()

Unnamed: 0_level_0,Dominant_Topic,Dominant_Topic,Tokens,Tokens
Source,After,Before,After,Before
Decline,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,19.0,35.0,msrosiebea uni uni life first year uni third y...,msrosiebea red lip get ready msrosiebea bikini...
1,36.0,36.0,hollow generationhollow tea question qna answe...,hollow generationhollow playthrough blind play...
3,32.0,32.0,dayz loot betrayal solo friend survive surviva...,dayz dayz standalone .62 update map loot inter...
4,47.0,47.0,stormystrike stormy strike channel stormystike...,spirit spirit riding free spirit stallion cima...
5,39.0,39.0,triple entray phora drake eminem justin bieber...,triple entray hip hop eminem phora drake logic...


In [30]:
def token_change(tokens_before, tokens_after):
    # Ensure tokens are lists and not NaN or float
    if not isinstance(tokens_before, list):
        tokens_before = []
    if not isinstance(tokens_after, list):
        tokens_after = []
        
    # Compare sets of tokens
    set_before = set(tokens_before)
    set_after = set(tokens_after)
    return set_before != set_after  # Change if the sets are not identical

# Apply the token change function to compare the tokens before and after for each decline
df_pivot['Token_Change'] = df_pivot.apply(
    lambda row: token_change(row[('Tokens', 'Before')], row[('Tokens', 'After')]), axis=1)

# Assuming 'Dominant_topic' columns are available for 'Before' and 'After'
df_pivot['Topic_Change'] = df_pivot.apply(
    lambda row: row[('Dominant_Topic', 'Before')] != row[('Dominant_Topic', 'After')], axis=1)

# Verify the results
df_pivot.head()

Unnamed: 0_level_0,Dominant_Topic,Dominant_Topic,Tokens,Tokens,Token_Change,Topic_Change
Source,After,Before,After,Before,Unnamed: 5_level_1,Unnamed: 6_level_1
Decline,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
0,19.0,35.0,msrosiebea uni uni life first year uni third y...,msrosiebea red lip get ready msrosiebea bikini...,False,True
1,36.0,36.0,hollow generationhollow tea question qna answe...,hollow generationhollow playthrough blind play...,False,False
3,32.0,32.0,dayz loot betrayal solo friend survive surviva...,dayz dayz standalone .62 update map loot inter...,False,False
4,47.0,47.0,stormystrike stormy strike channel stormystike...,spirit spirit riding free spirit stallion cima...,False,False
5,39.0,39.0,triple entray phora drake eminem justin bieber...,triple entray hip hop eminem phora drake logic...,False,False


In [31]:
from gensim.models import CoherenceModel

coherence_model_lda = CoherenceModel(model=lda, texts=df_small['Tokens'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(f'Coherence Score: {coherence_lda}') # 0.7475 with 55 topics, numwords = 9


Coherence Score: 0.6721154057246735


## Preprocessing the whole dataset

In [32]:
print("Tokenizing and lemmatizing tags")
df_tags['Tokens'] = None
for index, row in tqdm(df_tags.iterrows(), total=df_tags.shape[0]):
    df_tags.at[index, 'Tokens'] = preprocess_str(row['Tags_combined'])

Tokenizing and lemmatizing tags


100%|██████████| 57517/57517 [12:53<00:00, 74.34it/s] 


In [33]:

# Create a dictionary and a corpus for the LDA model
print("Creating dictionary and corpus")
dictionary = corpora.Dictionary(df_tags['Tokens'])
corpus = [dictionary.doc2bow(token_list) for token_list in df_tags['Tokens']]

print("Training LDA model")
lda = ldamodel.LdaModel(corpus, num_topics=20, id2word=dictionary, passes=15)

topics = lda.print_topics(num_words=15)
for topic in topics:
    print(topic)

Creating dictionary and corpus
Training LDA model
(0, '0.057*"beat" + 0.041*"type" + 0.036*"nintendo" + 0.029*"smash" + 0.028*"switch" + 0.023*"free" + 0.022*"trap" + 0.018*"super" + 0.018*"mario" + 0.016*"rap" + 0.015*"2018" + 0.013*"hop" + 0.013*"ultimate" + 0.013*"tennis" + 0.013*"hip"')
(1, '0.049*"minecraft" + 0.039*"dota" + 0.034*"black" + 0.028*"ops" + 0.028*"call" + 0.028*"zombie" + 0.025*"duty" + 0.022*"cod" + 0.019*"ark" + 0.019*"ww2" + 0.017*"3" + 0.015*"mod" + 0.012*"gameplay" + 0.012*"4" + 0.012*"warfare"')
(2, '0.094*"fifa" + 0.075*"18" + 0.067*"19" + 0.045*"team" + 0.042*"pack" + 0.039*"17" + 0.029*"ultimate" + 0.022*"opening" + 0.020*"career" + 0.019*"mode" + 0.019*"squad" + 0.016*"fut" + 0.015*"player" + 0.013*"sbc" + 0.012*"gameplay"')
(3, '0.016*"news" + 0.015*"trump" + 0.009*"house" + 0.007*"show" + 0.006*"entertainment" + 0.006*"politics" + 0.006*"u" + 0.006*"interview" + 0.006*"medium" + 0.006*"donald" + 0.006*"state" + 0.005*"live" + 0.005*"mix" + 0.005*"tmz" + 0

In [40]:
topics = lda.print_topics(num_words=15)

# Create a DataFrame from the topics
topics_data = []
for topic_id, topic in topics:
    topics_data.append({"Topic": topic_id, "Words": topic})

topics_df = pd.DataFrame(topics_data)

topics_df.to_csv("data/lda_topics.csv", index=False)
print("Topics saved to lda_topics.csv")

Topics saved to lda_topics.csv


In [41]:
print("Assigning topics to each document")

def assign_dominant_topic(tokens, lda_model, dictionary):
    if not tokens or not isinstance(tokens, list):  # Handle empty or invalid tokens
        return None, None
    bow = dictionary.doc2bow(tokens)  # Convert tokens to bag-of-words format
    topic_probs = lda_model.get_document_topics(bow)  # Get topic distribution
    if topic_probs:
        dominant_topic, prob = max(topic_probs, key=lambda x: x[1])  # Most probable topic
        return dominant_topic, prob
    return None, None

df_tags['Dominant_Topic'], df_tags['Topic_Probability'] = zip(
    *df_tags['Tokens'].apply(lambda tokens: assign_dominant_topic(tokens, lda, dictionary))
)

print(df_tags.head(20))
df_tags.to_csv('df_small_sample.csv', index=False)

Assigning topics to each document
                                                    Tags_combined  \
Decline Source                                                      
0       After   MsRosieBea,uni,uni life,first year of uni,thir...   
        Before  MsRosieBea,red lip,get ready with me\nMsRosieB...   
1       After   hollow,generationhollow,tea,questions,qna,answ...   
        Before  hollow,generationhollow,playthrough,blind play...   
3       After   dayz,loot,betrayal,solo,friends,survive,surviv...   
        Before  dayz,dayz standalone,.62,update,map,loot,inter...   
4       After   stormystrike,stormy strikes channel,stormystik...   
        Before  Spirit,Spirit Riding Free,Spirit Stallion of t...   
5       After   Triple Entray,Phora,Drake,Eminem,Justin Bieber...   
        Before  Triple Entray,Hip Hop,Eminem,Phora,Drake,Logic...   
7       After   Yasha,Yasha Jeltuhin,Akrosphere,Trapeze,Circus...   
        Before  Yasha,Yasha Jeltuhin,Akrosphere,Circus,Jen Mac...   


In [42]:
df_tags = df_tags.dropna(subset=['Tokens', 'Dominant_Topic'])

# Pivot the dataset
df_pivot = df_tags.pivot_table(
    index='Decline',  
    columns='Source',  
    values=['Tokens', 'Dominant_Topic'],  
    aggfunc={
        'Tokens': lambda x: ' '.join([item for sublist in x for item in sublist]),  # Flatten and join the tokens
        'Dominant_Topic': lambda x: x.mode()[0]  # Get the most frequent dominant topic
    }
)

df_pivot.head(20)

Unnamed: 0_level_0,Dominant_Topic,Dominant_Topic,Tokens,Tokens
Source,After,Before,After,Before
Decline,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,7.0,7.0,msrosiebea uni uni life first year uni third y...,msrosiebea red lip get ready msrosiebea bikini...
1,19.0,19.0,hollow generationhollow tea question qna answe...,hollow generationhollow playthrough blind play...
3,9.0,9.0,dayz loot betrayal solo friend survive surviva...,dayz dayz standalone .62 update map loot inter...
4,18.0,18.0,stormystrike stormy strike channel stormystike...,spirit spirit riding free spirit stallion cima...
5,0.0,0.0,triple entray phora drake eminem justin bieber...,triple entray hip hop eminem phora drake logic...
7,11.0,13.0,yasha yasha jeltuhin akrosphere trapeze circus...,yasha yasha jeltuhin akrosphere circus jen mac...
8,0.0,0.0,free type beat free untagged beat playboi cart...,impulsebeats yung impulse impulse beat impulse...
10,7.0,13.0,wander step wander honeymoon photoshoot best h...,comedy dream house new house new home buying h...
11,19.0,19.0,pharmit pharmit24 malaysia awesome gamer youtu...,pharmit pharmit24 malaysia awesome gamer youtu...
12,7.0,7.0,twintalksballet twin twin talk talk ballet bal...,twintalksballet twin twin talk talk ballet bal...


In [43]:
def token_change(tokens_before, tokens_after):
    # Ensure tokens are lists and not NaN or float
    if not isinstance(tokens_before, list):
        tokens_before = []
    if not isinstance(tokens_after, list):
        tokens_after = []
        
    set_before = set(tokens_before)
    set_after = set(tokens_after)
    return set_before != set_after 

df_pivot['Token_Change'] = df_pivot.apply(
    lambda row: token_change(row[('Tokens', 'Before')], row[('Tokens', 'After')]), axis=1)

df_pivot['Topic_Change'] = df_pivot.apply(
    lambda row: row[('Dominant_Topic', 'Before')] != row[('Dominant_Topic', 'After')], axis=1)

df_pivot.head()

Unnamed: 0_level_0,Dominant_Topic,Dominant_Topic,Tokens,Tokens,Token_Change,Topic_Change
Source,After,Before,After,Before,Unnamed: 5_level_1,Unnamed: 6_level_1
Decline,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
0,7.0,7.0,msrosiebea uni uni life first year uni third y...,msrosiebea red lip get ready msrosiebea bikini...,False,False
1,19.0,19.0,hollow generationhollow tea question qna answe...,hollow generationhollow playthrough blind play...,False,False
3,9.0,9.0,dayz loot betrayal solo friend survive surviva...,dayz dayz standalone .62 update map loot inter...,False,False
4,18.0,18.0,stormystrike stormy strike channel stormystike...,spirit spirit riding free spirit stallion cima...,False,False
5,0.0,0.0,triple entray phora drake eminem justin bieber...,triple entray hip hop eminem phora drake logic...,False,False


In [44]:
# Creation of a new csv file 
df_topic_change = df_pivot.reset_index()
df_topic_change = df_topic_change[['Decline', 'Topic_Change', 'Dominant_Topic']]
df_topic_change.to_csv('df_topic_change_20_15w.csv', index=False)

In [45]:
print(f"{df_topic_change['Topic_Change'].mean() * 100:.2f}% of the channels changed the topic of the videos after the start of the decline.")

36.41% of the channels changed the topic of the videos after the start of the decline.


In [46]:
from gensim.models import CoherenceModel

coherence_model_lda = CoherenceModel(model=lda, texts=df_tags['Tokens'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(f'Coherence Score: {coherence_lda}')
# small df: optimum at 0.7475 with 55 topics, numwords = 9 
# whole df: 0.6525 with 55 topics, 0.5991 with 50, 

Coherence Score: 0.5810251989073743
