In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from gensim.models import ldamodel
from tqdm import tqdm
from gensim import corpora
from src.utils.recovery_analysis_utils import str_to_list

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

decline_events = pd.read_csv('data/sampled_decline_events_with_videos.csv')
videos = pd.read_csv('data/videos_around_declines.csv')

decline_events['Videos_before'] = decline_events['Videos_before'].apply(str_to_list)
decline_events['Videos_after'] = decline_events['Videos_after'].apply(str_to_list)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [43]:
# Create a data_frame with 2 index: the index of the decline and the source (before and after)

df_before = decline_events[['Videos_before']].explode('Videos_before')
df_before['Source'] = 'Before'
df_before = df_before.rename(columns={'Videos_before': 'Video'})

df_after = decline_events[['Videos_after']].explode('Videos_after')
df_after['Source'] = 'After'
df_after = df_after.rename(columns={'Videos_after': 'Video'})

df_tags = pd.concat([df_before, df_after], axis=0).reset_index().rename(columns={'index': 'Decline'})
df_tags = df_tags.set_index(['Decline', 'Source'])

df_tags.sort_values(by = ['Decline', 'Source'])
df_tags = df_tags.dropna()

In [44]:
# Map to obtain the tags of all videos for each video before and after decline
df_tags['Tags'] = df_tags['Video'].map(lambda video: videos.loc[video, 'tags'] if video in videos.index else None)
df_tags

Unnamed: 0_level_0,Unnamed: 1_level_0,Video,Tags
Decline,Source,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Before,1684989,MsRosieBea
0,Before,1684990,"MsRosieBea,primark haul,primark haul august,pr..."
0,Before,1684991,MsRosieBea
0,Before,1684992,MsRosieBea
0,Before,1684993,"MsRosieBea,red lip,get ready with me"
...,...,...,...
36598,After,1889699,"Music,beats,instrumental,right beat radio,stra..."
36598,After,1889700,"Music,beats,instrumental,right beat radio,late..."
36598,After,1889701,"Music,beats,instrumental,right beat radio,lofi..."
36598,After,1889702,"Music,beats,instrumental,right beat radio,mell..."


In [45]:
# Get for each decline only 2 rows with the tags corresponding to the before and the after, handling NaNs and non-list values
df_tags = df_tags.groupby(['Decline', 'Source'])['Tags'].apply(
    lambda x: list(set([item for sublist in x.dropna() for item in (sublist if isinstance(sublist, list) else [sublist])]))
).reset_index(name='Tags_combined')

df_tags.set_index(['Decline', 'Source'], inplace=True)

# Map the tags to a string, separating them by new lines
df_tags['Tags_combined'] = df_tags['Tags_combined'].map(lambda tags: '\n'.join(tags) if tags else None)

df_tags

Unnamed: 0_level_0,Unnamed: 1_level_0,Tags_combined
Decline,Source,Unnamed: 2_level_1
0,After,"MsRosieBea,21st birthday,birthday,ring,jewelle..."
0,Before,"MsRosieBea,uni work,studying fashion design,fa..."
1,After,"hollow,generationhollow,playthrough,blind play..."
1,Before,"hollow,generationhollow,playthrough,blind play..."
2,After,
...,...,...
36595,Before,"Despacito accordion cover,Fonsi Despacito acco..."
36597,After,"Shaper,Clapper,Keith Fenner,Fenner,machine sho..."
36597,Before,"Bridgeport,Stainless Steel Placards,Roller Kit..."
36598,After,"Music,beats,instrumental,right beat radio,minn..."


In [89]:
import string

CASEFOLD = False

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_str(s):
    if not isinstance(s, str) or not s.strip(): # Cases where s = None
        return []
    tokens = word_tokenize(s.lower() if CASEFOLD else s, preserve_line=True)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and word not in string.punctuation]
    return tokens

In [5]:
print("Tokenizing and lemmatizing tags")
df_tags['Tokens'] = None
for index, row in tqdm(df_tags.iterrows(), total=df_tags.shape[0]):
    df_tags.at[index, 'Tokens'] = preprocess_str(row['Tags_combined'])


# Create a dictionary and a corpus for the LDA model
print("Creating dictionary and corpus")
dictionary = corpora.Dictionary(df_tags['Tokens'])
corpus = [dictionary.doc2bow(token_list) for token_list in df_tags['Tokens']]

print("Training LDA model")
lda = ldamodel.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)

topics = lda.print_topics(num_words=5)
for topic in topics:
    print(topic)

df_tags

Tokenizing and lemmatizing tags


100%|██████████| 61194/61194 [27:20<00:00, 37.30it/s]  


Creating dictionary and corpus


TypeError: decoding to str: need a bytes-like object, list found

In [18]:
#print(df_tags['Tokens'].head(10))
# Replace None or NaN in Tokens with empty lists
df_tags['Tokens'] = df_tags['Tokens'].apply(
    lambda x: [] if x is None else x
)


# Flatten any nested lists in Tokens
df_tags['Tokens'] = df_tags['Tokens'].apply(
    lambda tokens: [item for sublist in tokens for item in sublist] if any(isinstance(i, list) for i in tokens) else tokens
    if isinstance(tokens, list) else []
)

# Check the cleaned Tokens column
print(df_tags['Tokens'])

Decline  Source
0        After     [M, R, e, B, e, ,, 2, 1, b, r, h, ,, b, r, h, ...
         Before    [M, R, e, B, e, ,, u, n, w, r, k, ,, u, n, g, ...
1        After     [h, l, l, w, ,, g, e, n, e, r, n, h, l, l, w, ...
         Before    [h, l, l, w, ,, g, e, n, e, r, n, h, l, l, w, ...
2        After                                                    []
                                         ...                        
36595    Before    [D, e, p, c, c, c, r, n, c, v, e, r, ,, F, n, ...
36597    After     [S, h, p, e, r, ,, C, l, p, p, e, r, ,, K, e, ...
         Before    [B, r, g, e, p, r, ,, S, n, l, e, S, e, e, l, ...
36598    After     [M, u, c, ,, b, e, ,, n, r, u, e, n, l, ,, r, ...
         Before    [M, u, c, ,, b, e, ,, n, r, u, e, n, l, ,, r, ...
Name: Tokens, Length: 61194, dtype: object


In [15]:
df_tags['Tokens'].apply(type).value_counts()

Tokens
<class 'list'>        57519
<class 'NoneType'>     3675
Name: count, dtype: int64

In [17]:
# Create a dictionary and a corpus for the LDA model
print("Creating dictionary and corpus")
dictionary = corpora.Dictionary(df_tags['Tokens'])
corpus = [dictionary.doc2bow(token_list) for token_list in df_tags['Tokens']]

print("Training LDA model")
lda = ldamodel.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)

topics = lda.print_topics(num_words=5)
for topic in topics:
    print(topic)

df_tags

Creating dictionary and corpus
Training LDA model
(0, '0.071*"," + 0.047*"а" + 0.043*"и" + 0.040*"о" + 0.035*"е"')
(1, '0.117*"," + 0.112*"e" + 0.069*"n" + 0.068*"r" + 0.043*"l"')
(2, '0.132*"1" + 0.112*"2" + 0.091*"0" + 0.068*"," + 0.058*"e"')
(3, '0.097*"E" + 0.090*"A" + 0.075*"I" + 0.070*"O" + 0.069*","')
(4, '0.165*"e" + 0.134*"," + 0.107*"n" + 0.107*"r" + 0.084*"l"')


Unnamed: 0_level_0,Unnamed: 1_level_0,Tags_combined,Tokens
Decline,Source,Unnamed: 2_level_1,Unnamed: 3_level_1
0,After,"MsRosieBea,21st birthday,birthday,ring,jewelle...","[M, R, e, B, e, ,, 2, 1, b, r, h, ,, b, r, h, ..."
0,Before,"MsRosieBea,uni work,studying fashion design,fa...","[M, R, e, B, e, ,, u, n, w, r, k, ,, u, n, g, ..."
1,After,"hollow,generationhollow,playthrough,blind play...","[h, l, l, w, ,, g, e, n, e, r, n, h, l, l, w, ..."
1,Before,"hollow,generationhollow,playthrough,blind play...","[h, l, l, w, ,, g, e, n, e, r, n, h, l, l, w, ..."
2,After,,[]
...,...,...,...
36595,Before,"Despacito accordion cover,Fonsi Despacito acco...","[D, e, p, c, c, c, r, n, c, v, e, r, ,, F, n, ..."
36597,After,"Shaper,Clapper,Keith Fenner,Fenner,machine sho...","[S, h, p, e, r, ,, C, l, p, p, e, r, ,, K, e, ..."
36597,Before,"Bridgeport,Stainless Steel Placards,Roller Kit...","[B, r, g, e, p, r, ,, S, n, l, e, S, e, e, l, ..."
36598,After,"Music,beats,instrumental,right beat radio,minn...","[M, u, c, ,, b, e, ,, n, r, u, e, n, l, ,, r, ..."


## Test with a small dataset

In [90]:
df_small = df_tags.head(100)
print(df_small.shape)

(100, 1)


In [91]:
print("Tokenizing and lemmatizing tags")
df_small['Tokens'] = None
for index, row in tqdm(df_small.iterrows(), total=df_small.shape[0]):
    df_small.at[index, 'Tokens'] = preprocess_str(row['Tags_combined'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small['Tokens'] = None


Tokenizing and lemmatizing tags


100%|██████████| 100/100 [00:00<00:00, 211.45it/s]


In [149]:

# Create a dictionary and a corpus for the LDA model
print("Creating dictionary and corpus")
dictionary = corpora.Dictionary(df_small['Tokens'])
corpus = [dictionary.doc2bow(token_list) for token_list in df_small['Tokens']]

print("Training LDA model")
lda = ldamodel.LdaModel(corpus, num_topics=55, id2word=dictionary, passes=15)

topics = lda.print_topics(num_words=9)
for topic in topics:
    print(topic)

df_small

Creating dictionary and corpus
Training LDA model
(17, '0.090*"Fortnite" + 0.063*"fortnite" + 0.032*"pokemon" + 0.029*"Ninja" + 0.020*"voice" + 0.020*"real" + 0.015*"shiny" + 0.014*"montage" + 0.013*"challenge"')
(16, '0.000*"BLACKPINK" + 0.000*"DDU" + 0.000*"DU" + 0.000*"SQUARE" + 0.000*"UP" + 0.000*"brexit" + 0.000*"WW2" + 0.000*"블랙핑크" + 0.000*"Call"')
(54, '0.034*"tip" + 0.026*"guide" + 0.024*"hollow" + 0.023*"generationhollow" + 0.021*"tutorial" + 0.020*"gameplay" + 0.016*"trick" + 0.016*"playthrough" + 0.016*"top"')
(10, '0.049*"reaction" + 0.033*"kpop" + 0.017*"BLACKPINK" + 0.017*"블랙핑크" + 0.017*"blackpink" + 0.017*"Shane" + 0.017*"shane" + 0.011*"house" + 0.011*"FANSIGN"')
(32, '0.020*"impulse" + 0.007*"j" + 0.007*"cole" + 0.007*"drake" + 0.007*"prod" + 0.007*"uzi" + 0.007*"migos" + 0.007*"impulsebeats" + 0.007*"beatz"')
(33, '0.058*"brexit" + 0.033*"james" + 0.032*"o\'brien" + 0.021*"Brexit" + 0.019*"uk" + 0.017*"post" + 0.017*"trump" + 0.014*"caller" + 0.013*"Johnson"')
(45, '0

Unnamed: 0_level_0,Unnamed: 1_level_0,Tags_combined,Tokens,Dominant_Topic,Topic_Probability
Decline,Source,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,After,"MsRosieBea,21st birthday,birthday,ring,jewelle...","[MsRosieBea,21st, birthday, birthday, ring, je...",35,0.399864
0,Before,"MsRosieBea,uni work,studying fashion design,fa...","[MsRosieBea, uni, work, studying, fashion, des...",35,0.986550
1,After,"hollow,generationhollow,playthrough,blind play...","[hollow, generationhollow, playthrough, blind,...",17,0.998292
1,Before,"hollow,generationhollow,playthrough,blind play...","[hollow, generationhollow, playthrough, blind,...",17,0.996498
3,After,"dayz,dayz standalone,.62,update,map,loot,inter...","[dayz, dayz, standalone, .62, update, map, loo...",31,0.899641
...,...,...,...,...,...
55,After,"Brexit,Boris Johnson,PM,Boris,no deal,brexitee...","[Brexit, Boris, Johnson, PM, Boris, deal, brex...",6,0.999234
55,Before,"james o'brien,james o'brien brexit,brexit disa...","[james, o'brien, james, o'brien, brexit, brexi...",6,0.999397
57,After,"base de rap,pista de rap,hip hop instrumental,...","[base, de, rap, pista, de, rap, hip, hop, inst...",16,0.997112
57,Before,"witch house type beat,A$AP rocky type,Suicide ...","[witch, house, type, beat, A, AP, rocky, type,...",16,0.997988


In [150]:
print("Assigning topics to each document")

# Assign the dominant topic to each document
def assign_dominant_topic(tokens, lda_model, dictionary):
    if not tokens or not isinstance(tokens, list):  # Handle empty or invalid tokens
        return None, None
    bow = dictionary.doc2bow(tokens)  # Convert tokens to bag-of-words format
    topic_probs = lda_model.get_document_topics(bow)  # Get topic distribution
    if topic_probs:
        dominant_topic, prob = max(topic_probs, key=lambda x: x[1])  # Most probable topic
        return dominant_topic, prob
    return None, None

df_small['Dominant_Topic'], df_small['Topic_Probability'] = zip(
    *df_small['Tokens'].apply(lambda tokens: assign_dominant_topic(tokens, lda, dictionary))
)

print(df_small.head(20))
df_small.to_csv('df_small_sample.csv', index=False)

Assigning topics to each document
                                                    Tags_combined  \
Decline Source                                                      
0       After   MsRosieBea,21st birthday,birthday,ring,jewelle...   
        Before  MsRosieBea,uni work,studying fashion design,fa...   
1       After   hollow,generationhollow,playthrough,blind play...   
        Before  hollow,generationhollow,playthrough,blind play...   
3       After   dayz,dayz standalone,.62,update,map,loot,inter...   
        Before  dayz,dayz standalone,.62,update,map,loot,inter...   
4       After   Halloween,spooky,scary,creepy,Bogeyman,Breyer,...   
        Before  HoneyheartsC,MyFroggyStuff,Infinity Breyers,da...   
5       After   Triple Entray,Phora,Drake,Eminem,Justin Bieber...   
        Before  Hip hop,Triple Entray,Drake,Eminem,Logic,Phora...   
7       After   Yasha,Yasha Jeltuhin,Cyr,Cyr Wheel,Circus,Akro...   
        Before  Yasha,Yasha Jeltuhin,Akrosphere,Circus,Jen Mac...   


We want to create two columns [Topic_change] and [Tokens_change] to determine if there is a difference between the tags before and after a decline. A change in tokens is used for granular analysis while a change is topics is more appropriate for detecting higher-level patterns.

In [151]:
df_small = df_small.dropna(subset=['Tokens', 'Dominant_Topic'])

# Pivot the dataset, keeping 'Dominant_topic' in a separate column
df_pivot = df_small.pivot_table(
    index='Decline',  # The index will be based on the 'Decline'
    columns='Source',  # We are splitting by 'Source' (Before and After)
    values=['Tokens', 'Dominant_Topic'],  # We want both Tokens and Dominant_topic in the pivoted table
    aggfunc={
        'Tokens': lambda x: ' '.join([item for sublist in x for item in sublist]),  # Flatten and join the tokens
        'Dominant_Topic': lambda x: x.mode()[0]  # Get the most frequent dominant topic (mode)
    }
)

df_pivot.head()

Unnamed: 0_level_0,Dominant_Topic,Dominant_Topic,Tokens,Tokens
Source,After,Before,After,Before
Decline,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,40.0,33.0,"MsRosieBea,21st birthday birthday ring jewelle...",MsRosieBea uni work studying fashion design fa...
1,54.0,54.0,hollow generationhollow playthrough blind play...,hollow generationhollow playthrough blind play...
3,21.0,21.0,dayz dayz standalone .62 update map loot inter...,dayz dayz standalone .62 update map loot inter...
4,35.0,41.0,Halloween spooky scary creepy Bogeyman Breyer ...,HoneyheartsC MyFroggyStuff Infinity Breyers da...
5,36.0,36.0,Triple Entray Phora Drake Eminem Justin Bieber...,Hip hop Triple Entray Drake Eminem Logic Phora...


In [152]:
def token_change(tokens_before, tokens_after):
    # Ensure tokens are lists and not NaN or float
    if not isinstance(tokens_before, list):
        tokens_before = []
    if not isinstance(tokens_after, list):
        tokens_after = []
        
    # Compare sets of tokens
    set_before = set(tokens_before)
    set_after = set(tokens_after)
    return set_before != set_after  # Change if the sets are not identical

# Apply the token change function to compare the tokens before and after for each decline
df_pivot['Token_Change'] = df_pivot.apply(
    lambda row: token_change(row[('Tokens', 'Before')], row[('Tokens', 'After')]), axis=1)

# Assuming 'Dominant_topic' columns are available for 'Before' and 'After'
df_pivot['Topic_Change'] = df_pivot.apply(
    lambda row: row[('Dominant_Topic', 'Before')] != row[('Dominant_Topic', 'After')], axis=1)

# Verify the results
df_pivot.head()

Unnamed: 0_level_0,Dominant_Topic,Dominant_Topic,Tokens,Tokens,Token_Change,Topic_Change
Source,After,Before,After,Before,Unnamed: 5_level_1,Unnamed: 6_level_1
Decline,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
0,40.0,33.0,"MsRosieBea,21st birthday birthday ring jewelle...",MsRosieBea uni work studying fashion design fa...,False,True
1,54.0,54.0,hollow generationhollow playthrough blind play...,hollow generationhollow playthrough blind play...,False,False
3,21.0,21.0,dayz dayz standalone .62 update map loot inter...,dayz dayz standalone .62 update map loot inter...,False,False
4,35.0,41.0,Halloween spooky scary creepy Bogeyman Breyer ...,HoneyheartsC MyFroggyStuff Infinity Breyers da...,False,True
5,36.0,36.0,Triple Entray Phora Drake Eminem Justin Bieber...,Hip hop Triple Entray Drake Eminem Logic Phora...,False,False


In [154]:
from gensim.models import CoherenceModel

coherence_model_lda = CoherenceModel(model=lda, texts=df_small['Tokens'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(f'Coherence Score: {coherence_lda}') # 0.7475 with 55 topics, numwords = 9


Coherence Score: 0.7474957968694171
