In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from gensim.models import ldamodel
from tqdm import tqdm
from gensim import corpora
from src.utils.recovery_analysis_utils import str_to_list

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

decline_events = pd.read_csv('data/sampled_decline_events_with_videos.csv')
videos = pd.read_csv('data/videos_around_declines.csv')

decline_events['Videos_before'] = decline_events['Videos_before'].apply(str_to_list)
decline_events['Videos_after'] = decline_events['Videos_after'].apply(str_to_list)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Create a data_frame with 2 index: the index of the decline and the source (before and after)

df_before = decline_events[['Videos_before']].explode('Videos_before')
df_before['Source'] = 'Before'
df_before = df_before.rename(columns={'Videos_before': 'Video'})

df_after = decline_events[['Videos_after']].explode('Videos_after')
df_after['Source'] = 'After'
df_after = df_after.rename(columns={'Videos_after': 'Video'})

df_tags = pd.concat([df_before, df_after], axis=0).reset_index().rename(columns={'index': 'Decline'})
df_tags = df_tags.set_index(['Decline', 'Source'])

df_tags.sort_values(by = ['Decline', 'Source'])
df_tags = df_tags.dropna()

In [5]:
# Map to obtain the tags of all videos for each video before and after decline
df_tags['Tags'] = df_tags['Video'].map(lambda video: videos.loc[video, 'tags'] if video in videos.index else None)
df_tags

Unnamed: 0_level_0,Unnamed: 1_level_0,Video,Tags
Decline,Source,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Before,1684989,MsRosieBea
0,Before,1684990,"MsRosieBea,primark haul,primark haul august,pr..."
0,Before,1684991,MsRosieBea
0,Before,1684992,MsRosieBea
0,Before,1684993,"MsRosieBea,red lip,get ready with me"
...,...,...,...
36598,After,1889699,"Music,beats,instrumental,right beat radio,stra..."
36598,After,1889700,"Music,beats,instrumental,right beat radio,late..."
36598,After,1889701,"Music,beats,instrumental,right beat radio,lofi..."
36598,After,1889702,"Music,beats,instrumental,right beat radio,mell..."


In [6]:
# Get for each decline only 2 rows with the tags corresponding to the before and the after, handling NaNs and non-list values
df_tags = df_tags.groupby(['Decline', 'Source'])['Tags'].apply(
    lambda x: list(set([item for sublist in x.dropna() for item in (sublist if isinstance(sublist, list) else [sublist])]))
).reset_index(name='Tags_combined')

df_tags.set_index(['Decline', 'Source'], inplace=True)

# Map the tags to a string, separating them by new lines
df_tags['Tags_combined'] = df_tags['Tags_combined'].map(lambda tags: '\n'.join(tags) if tags else None)

df_tags

Unnamed: 0_level_0,Unnamed: 1_level_0,Tags_combined
Decline,Source,Unnamed: 2_level_1
0,After,"MsRosieBea\nMsRosieBea,back to uni,uni outfits..."
0,Before,"MsRosieBea,OUTFIT DIARIES\nMsRosieBea,red lip,..."
1,After,"hollow,generationhollow,gameplay,review,guide,..."
1,Before,"hollow,generationhollow,paragon,gameplay,alpha..."
2,After,
...,...,...
36595,Before,"Despacito accordion cover,Fonsi Despacito acco..."
36597,After,"Babbitt,Babbitt pouring,Keith Fenner,Fenner,ma..."
36597,Before,"Audi,Audi 2.1,Line Bore,Kenax,Line bore Kenax ..."
36598,After,"Music,beats,instrumental,right beat radio,late..."


In [7]:
import string

CASEFOLD = False

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_str(s):
    if not isinstance(s, str) or not s.strip(): # Cases where s = None
        return []
    tokens = word_tokenize(s.lower() if CASEFOLD else s, preserve_line=True)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and word not in string.punctuation]
    return tokens

In [8]:
print("Tokenizing and lemmatizing tags")
df_tags['Tokens'] = None
for index, row in tqdm(df_tags.iterrows(), total=df_tags.shape[0]):
    df_tags.at[index, 'Tokens'] = preprocess_str(row['Tags_combined'])


# Create a dictionary and a corpus for the LDA model
print("Creating dictionary and corpus")
dictionary = corpora.Dictionary(df_tags['Tokens'])
corpus = [dictionary.doc2bow(token_list) for token_list in df_tags['Tokens']]

print("Training LDA model")
lda = ldamodel.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)

topics = lda.print_topics(num_words=5)
for topic in topics:
    print(topic)

df_tags

Tokenizing and lemmatizing tags


100%|██████████| 61194/61194 [18:11<00:00, 56.08it/s]   


Creating dictionary and corpus
Training LDA model
(0, '0.013*"The" + 0.006*"v" + 0.005*"\'s" + 0.005*"Game" + 0.005*"New"')
(1, '0.008*"video" + 0.007*"vlog" + 0.007*"makeup" + 0.006*"family" + 0.006*"music"')
(2, '0.015*"news" + 0.013*"19" + 0.012*"rangoli" + 0.012*"18" + 0.010*"v"')
(3, '0.025*"2" + 0.022*"game" + 0.014*"gameplay" + 0.013*"pokemon" + 0.009*"play"')
(4, '0.026*"fortnite" + 0.012*"5" + 0.010*"game" + 0.010*"best" + 0.009*"gta"')


Unnamed: 0_level_0,Unnamed: 1_level_0,Tags_combined,Tokens
Decline,Source,Unnamed: 2_level_1,Unnamed: 3_level_1
0,After,"MsRosieBea\nMsRosieBea,back to uni,uni outfits...","[MsRosieBea, MsRosieBea, back, uni, uni, outfi..."
0,Before,"MsRosieBea,OUTFIT DIARIES\nMsRosieBea,red lip,...","[MsRosieBea, OUTFIT, DIARIES, MsRosieBea, red,..."
1,After,"hollow,generationhollow,gameplay,review,guide,...","[hollow, generationhollow, gameplay, review, g..."
1,Before,"hollow,generationhollow,paragon,gameplay,alpha...","[hollow, generationhollow, paragon, gameplay, ..."
2,After,,[]
...,...,...,...
36595,Before,"Despacito accordion cover,Fonsi Despacito acco...","[Despacito, accordion, cover, Fonsi, Despacito..."
36597,After,"Babbitt,Babbitt pouring,Keith Fenner,Fenner,ma...","[Babbitt, Babbitt, pouring, Keith, Fenner, Fen..."
36597,Before,"Audi,Audi 2.1,Line Bore,Kenax,Line bore Kenax ...","[Audi, Audi, 2.1, Line, Bore, Kenax, Line, bor..."
36598,After,"Music,beats,instrumental,right beat radio,late...","[Music, beat, instrumental, right, beat, radio..."


In [9]:
#print(df_tags['Tokens'].head(10))
# Replace None or NaN in Tokens with empty lists
df_tags['Tokens'] = df_tags['Tokens'].apply(
    lambda x: [] if x is None else x
)


# Flatten any nested lists in Tokens
df_tags['Tokens'] = df_tags['Tokens'].apply(
    lambda tokens: [item for sublist in tokens for item in sublist] if any(isinstance(i, list) for i in tokens) else tokens
    if isinstance(tokens, list) else []
)

# Check the cleaned Tokens column
print(df_tags['Tokens'])

Decline  Source
0        After     [MsRosieBea, MsRosieBea, back, uni, uni, outfi...
         Before    [MsRosieBea, OUTFIT, DIARIES, MsRosieBea, red,...
1        After     [hollow, generationhollow, gameplay, review, g...
         Before    [hollow, generationhollow, paragon, gameplay, ...
2        After                                                    []
                                         ...                        
36595    Before    [Despacito, accordion, cover, Fonsi, Despacito...
36597    After     [Babbitt, Babbitt, pouring, Keith, Fenner, Fen...
         Before    [Audi, Audi, 2.1, Line, Bore, Kenax, Line, bor...
36598    After     [Music, beat, instrumental, right, beat, radio...
         Before    [Music, beat, instrumental, right, beat, radio...
Name: Tokens, Length: 61194, dtype: object


In [10]:
df_tags['Tokens'].apply(type).value_counts()

Tokens
<class 'list'>    61194
Name: count, dtype: int64

In [11]:
# Create a dictionary and a corpus for the LDA model
print("Creating dictionary and corpus")
dictionary = corpora.Dictionary(df_tags['Tokens'])
corpus = [dictionary.doc2bow(token_list) for token_list in df_tags['Tokens']]

print("Training LDA model")
lda = ldamodel.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)

topics = lda.print_topics(num_words=5)
for topic in topics:
    print(topic)

df_tags

Creating dictionary and corpus
Training LDA model
(0, '0.011*"video" + 0.011*"music" + 0.009*"beat" + 0.008*"song" + 0.008*"movie"')
(1, '0.020*"2" + 0.020*"game" + 0.017*"fortnite" + 0.016*"gameplay" + 0.009*"pokemon"')
(2, '0.030*"news" + 0.008*"2019" + 0.006*"News" + 0.006*"hindi" + 0.006*"world"')
(3, '0.011*"The" + 0.010*"v" + 0.006*"wwe" + 0.006*"18" + 0.006*"19"')
(4, '0.010*"vlog" + 0.009*"makeup" + 0.009*"family" + 0.006*"funny" + 0.006*"video"')


Unnamed: 0_level_0,Unnamed: 1_level_0,Tags_combined,Tokens
Decline,Source,Unnamed: 2_level_1,Unnamed: 3_level_1
0,After,"MsRosieBea\nMsRosieBea,back to uni,uni outfits...","[MsRosieBea, MsRosieBea, back, uni, uni, outfi..."
0,Before,"MsRosieBea,OUTFIT DIARIES\nMsRosieBea,red lip,...","[MsRosieBea, OUTFIT, DIARIES, MsRosieBea, red,..."
1,After,"hollow,generationhollow,gameplay,review,guide,...","[hollow, generationhollow, gameplay, review, g..."
1,Before,"hollow,generationhollow,paragon,gameplay,alpha...","[hollow, generationhollow, paragon, gameplay, ..."
2,After,,[]
...,...,...,...
36595,Before,"Despacito accordion cover,Fonsi Despacito acco...","[Despacito, accordion, cover, Fonsi, Despacito..."
36597,After,"Babbitt,Babbitt pouring,Keith Fenner,Fenner,ma...","[Babbitt, Babbitt, pouring, Keith, Fenner, Fen..."
36597,Before,"Audi,Audi 2.1,Line Bore,Kenax,Line bore Kenax ...","[Audi, Audi, 2.1, Line, Bore, Kenax, Line, bor..."
36598,After,"Music,beats,instrumental,right beat radio,late...","[Music, beat, instrumental, right, beat, radio..."


## Test with a small dataset

In [12]:
df_small = df_tags.head(100)
print(df_small.shape)

(100, 2)


In [13]:
print("Tokenizing and lemmatizing tags")
df_small['Tokens'] = None
for index, row in tqdm(df_small.iterrows(), total=df_small.shape[0]):
    df_small.at[index, 'Tokens'] = preprocess_str(row['Tags_combined'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small['Tokens'] = None


Tokenizing and lemmatizing tags


100%|██████████| 100/100 [00:01<00:00, 52.11it/s]


In [14]:

# Create a dictionary and a corpus for the LDA model
print("Creating dictionary and corpus")
dictionary = corpora.Dictionary(df_small['Tokens'])
corpus = [dictionary.doc2bow(token_list) for token_list in df_small['Tokens']]

print("Training LDA model")
lda = ldamodel.LdaModel(corpus, num_topics=55, id2word=dictionary, passes=15)

topics = lda.print_topics(num_words=9)
for topic in topics:
    print(topic)

df_small

Creating dictionary and corpus
Training LDA model
(37, '0.031*"sticker" + 0.027*"stick" + 0.027*"planner" + 0.027*"etsy" + 0.018*"plan" + 0.018*"shop" + 0.013*"condren" + 0.013*"planning" + 0.013*"erin"')
(17, '0.000*"funny" + 0.000*"moment" + 0.000*"best" + 0.000*"new" + 0.000*"van" + 0.000*"gameplay" + 0.000*"base" + 0.000*"let" + 0.000*"survival"')
(24, '0.000*"Wander" + 0.000*"van" + 0.000*"life" + 0.000*"Kelsey" + 0.000*"house" + 0.000*"juniper" + 0.000*"baby" + 0.000*"Corbin" + 0.000*"rv"')
(44, '0.081*"2017" + 0.066*"hees" + 0.055*"cusub" + 0.040*"somali" + 0.035*"lafoole" + 0.030*"niiko" + 0.025*"indho" + 0.023*"nasteexo" + 0.020*"ciida"')
(28, '0.049*"fashion" + 0.036*"lookbook" + 0.025*"haul" + 0.021*"summer" + 0.018*"winter" + 0.018*"fall" + 0.017*"indian" + 0.015*"sarojini" + 0.015*"party"')
(10, '0.023*"let" + 0.021*"best" + 0.020*"moment" + 0.019*"funny" + 0.015*"gameplay" + 0.014*"play" + 0.012*"\'s" + 0.011*"new" + 0.010*"player"')
(21, '0.054*"teacher" + 0.047*"hairsty

Unnamed: 0_level_0,Unnamed: 1_level_0,Tags_combined,Tokens
Decline,Source,Unnamed: 2_level_1,Unnamed: 3_level_1
0,After,"MsRosieBea\nMsRosieBea,back to uni,uni outfits...","[MsRosieBea, MsRosieBea, back, uni, uni, outfi..."
0,Before,"MsRosieBea,OUTFIT DIARIES\nMsRosieBea,red lip,...","[MsRosieBea, OUTFIT, DIARIES, MsRosieBea, red,..."
1,After,"hollow,generationhollow,gameplay,review,guide,...","[hollow, generationhollow, gameplay, review, g..."
1,Before,"hollow,generationhollow,paragon,gameplay,alpha...","[hollow, generationhollow, paragon, gameplay, ..."
2,After,,[]
...,...,...,...
55,After,"Brexit,Boris Johnson,pm Johnson,Boris,Quitting...","[Brexit, Boris, Johnson, pm, Johnson, Boris, Q..."
55,Before,"james o'brien,theresa may,pm may,james o'brien...","[james, o'brien, theresa, may, pm, may, james,..."
57,After,"hip hop instrumental,hip hop instrumentals,jay...","[hip, hop, instrumental, hip, hop, instrumenta..."
57,Before,"trap beat,trap nation,n u a g e s closer,n u a...","[trap, beat, trap, nation, n, u, g, e, closer,..."


In [15]:
print("Assigning topics to each document")

# Assign the dominant topic to each document
def assign_dominant_topic(tokens, lda_model, dictionary):
    if not tokens or not isinstance(tokens, list):  # Handle empty or invalid tokens
        return None, None
    bow = dictionary.doc2bow(tokens)  # Convert tokens to bag-of-words format
    topic_probs = lda_model.get_document_topics(bow)  # Get topic distribution
    if topic_probs:
        dominant_topic, prob = max(topic_probs, key=lambda x: x[1])  # Most probable topic
        return dominant_topic, prob
    return None, None

df_small['Dominant_Topic'], df_small['Topic_Probability'] = zip(
    *df_small['Tokens'].apply(lambda tokens: assign_dominant_topic(tokens, lda, dictionary))
)

print(df_small.head(20))
df_small.to_csv('df_small_sample.csv', index=False)

Assigning topics to each document
                                                    Tags_combined  \
Decline Source                                                      
0       After   MsRosieBea\nMsRosieBea,back to uni,uni outfits...   
        Before  MsRosieBea,OUTFIT DIARIES\nMsRosieBea,red lip,...   
1       After   hollow,generationhollow,gameplay,review,guide,...   
        Before  hollow,generationhollow,paragon,gameplay,alpha...   
2       After                                                None   
        Before                                               None   
3       After   rust survival,living in a submarine,sub,base d...   
        Before  Player unkowns battleground,battleground,battl...   
4       After   game,PC game,computer,online,MORPEG,adventure,...   
        Before  experience,emotional,breyerfest,breyers,horses...   
5       After   Triple Entray,Phora,Drake,Eminem,Justin Bieber...   
        Before  Hip hop,Triple Entray,Drake,Eminem,Logic,Phora...   


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small['Dominant_Topic'], df_small['Topic_Probability'] = zip(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small['Dominant_Topic'], df_small['Topic_Probability'] = zip(


We want to create two columns [Topic_change] and [Tokens_change] to determine if there is a difference between the tags before and after a decline. A change in tokens is used for granular analysis while a change is topics is more appropriate for detecting higher-level patterns.

In [16]:
df_small = df_small.dropna(subset=['Tokens', 'Dominant_Topic'])

# Pivot the dataset, keeping 'Dominant_topic' in a separate column
df_pivot = df_small.pivot_table(
    index='Decline',  # The index will be based on the 'Decline'
    columns='Source',  # We are splitting by 'Source' (Before and After)
    values=['Tokens', 'Dominant_Topic'],  # We want both Tokens and Dominant_topic in the pivoted table
    aggfunc={
        'Tokens': lambda x: ' '.join([item for sublist in x for item in sublist]),  # Flatten and join the tokens
        'Dominant_Topic': lambda x: x.mode()[0]  # Get the most frequent dominant topic (mode)
    }
)

df_pivot.head()

Unnamed: 0_level_0,Dominant_Topic,Dominant_Topic,Tokens,Tokens
Source,After,Before,After,Before
Decline,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,20.0,20.0,MsRosieBea MsRosieBea back uni uni outfit MsRo...,MsRosieBea OUTFIT DIARIES MsRosieBea red lip g...
1,54.0,54.0,hollow generationhollow gameplay review guide ...,hollow generationhollow paragon gameplay alpha...
3,10.0,10.0,rust survival living submarine sub base design...,Player unkowns battleground battleground battl...
4,34.0,34.0,game PC game computer online MORPEG adventure ...,experience emotional breyerfest breyers horse ...
5,4.0,4.0,Triple Entray Phora Drake Eminem Justin Bieber...,Hip hop Triple Entray Drake Eminem Logic Phora...


In [17]:
def token_change(tokens_before, tokens_after):
    # Ensure tokens are lists and not NaN or float
    if not isinstance(tokens_before, list):
        tokens_before = []
    if not isinstance(tokens_after, list):
        tokens_after = []
        
    # Compare sets of tokens
    set_before = set(tokens_before)
    set_after = set(tokens_after)
    return set_before != set_after  # Change if the sets are not identical

# Apply the token change function to compare the tokens before and after for each decline
df_pivot['Token_Change'] = df_pivot.apply(
    lambda row: token_change(row[('Tokens', 'Before')], row[('Tokens', 'After')]), axis=1)

# Assuming 'Dominant_topic' columns are available for 'Before' and 'After'
df_pivot['Topic_Change'] = df_pivot.apply(
    lambda row: row[('Dominant_Topic', 'Before')] != row[('Dominant_Topic', 'After')], axis=1)

# Verify the results
df_pivot.head()

Unnamed: 0_level_0,Dominant_Topic,Dominant_Topic,Tokens,Tokens,Token_Change,Topic_Change
Source,After,Before,After,Before,Unnamed: 5_level_1,Unnamed: 6_level_1
Decline,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
0,20.0,20.0,MsRosieBea MsRosieBea back uni uni outfit MsRo...,MsRosieBea OUTFIT DIARIES MsRosieBea red lip g...,False,False
1,54.0,54.0,hollow generationhollow gameplay review guide ...,hollow generationhollow paragon gameplay alpha...,False,False
3,10.0,10.0,rust survival living submarine sub base design...,Player unkowns battleground battleground battl...,False,False
4,34.0,34.0,game PC game computer online MORPEG adventure ...,experience emotional breyerfest breyers horse ...,False,False
5,4.0,4.0,Triple Entray Phora Drake Eminem Justin Bieber...,Hip hop Triple Entray Drake Eminem Logic Phora...,False,False


In [18]:
from gensim.models import CoherenceModel

coherence_model_lda = CoherenceModel(model=lda, texts=df_small['Tokens'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(f'Coherence Score: {coherence_lda}') # 0.7475 with 55 topics, numwords = 9


Coherence Score: 0.7055009033053095
