In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from gensim.models import ldamodel
from tqdm import tqdm
from gensim import corpora
from src.utils.recovery_analysis_utils import str_to_list

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

decline_events = pd.read_csv('data/sampled_decline_events_with_videos.csv')
videos = pd.read_csv('data/videos_around_declines.csv')

decline_events['Videos_before'] = decline_events['Videos_before'].apply(str_to_list)
decline_events['Videos_after'] = decline_events['Videos_after'].apply(str_to_list)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
# Create a data_frame with 2 index: the index of the decline and the source (before and after)

df_before = decline_events[['Videos_before']].explode('Videos_before')
df_before['Source'] = 'Before'
df_before = df_before.rename(columns={'Videos_before': 'Video'})

df_after = decline_events[['Videos_after']].explode('Videos_after')
df_after['Source'] = 'After'
df_after = df_after.rename(columns={'Videos_after': 'Video'})

df_tags = pd.concat([df_before, df_after], axis=0).reset_index().rename(columns={'index': 'Decline'})
df_tags = df_tags.set_index(['Decline', 'Source'])

df_tags.sort_values(by = ['Decline', 'Source'])
df_tags = df_tags.dropna()

In [12]:
# Map to obtain the tags of all videos for each video before and after decline
df_tags['Tags'] = df_tags['Video'].map(lambda video: videos.loc[video, 'tags'] if video in videos.index else None)
df_tags

Unnamed: 0_level_0,Unnamed: 1_level_0,Video,Tags
Decline,Source,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Before,1684989,MsRosieBea
0,Before,1684990,"MsRosieBea,primark haul,primark haul august,pr..."
0,Before,1684991,MsRosieBea
0,Before,1684992,MsRosieBea
0,Before,1684993,"MsRosieBea,red lip,get ready with me"
...,...,...,...
36598,After,1889699,"Music,beats,instrumental,right beat radio,stra..."
36598,After,1889700,"Music,beats,instrumental,right beat radio,late..."
36598,After,1889701,"Music,beats,instrumental,right beat radio,lofi..."
36598,After,1889702,"Music,beats,instrumental,right beat radio,mell..."


In [13]:
# Get for each decline only 2 rows with the tags corresponding to the before and the after, handling NaNs and non-list values
df_tags = df_tags.groupby(['Decline', 'Source'])['Tags'].apply(
    lambda x: list(set([item for sublist in x.dropna() for item in (sublist if isinstance(sublist, list) else [sublist])]))
).reset_index(name='Tags_combined')

df_tags.set_index(['Decline', 'Source'], inplace=True)

# Map the tags to a string, separating them by new lines
df_tags['Tags_combined'] = df_tags['Tags_combined'].map(lambda tags: '\n'.join(tags) if tags else None)

df_tags

Unnamed: 0_level_0,Unnamed: 1_level_0,Tags_combined
Decline,Source,Unnamed: 2_level_1
0,After,"MsRosieBea,back to uni,uni outfits\nMsRosieBea..."
0,Before,"MsRosieBea,fashion intern,what to wear to a fa..."
1,After,"hollow,generationhollow,playthrough,blind play..."
1,Before,"hollow,generationhollow,playthrough,blind play..."
2,After,
...,...,...
36595,Before,"Despacito accordion cover,Fonsi Despacito acco..."
36597,After,"Keith Fenner,Fenner,machine shop,Turn WrightMa..."
36597,Before,"Wilton,Belt sanders,Keith Fenner,Fenner,machin..."
36598,After,"Music,beats,instrumental,right beat radio,chil..."


In [14]:
CASEFOLD = False

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Tokenize and lemmatize the tags, removing stop words and punctuation
def preprocess_str(s):
    if not s:
        return s

    tokens = word_tokenize(s.lower() if CASEFOLD else s, language='english', preserve_line=True)
    tokens = [[t for t in token_list if t not in stop_words] for token_list in tokens]
    tokens = [[lemmatizer.lemmatize(t) for t in token_list] for token_list in tokens]

    return tokens

print("Tokenizing and lemmatizing tags")
df_tags['Tokens'] = None
for index, row in tqdm(df_tags.iterrows(), total=df_tags.shape[0]):
    df_tags.at[index, 'Tokens'] = preprocess_str(row['Tags_combined'])

# Create a dictionary and a corpus for the LDA model
print("Creating dictionary and corpus")
dictionary = corpora.Dictionary(df_tags['Tokens'])
corpus = [dictionary.doc2bow(token_list) for token_list in df_tags['Tokens']]

print("Training LDA model")
lda = ldamodel.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)

topics = lda.print_topics(num_words=5)
for topic in topics:
    print(topic)

df_tags

Tokenizing and lemmatizing tags


100%|██████████| 61194/61194 [13:11<00:00, 77.29it/s]  


Creating dictionary and corpus


TypeError: decoding to str: need a bytes-like object, list found

In [20]:
print(df_tags['Tokens'].head(10))


# PROBLEM: The lists are embedded, only need one big list per double index (decline + source)
# The flattenig should work but need to remove the rows that have None

# Replace None or NaN in Tokens with empty lists
df_tags['Tokens'] = df_tags['Tokens'].apply(
    lambda x: [] if pd.isnull(x) or not isinstance(x, list) else x
)

# Flatten any nested lists in Tokens
df_tags['Tokens'] = df_tags['Tokens'].apply(
    lambda tokens: [item for sublist in tokens for item in sublist] if any(isinstance(i, list) for i in tokens) else tokens
    if isinstance(tokens, list) else []
)

# Check the cleaned Tokens column
print(df_tags['Tokens'].head())



Decline  Source
0        After     [[M, R, e, B, e], [,], [b, c, k], [], [u, n], ...
         Before    [[M, R, e, B, e], [,], [f, h, n], [n, e, r, n]...
1        After     [[h, l, l, w], [,], [g, e, n, e, r, n, h, l, l...
         Before    [[h, l, l, w], [,], [g, e, n, e, r, n, h, l, l...
2        After                                                  None
         Before                                                 None
3        After     [[r, u], [,], [g, e, p, l], [,], [u, p, e], [,...
         Before    [[R, u], [,], [r], [,], [e, c], [r], [,], [r, ...
4        After     [[S, p, r], [,], [S, p, r], [R, n, g], [F, r, ...
         Before    [[p, n], [,], [p, n, e], [,], [h, r, e], [,], ...
Name: Tokens, dtype: object


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()