In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from gensim.models import ldamodel
from tqdm import tqdm
from gensim import corpora
from src.utils.recovery_analysis_utils import str_to_list

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

decline_events = pd.read_csv('data/sampled_decline_events_with_videos.csv')
videos = pd.read_csv('data/videos_around_declines.csv')

decline_events['Videos_before'] = decline_events['Videos_before'].apply(str_to_list)
decline_events['Videos_after'] = decline_events['Videos_after'].apply(str_to_list)

In [2]:
# Create a data_frame with 2 index: the index of the decline and the source (before and after)

df_before = decline_events[['Videos_before']].explode('Videos_before')
df_before['Source'] = 'Before'
df_before = df_before.rename(columns={'Videos_before': 'Video'})

df_after = decline_events[['Videos_after']].explode('Videos_after')
df_after['Source'] = 'After'
df_after = df_after.rename(columns={'Videos_after': 'Video'})

df_tags = pd.concat([df_before, df_after], axis=0).reset_index().rename(columns={'index': 'Decline'})
df_tags = df_tags.set_index(['Decline', 'Source'])

df_tags.sort_values(by = ['Decline', 'Source'])
df_tags = df_tags.dropna()

In [3]:
# Map to obtain the tags of all videos for each video before and after decline
df_tags['Tags'] = df_tags['Video'].map(lambda video: videos.loc[video, 'tags'] if video in videos.index else None)
df_tags

Unnamed: 0_level_0,Unnamed: 1_level_0,Video,Tags
Decline,Source,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Before,1684989,MsRosieBea
0,Before,1684990,"MsRosieBea,primark haul,primark haul august,pr..."
0,Before,1684991,MsRosieBea
0,Before,1684992,MsRosieBea
0,Before,1684993,"MsRosieBea,red lip,get ready with me"
...,...,...,...
36598,After,1889699,"Music,beats,instrumental,right beat radio,stra..."
36598,After,1889700,"Music,beats,instrumental,right beat radio,late..."
36598,After,1889701,"Music,beats,instrumental,right beat radio,lofi..."
36598,After,1889702,"Music,beats,instrumental,right beat radio,mell..."


In [4]:
# Get for each decline only 2 rows with the tags corresponding to the before and the after, handling NaNs and non-list values
df_tags = df_tags.groupby(['Decline', 'Source'])['Tags'].apply(
    lambda x: list(set([item for sublist in x.dropna() for item in (sublist if isinstance(sublist, list) else [sublist])]))
).reset_index(name='Tags_combined')

df_tags.set_index(['Decline', 'Source'], inplace=True)

# Map the tags to a string, separating them by new lines
df_tags['Tags_combined'] = df_tags['Tags_combined'].map(lambda tags: '\n'.join(tags) if tags else None)

df_tags

Unnamed: 0_level_0,Unnamed: 1_level_0,Tags_combined
Decline,Source,Unnamed: 2_level_1
0,After,"MsRosieBea,21st birthday,birthday,ring,jewelle..."
0,Before,"MsRosieBea,uni work,studying fashion design,fa..."
1,After,"hollow,generationhollow,playthrough,blind play..."
1,Before,"hollow,generationhollow,playthrough,blind play..."
2,After,
...,...,...
36595,Before,"Despacito accordion cover,Fonsi Despacito acco..."
36597,After,"Shaper,Clapper,Keith Fenner,Fenner,machine sho..."
36597,Before,"Bridgeport,Stainless Steel Placards,Roller Kit..."
36598,After,"Music,beats,instrumental,right beat radio,minn..."


In [29]:
CASEFOLD = False

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Tokenize and lemmatize the tags, removing stop words and punctuation
def preprocess_str(s):
    if not s:
        return s

    tokens = word_tokenize(s.lower() if CASEFOLD else s, language='english', preserve_line=True)
    tokens = [[t for t in token_list if t not in stop_words] for token_list in tokens]
    tokens = [[lemmatizer.lemmatize(t) for t in token_list] for token_list in tokens]

    return tokens


def preprocess_str(s):
    if not isinstance(s, str) or not s.strip():
        return []  # Return an empty list for invalid inputs
    tokens = word_tokenize(s.lower() if CASEFOLD else s, preserve_line=True)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return tokens

In [5]:
print("Tokenizing and lemmatizing tags")
df_tags['Tokens'] = None
for index, row in tqdm(df_tags.iterrows(), total=df_tags.shape[0]):
    df_tags.at[index, 'Tokens'] = preprocess_str(row['Tags_combined'])


# Create a dictionary and a corpus for the LDA model
print("Creating dictionary and corpus")
dictionary = corpora.Dictionary(df_tags['Tokens'])
corpus = [dictionary.doc2bow(token_list) for token_list in df_tags['Tokens']]

print("Training LDA model")
lda = ldamodel.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)

topics = lda.print_topics(num_words=5)
for topic in topics:
    print(topic)

df_tags

Tokenizing and lemmatizing tags


100%|██████████| 61194/61194 [27:20<00:00, 37.30it/s]  


Creating dictionary and corpus


TypeError: decoding to str: need a bytes-like object, list found

In [18]:
#print(df_tags['Tokens'].head(10))


# PROBLEM: The lists are embedded, only need one big list per double index (decline + source)
# The flattenig should work but need to remove the rows that have None

# Replace None or NaN in Tokens with empty lists
df_tags['Tokens'] = df_tags['Tokens'].apply(
    lambda x: [] if x is None else x
)


# Flatten any nested lists in Tokens
df_tags['Tokens'] = df_tags['Tokens'].apply(
    lambda tokens: [item for sublist in tokens for item in sublist] if any(isinstance(i, list) for i in tokens) else tokens
    if isinstance(tokens, list) else []
)

# Check the cleaned Tokens column
print(df_tags['Tokens'])

Decline  Source
0        After     [M, R, e, B, e, ,, 2, 1, b, r, h, ,, b, r, h, ...
         Before    [M, R, e, B, e, ,, u, n, w, r, k, ,, u, n, g, ...
1        After     [h, l, l, w, ,, g, e, n, e, r, n, h, l, l, w, ...
         Before    [h, l, l, w, ,, g, e, n, e, r, n, h, l, l, w, ...
2        After                                                    []
                                         ...                        
36595    Before    [D, e, p, c, c, c, r, n, c, v, e, r, ,, F, n, ...
36597    After     [S, h, p, e, r, ,, C, l, p, p, e, r, ,, K, e, ...
         Before    [B, r, g, e, p, r, ,, S, n, l, e, S, e, e, l, ...
36598    After     [M, u, c, ,, b, e, ,, n, r, u, e, n, l, ,, r, ...
         Before    [M, u, c, ,, b, e, ,, n, r, u, e, n, l, ,, r, ...
Name: Tokens, Length: 61194, dtype: object


In [15]:
df_tags['Tokens'].apply(type).value_counts()

Tokens
<class 'list'>        57519
<class 'NoneType'>     3675
Name: count, dtype: int64

In [17]:
# Create a dictionary and a corpus for the LDA model
print("Creating dictionary and corpus")
dictionary = corpora.Dictionary(df_tags['Tokens'])
corpus = [dictionary.doc2bow(token_list) for token_list in df_tags['Tokens']]

print("Training LDA model")
lda = ldamodel.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)

topics = lda.print_topics(num_words=5)
for topic in topics:
    print(topic)

df_tags

Creating dictionary and corpus
Training LDA model
(0, '0.071*"," + 0.047*"а" + 0.043*"и" + 0.040*"о" + 0.035*"е"')
(1, '0.117*"," + 0.112*"e" + 0.069*"n" + 0.068*"r" + 0.043*"l"')
(2, '0.132*"1" + 0.112*"2" + 0.091*"0" + 0.068*"," + 0.058*"e"')
(3, '0.097*"E" + 0.090*"A" + 0.075*"I" + 0.070*"O" + 0.069*","')
(4, '0.165*"e" + 0.134*"," + 0.107*"n" + 0.107*"r" + 0.084*"l"')


Unnamed: 0_level_0,Unnamed: 1_level_0,Tags_combined,Tokens
Decline,Source,Unnamed: 2_level_1,Unnamed: 3_level_1
0,After,"MsRosieBea,21st birthday,birthday,ring,jewelle...","[M, R, e, B, e, ,, 2, 1, b, r, h, ,, b, r, h, ..."
0,Before,"MsRosieBea,uni work,studying fashion design,fa...","[M, R, e, B, e, ,, u, n, w, r, k, ,, u, n, g, ..."
1,After,"hollow,generationhollow,playthrough,blind play...","[h, l, l, w, ,, g, e, n, e, r, n, h, l, l, w, ..."
1,Before,"hollow,generationhollow,playthrough,blind play...","[h, l, l, w, ,, g, e, n, e, r, n, h, l, l, w, ..."
2,After,,[]
...,...,...,...
36595,Before,"Despacito accordion cover,Fonsi Despacito acco...","[D, e, p, c, c, c, r, n, c, v, e, r, ,, F, n, ..."
36597,After,"Shaper,Clapper,Keith Fenner,Fenner,machine sho...","[S, h, p, e, r, ,, C, l, p, p, e, r, ,, K, e, ..."
36597,Before,"Bridgeport,Stainless Steel Placards,Roller Kit...","[B, r, g, e, p, r, ,, S, n, l, e, S, e, e, l, ..."
36598,After,"Music,beats,instrumental,right beat radio,minn...","[M, u, c, ,, b, e, ,, n, r, u, e, n, l, ,, r, ..."


## Test of the preprocessing with a small dataset

In [24]:
df_small = df_tags.head(100)
print(df_small.shape)

(100, 2)
                                                    Tags_combined  \
Decline Source                                                      
0       After   MsRosieBea,21st birthday,birthday,ring,jewelle...   
        Before  MsRosieBea,uni work,studying fashion design,fa...   
1       After   hollow,generationhollow,playthrough,blind play...   
        Before  hollow,generationhollow,playthrough,blind play...   
2       After                                                None   

                                                           Tokens  
Decline Source                                                     
0       After   [M, R, e, B, e, ,, 2, 1, b, r, h, ,, b, r, h, ...  
        Before  [M, R, e, B, e, ,, u, n, w, r, k, ,, u, n, g, ...  
1       After   [h, l, l, w, ,, g, e, n, e, r, n, h, l, l, w, ...  
        Before  [h, l, l, w, ,, g, e, n, e, r, n, h, l, l, w, ...  
2       After                                                  []  


In [30]:
print("Tokenizing and lemmatizing tags")
df_small['Tokens'] = None
for index, row in tqdm(df_small.iterrows(), total=df_small.shape[0]):
    df_small.at[index, 'Tokens'] = preprocess_str(row['Tags_combined'])


# Create a dictionary and a corpus for the LDA model
print("Creating dictionary and corpus")
dictionary = corpora.Dictionary(df_small['Tokens'])
corpus = [dictionary.doc2bow(token_list) for token_list in df_small['Tokens']]

print("Training LDA model")
lda = ldamodel.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)

topics = lda.print_topics(num_words=5)
for topic in topics:
    print(topic)

df_small

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small['Tokens'] = None


Tokenizing and lemmatizing tags


100%|██████████| 100/100 [00:01<00:00, 94.72it/s]

Creating dictionary and corpus





TypeError: 'NoneType' object is not iterable

In [28]:
df_small['Tokens'].apply(type).value_counts()

Tokens
<class 'list'>        86
<class 'NoneType'>    14
Name: count, dtype: int64