In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import decomposition
import re
import string

videos_data = pd.read_csv('data/videos_around_declines.csv')

video_tags = videos_data['tags'].astype(str)

video_tags

0          video,games,retrogamer3,ed,findlay,Scam,Steam,...
1          video,games,retrogamer3,ed,findlay,Trump,Ameri...
2          video,games,retrogamer3,ed,findlay,America's R...
3                                 MTG Arena War of the Spark
4          video,games,retrogamer3,ed,findlay,Mpow,Headph...
                                 ...                        
1905541    BJP,Bharatiya Janata Party,BJP videos,Yuva TV,...
1905542    BJP,Bharatiya Janata Party,BJP videos,Yuva TV,...
1905543    BJP,Bharatiya Janata Party,BJP videos,Yuva TV,...
1905544    BJP,Bharatiya Janata Party,BJP videos,Yuva TV,...
1905545    BJP,Bharatiya Janata Party,BJP videos,Yuva TV,...
Name: tags, Length: 1905546, dtype: object

In [24]:
lemmatizer = WordNetLemmatizer()
english_stopwords = stopwords.words('english')

def clean_tag(s):
    without_punctuation = re.sub(f'[{re.escape(string.punctuation)}]', '', s.lower().replace(',', ' '))
    without_stopwords = " ".join([word for word in without_punctuation.split() if word not in english_stopwords])
    lemmatized = lemmatizer.lemmatize(without_stopwords)
    return lemmatized

unique_tags = video_tags.apply(lambda x: clean_tag(x)).explode().unique()

# Vectorize the tags
tfv = TfidfVectorizer()
vectorized_tags = tfv.fit_transform(unique_tags)

# Perform SVD on the tags to get the topics
svd = decomposition.TruncatedSVD(n_components=10)
tags_svd = svd.fit_transform(vectorized_tags)

# Get the topic scores
topic_scores = dict(
    zip(
        tfv.get_feature_names_out(),
        tags_svd[0]
    )
)

topic_output = sorted(
    topic_scores, key=topic_scores.get, reverse=True
)

# Print the topics in order of importance
topic_output

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nathangromb/nltk_data...


['000',
 '00',
 '00000000000',
 '000001',
 '00000000',
 '000000000',
 '00000',
 '0000000',
 '000000',
 '0000']