# Latent Dirachlett Allocation Notebook

This notebook contains the LDA analysis that is within our datastory. If you installed the ```environment.yml``` then you will not need to download any additional packages. However if you did not you will need to download the following:

- ```pip install nltk```
- ```pip install gensim```
- ```pip install wordcloud```
- ```pip install spacy```
- ```pip install plotly```

#### Prerequisites

You will need to go to the ```src/data-explore``` notebook to generate our metadata. After doing so make sure you have a folder titled ```metadata_chunks``` within the ```datasets``` parent directory.

- ```datasets/metadata_chunks```

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.colors as mcolors
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'
import numpy as np
import sys 
import matplotlib.ticker as ticker
import os
import glob
import plotly.graph_objects as go 

np.set_printoptions(threshold=sys.maxsize)
pd.set_option('display.max_colwidth', None)

import sys
import spacy
import re
from pprint import pprint

# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.models import CoherenceModel

# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

from wordcloud import WordCloud
from wordcloud import ImageColorGenerator
from wordcloud import STOPWORDS
from wordcloud import WordCloud, STOPWORDS

from collections import Counter

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
nltk.download('wordnet')
nltk.download('omw-1.4')

import pyLDAvis.gensim_models

# user defined variables
gen_data = False
preprocessed = True

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/simonlee/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/simonlee/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
if not preprocessed:
    path = '../datasets/metadata_chunks/'
    file_list = os.listdir(path)

In [3]:
if not preprocessed:
    os.chdir(path)

In [4]:
if not preprocessed:
    #list all csv files only
    print(os.getcwd())
    csv_files = glob.glob('*.csv.gz')

In [5]:
# only reads specific columns since not all information is necessary.
# also only reads in subset of current dataframes since it is super massive
if gen_data:
    df = pd.DataFrame()
    counter = 0

    # you can add more categories if we want to do more analysis. category acts as a filter when reading in the chunks of the json files
    #append all files together
    for file in csv_files:
        df_temp = pd.read_csv(file, usecols = ['description', 'title', 'upload_date'], engine='python')
        df = df.append(df_temp, ignore_index=True)
        counter += 1

        # we are only going to analyze 10,000,000 videos 
        if counter == 50:
            break

In [6]:
if not preprocessed:
    # we want to combine the title and description as one feature so we can do some time series analysis for our textual data
    df["video_info"] = df['title'].astype(str) +": "+ df["description"]
    # drop these columns to conserve space    
    df = df.drop(['title'],  axis=1)
    df = df.drop(['description'], axis=1)

    df['upload_date'] = pd.to_datetime(df['upload_date'], format='%Y-%m-%d').dt.date
    df['upload_date'] = pd.to_datetime(df['upload_date'], format='%Y-%m-%d')

    df['upload_date'] = df['upload_date'].dt.to_period('Y')
    df

In [7]:
def remove_urls(x):
    cleaned_string = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', str(x), flags=re.MULTILINE)
    return cleaned_string

def remove_symbols(x):
    cleaned_string = re.sub(r"[^a-zA-Z0-9]+", ' ', x)
    return cleaned_string 

def unify_whitespaces(x):
    cleaned_string = re.sub(' +', ' ', x)
    return cleaned_string 

def deEmojify(x):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'', x)



In [8]:
if not preprocessed:
    df["video_info"] = df["video_info"].apply(remove_urls)
    df["video_info"] = df["video_info"].apply(remove_symbols)
    df["video_info"] = df["video_info"].apply(unify_whitespaces)
    df["video_info"] = df["video_info"].apply(deEmojify)
    df["video_info"] = df["video_info"].str.lower()

In [9]:
if not preprocessed:
    list_common = ['follow','twitter','facebook','times', 'news', 'journal', 'youtube', 'subscribe', 'channel', 'us', 'cbs', 'knoxnews', 'video', 'new', 'cbsnews', 'com', 'visit',
's', 'one', 'people', 'abc', 'cbs', 'year', 'week', 'day', 'years', 'daily', 'tonight', 'today', 'weeks', 'days', 'said', 'bloomberg', 'snapchat', 'instagram', 'media', 'press'
'government', 'tv', 'home', 'york', 'city', 'u_s', 'follow_us', 'new_york', 'cbs_news', 'cbsnews_com', 'first_look', 'repair_com', 'angry_news', 'evening_news', 'knoxnews_com',
'i_m', 'news_katie', 'he_s', 'bloomberg_news', 'york_times', 'voa_s', 'click_here', 'video_audio', 'view_article', 'times_video', 'that_s', 'part_1', 'part_2', 'please_visit',
'us_twitter', 'abc_australia', 'york_city', 'obama_s', 'in_depth', 'hollywood_tv', 'youtube_com', 'video_report', 'visit_www', 'produced_abc', 'video_at', 'e_mail', 'period_com',
'they_re', 'bloomberg_interview', 'year_old', 'years_old', 'like_us', 'dunya_tv', 'samaa_tv', 'rt_twitter', 'monday_friday', 'today_s', 'what_s', 's_important', 'upload_answer',
'kmbc_s', 'cbs_evening', 'facebook_follow', 'us_facebook', 'visit_wsj', 'wsj_com', 'video_subscribe', 'facebook_join', 'wsj', 'produced', 'channel', 'see', 'talks', 'radio', 'p', 'u',
'covering', 'coverage' ]
    df_news = pd.read_csv('/Users/simonlee/ada-2022-project-chromegoldfish/datasets/news_channels.csv.gz')
    df_news['mainstream news organizations'] = df_news['mainstream news organizations'].str.lower()
    news_list = df_news['mainstream news organizations'].values.tolist()

    # Creating a custom list of stopwords
    customStopwords=list(STOPWORDS) + list_common + news_list

    df["video_info"] = df["video_info"].apply(lambda x: ' '.join([word for word in x.split() if word not in (customStopwords)]))

In [10]:
if not preprocessed:
    df["upload_date"] = df["upload_date"].astype(str)

In [11]:
if not preprocessed: 
    # get all the year-month combinations found from our metadata information
    date_list = df.upload_date.unique()
    date_list = sorted(date_list)

In [12]:
if not preprocessed:
    os.chdir("/Users/simonlee/ada-2022-project-chromegoldfish/datasets")

In [13]:
if not preprocessed:
    from pathlib import Path

    path = './year/FileName.csv.gz' # path to write new data
    # we split all metadata into weeks so we can do a time series analysis per week
    for date in date_list:
        temp = df[df['upload_date'] == date] 
        p = Path(path).parent.joinpath(f"{date}_videos.csv.gz")
        print(p)
        temp.to_csv(p, compression='gzip')
        print(date, "is done")

# If Data is already preprocessed start here

In [14]:
path = '/Users/simonlee/ada-2022-project-chromegoldfish/datasets/year'
file_list = os.listdir(path)

In [15]:
os.chdir(path)

In [16]:
csv_files = glob.glob('*.csv.gz')
csv_files = sorted(csv_files)

In [17]:

df = pd.DataFrame()

# you can add more categories if we want to do more analysis. category acts as a filter when reading in the chunks of the json files
#append all files together
for file in csv_files:
    df_temp = pd.read_csv(file)
    df = df.append(df_temp, ignore_index=True)

  df = df.append(df_temp, ignore_index=True)
  df = df.append(df_temp, ignore_index=True)
  df = df.append(df_temp, ignore_index=True)
  df = df.append(df_temp, ignore_index=True)
  df = df.append(df_temp, ignore_index=True)
  df = df.append(df_temp, ignore_index=True)
  df = df.append(df_temp, ignore_index=True)
  df = df.append(df_temp, ignore_index=True)
  df = df.append(df_temp, ignore_index=True)
  df = df.append(df_temp, ignore_index=True)
  df = df.append(df_temp, ignore_index=True)
  df = df.append(df_temp, ignore_index=True)
  df = df.append(df_temp, ignore_index=True)
  df = df.append(df_temp, ignore_index=True)
  df = df.append(df_temp, ignore_index=True)


In [18]:
df = df.dropna()

In [19]:
print(df.shape)

(4793822, 3)


In [20]:
df = df.sample(n = 2500000)

In [21]:
df['video_info'] = df['video_info'].astype(str)

In [22]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    words = text.split()
    words = [lemmatizer.lemmatize(word,pos='v') for word in words]
    return ' '.join(words)

In [23]:
df['video_info'] = df.video_info.apply(lemmatize_text)

In [24]:
more_stop_words = ['use', 'type', 'free', 'videos', 'copyright', 'live', 'subscribers', 'vs', 'de', 'support', 'th', 'click', 'link', 'will', 'want', 'don', 'hope',
'know', 'check', 'thanks', 'thank', 'free', 'much', 'enjoy', 're', 'share', 'go', 'comment', 'sure', 'watching', 'll', 'let', 'find', 'need', 'every', 
'life', 'will', 'now', 'too', 'first', 'old', 'back', 'part', 'last', 'three', 'still', 'way', 'take', 'young', 'night', 'many', 'well', 'may', 'set', 'will', 'dark', 'unique', 'join',
'free', 'download', 'th', 'utm', 'using', 'watch', 'based', 'size', 'set', 'medium', 'case', 'program', 'led', 'center', 'grow', 'part', 'let', 'plays', 'super', 'mode', 'vs', 'pack', 
'use', 'email', 'affiliate', 'blog', 'gmail', 'pro', 'check', 'free', 'canon', 'show', 'website', 'experience', 'provided', 'release', 'released', 'feat', 'ft', 'support', 'generated', 
'join', 'gb', 'check', 'pro', 'outro', 'updates', 'khan', 'coming', 'best', "soon", 'website', 'salman', 'rv', 'content', 'hd', 'expert', 'promo', 'easy', 'use', 'copyright', 'fair',
'purposes', 'disclaimer', 'please', 'section', 'made', 'rights', 'non', 'reporting', 'intended', 'tips', 'might', 'watch', 'permitted', 'link', 'kevin', 'ntv', 'commons', 'attribution', 
'common', 'macleod', 'co', 'little', 'unboxing', 'top', 'access', 'ref', 'rs', 'now', 'hp', 'air', 'sr', 'network', 'leading', 'cm', 'well', 'subscribers', 'sms', 'dial', 'call', 'full', 'high',
'now', 'enjoy', 'label', 'connected', 'circle', 'bt', 'watch', 'series', 'production', 'th', 'pm', 'live', 'national', 'entertainment', 'scenes', 'scene', 'comedy', 'friday', 'july',
'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday', 'watch', 'official', 'st', 'event', 'latest', 'march', 'action', 'june', 'january', 'february', 'april',
'august', 'september', 'october', 'november', 'december', 'cohh', 'online', 'live', 'full', 'network', 'natural', 'around', 'etc', 'number', 'click', 'technical', 'course', 'analysis',
'live', 'latest', 'business', 'exam', 'test', 'know', 'free', 'watch', 'world', 'start', 'demo', 'grade', 'opinion', 'breaking', 'connect', 'information', 'will', 'press', 'watching',
'keep', 'updates', 'watch', 'public', 'life', 'latest', 'metal', 'brings', 'plus', 'thing', 'best', 'content', 'round', 'official', 'tank', 'build', 'videos', 'premium', 'bin', 'id',
'movie', 'cup', 'cmd', 'webscr', 'hosted', 'add', 'button', 'used', 'parts', 'free', 'wr', 'find', 're', 'looking', 'morning', 'highlight', 'good', 'creed', 'website', 'many', 'fi', 'areas',
'type', 'de', 'la', 'en', 'en', 'el', 'prod', 'un', 'young', 'da', 'del', 'es', 'il', 'los', 'las', 'black', 'al', 'tour', 'course', 'newsx', 'chart', 'mk', 'cricket', 'find', 'para', 'con',
'alive', 'que', 'gonna', 'moments', 'short', 'ets', 'tu', 'hot', 'scary', 'intro', 'hair', 'two', 'work', 'right', 'even', 'make', 'great', 'long', 'help', 'different', 've', 'end', 'side',
'never', 'another', 'place', 'sounds', 'without', 'look', 'found', 'put', 'shows', 'must', 'let', 'make', 'got', 'big', 'things', 'real', 'box', 'really', 'amazing', 'think', 'po', 've', 'come',
'oh','everyone', 'never', 'merch', 'make', 'leave', 'links', 'page', 'forget', 'enjoyed', 'page', 'send', 'intro', 'hit', 'subscribe', 'awesome', 'comments', 'group', 'shop', 'games', 'ps',
'mod', 'playing', 'level', 'dead', 'story', 'final', 'lil', 'hip', 'hop', 'love', 'studio', 'learn', 'dr', 'questions', 'community', 'help', 'guide', 'advice', 'talk', 'work', 'school', 'read',
'contact', 'social', 'tools', 'training', 'financial', 'available', 'links', 'gear', 'buy', 'mm', 'act', 'personal', 'kit', 'small', 'personal', 'criticism', 'balance', 'make', 'light', 'allowance',
'two', 'truth', 'going', 'give', 'age', 'hour', 'present', 'sense', 'born', 'step', 'color', 'tutorial', 'make', 'white', 'powder', 'pink', 'face', 'white', 'blue', 'orange', 
'brown', 'man', 'woman', 'reaction', 'nation', 'says', 'ops', 'county', 'former', 'john', 'david', 'james', 'state', 'mr', 'fight', 'weather', 'court', 'paul', 'yt', 'blast', 'mut', 'chris',
'grand', 'ace', 'trick', 'design', 'farm', 'trove', 'mike', 'jack', 'source', 'mouse', 'revealed', 'rise', 'iron', 'class', 'south', 'east', 'west', 'north', 'chicken', 'pt', 'wall', 'bill'
'make', 'original', 'thomas', 'steve', 'available', 'uploads', 'upload', 'inc', 'stay', 'sam', 'car', 'deep', 'big', 'river', 'mike', 'van', 'speech', 'interior', 'service', 'creative',
'project', 'licensed', 'train', 'mark', 'united', 'motion', 'vol', 'editing', 'background', 'premiere', 'sa', 'page', 'projects', 'effects', 'training', 'ig', 'angeles', 'cc', 'read',
'among', 'stories', 'successful', 'episode', 'playlist', 'party', 'quality', 'walks', 'owned', 'detail', 'magic', 'details', 'globe', 'ltd', 'sri', 'matched', 'match', 'digital', 'draw',
'stay', 'toy', 'exclusive', 'power', 'surprise', 'update', 'pool', 'tuned', 'blind', 'interview', 'interesting', 'coach', 'awards', 'eggs', 'toys', 'corner', 'strength', 'welcome', 'rights',
'right', 'interview', 'left', 'exclusive', 'consider', 'jam', 'tunes', 'forget', 'fantasy', 'help', 'finger', 'posts', 'playlists', 'playlist', 'tv', 'ball', 'core', 'ram', 'ghz', 'gtx',
'auto', 'raw', 'weight', 'stay', 'date', 'dash', 'tin', 'level', 'exams', 'courses', 'ssc', 'current', 'covered', 'ias', 'sales', 'prelims', 'success', 'clip', 'rate', 'ten', 'stay',
'net', 'drive', 'mills', 'auto', 'help', 'state', 'location', 'class', "cars", "crash", 'auto', 'mills', 'help', 'state', 'factory', 'location', 'class', 'intro', 'status', 'template',
'ya', 'yeah', 'bol', 'ben', 'na', 'nan', 'jason', 'daniel', 'jones', 'mp', 'effects', 'tony', 'ray', 'am', 'pm', 'art' 'se', 'prime', 'hai', 'nail',' green', 'ka', 'ke', 'tricks', 'pet',
'ko', 'modi', 'brick', 'get', 'feel', 'try', 'lot', 'say', 'tell', 'turn', 'ask', 'tip']

In [25]:
def sent_to_words(sentences):
    for sent in sentences:
        sent = re.sub('\S*@\S*\s?', '', sent)  # remove emails
        sent = re.sub('\s+', ' ', sent)  # remove newline chars
        sent = re.sub("\'", "", sent)  # remove single quotes
        sent = gensim.utils.simple_preprocess(str(sent), deacc=True) 
        yield(sent)  

In [26]:
data = df.video_info.values.tolist()
data_words = list(sent_to_words(data))

In [27]:
del df

In [28]:
from gensim.models.phrases import Phrases
bigram = Phrases(data_words, min_count=75)

for idx in range(len(data_words)):
    for token in bigram[data_words[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            data_words[idx].append(token)

In [29]:
data_words = [[word for word in gensim.utils.simple_preprocess(str(doc)) if word not in more_stop_words ] for doc in data_words] 

In [31]:
# Create Dictionary
dictionary = corpora.Dictionary(data_words)
print("finished constructing dictionary")

# Create Corpus: Term Document Frequency
corpus = [dictionary.doc2bow(text) for text in data_words]
print("finished constructing corpus")

# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                        id2word=dictionary,
                                        num_topics=15, # hyperparameter we can change. For now I chose 15 topics for the 15 youtube categoreis
                                        random_state=100,
                                        update_every=1,
                                        passes=10,
                                        alpha='symmetric',
                                        iterations=100,
                                        per_word_topics=True)
print("finished constructing model")

finished constructing dictionary
finished constructing corpus
finished constructing model


In [33]:
import pickle
with open('../dictionary.pkl', 'wb') as pickle_file:
    pickle.dump(dictionary, pickle_file)
with open('../corpus.pkl', 'wb') as pickle_file2:
    pickle.dump(corpus, pickle_file2)
with open('../model.pkl', 'wb') as pickle_file3:
    pickle.dump(lda_model, pickle_file3)    


In [6]:
import pickle 
with open('../datasets/dictionary.pkl', 'rb') as pickle_file:
    dictionary = pickle.load(pickle_file)
with open('../datasets/corpus.pkl', 'rb') as pickle_file2:
    corpus = pickle.load(pickle_file2)
with open('../datasets/model.pkl', 'rb') as pickle_file3:   
    lda_model = pickle.load(pickle_file3)

In [8]:
# plot topics
data =  pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
pyLDAvis.save_html(data, '../../lda4.html')

  from imp import reload
  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


In [9]:
pyLDAvis.display(data)