# Data loading

In [None]:
import ijson
import pandas as pd
import pyarrow.feather as feather
from nltk.sentiment import SentimentIntensityAnalyzer

from tqdm import tqdm

In [None]:
# Directory holding the dataset files. The trailing slash matters: file paths
# are built from this value by plain string concatenation.
data_path = "data/"

## Count negative words

In [None]:
# Count negative words in titles
#
# Streams every 'title' value out of the metadata file and records, per title:
#   count_negative_words[title] -> number of distinct words of the title that
#                                  appear in the negative-word lexicon
#   count_words[title]          -> number of distinct words of the title
# Both dicts are keyed by the title string, so identical titles share one entry.

# Load the lexicon (one entry per line) into a set for O(1) membership tests.
# The context manager closes the file as soon as it has been read.
with open('negative-words.txt', mode='r', encoding='iso-8859-1') as lexicon_file:
    neg_words = set(lexicon_file.read().strip().split("\n"))

count_negative_words = dict()
count_words = dict()

# Open in binary mode: ijson parses bytes and decodes the JSON text itself, so
# the result no longer depends on the platform's default text encoding.
# The `with` block guarantees the file is closed even if the loop raises.
with open(data_path + 'yt_metadata_en.jsonl', 'rb') as f:
    # multiple_values=True lets ijson read a stream of concatenated JSON
    # documents (one per line) instead of a single top-level value.
    titles = ijson.items(f, 'title', multiple_values=True)

    for title in tqdm(titles):
        # NOTE(review): matching is case-sensitive and split(' ') keeps the
        # empty strings produced by consecutive spaces; kept as in the
        # original so the counts stay comparable.
        title_words = set(title.split(' '))
        nb_negative = len(title_words.intersection(neg_words))
        nb_words = len(title_words)
        count_negative_words[title] = nb_negative
        count_words[title] = nb_words
print('Finished')

In [None]:
# Look at the result: display the mapping from each title to the number of
# distinct negative-lexicon words found in it
count_negative_words

In [None]:
# Keep only the titles whose negative-word count reaches the threshold (>= 5),
# then display them
threshold = 5
count_neg_large = {
    video_title: neg_count
    for video_title, neg_count in count_negative_words.items()
    if neg_count >= threshold
}
count_neg_large

## Sentiment analysis computation

In [None]:
# Compute sentiment analysis in titles
#
# Streams every 'title' value out of the metadata file and stores its
# sentiment scores in neg_neu_pos[title] = (negative, neutral, positive).
# The dict is keyed by the title string, so identical titles share one entry.

# Build the analyzer before opening the data file, so a failure here cannot
# leave an open file handle behind.
sia = SentimentIntensityAnalyzer()

neg_neu_pos = dict()

# Open in binary mode: ijson parses bytes and decodes the JSON text itself, so
# the result no longer depends on the platform's default text encoding.
# The `with` block guarantees the file is closed even if the loop raises.
with open(data_path + 'yt_metadata_en.jsonl', 'rb') as f:
    # multiple_values=True lets ijson read a stream of concatenated JSON
    # documents (one per line) instead of a single top-level value.
    titles = ijson.items(f, 'title', multiple_values=True)

    for title in tqdm(titles):
        # Read the scores by key instead of unpacking .values() positionally,
        # so the result does not depend on the ordering of the returned dict.
        # The 'compound' score is not used here.
        scores = sia.polarity_scores(title)
        neg_neu_pos[title] = (scores['neg'], scores['neu'], scores['pos'])

print('finished')

In [None]:
# Display all (title, scores) pairs ordered from the highest negative score
# to the lowest; ties keep their original insertion order
sorted(neg_neu_pos.items(), key=lambda entry: entry[1][0], reverse=True)