# Data loading

In [None]:
import csv

import ijson
import pandas as pd
import pyarrow.feather as feather
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm import tqdm

The file `yt_metadata_en.jsonl.gz` must be decompressed (~98 GB) into the `data/` directory.

In [None]:
# Directory holding the raw dataset; file names are appended to it by plain
# string concatenation, so the trailing slash is required.
data_path = "data/"

## Convert the large JSON Lines file to a CSV

Store the columns we need from `yt_metadata_en.jsonl` (all but `title`, `tags`, `description`) in a CSV file.

In [None]:
# Create a CSV with only the columns to keep (drop title, description and tags).
# The JSON Lines file is streamed one record at a time with ijson, so the whole
# dataset never has to fit in memory.

# Large free-text fields that are deliberately left out of the CSV.
columns_to_drop = [
    'description',
    'tags',
    'title',
]

# Fields copied to the CSV, in output column order.
columns_to_keep = [
    'categories',
    'channel_id',
    'crawl_date',
    'dislike_count',
    'display_id',
    'duration',
    'like_count',
    'upload_date',
    'view_count',
]

# NOTE(review): this cell writes 'videos_few_columnstmp.csv' while the next
# cell reads 'generated/videos_few_columns.csv' -- confirm whether the 'tmp'
# suffix is intentional (e.g. to avoid clobbering an existing export).
#
# The input is opened in binary mode because ijson parses bytes itself; the
# `with` statement guarantees both files are closed even if a record raises.
with open(data_path + 'yt_metadata_en.jsonl', 'rb') as f, \
        open('generated/videos_few_columnstmp.csv', 'w', newline='', encoding='utf-8') as output:

    # csv.writer quotes/escapes values containing commas, quotes or newlines,
    # which a plain ','.join would silently turn into a corrupt row.
    writer = csv.writer(output, lineterminator='\n')
    writer.writerow(columns_to_keep)

    # The empty prefix with multiple_values=True yields every top-level JSON
    # object of the file in turn.
    videos = ijson.items(f, '', multiple_values=True)

    for video in tqdm(videos):
        # str() keeps missing values as the literal 'None', which the loading
        # cell maps back to NA through na_values=['None'].
        writer.writerow([str(video[field]) for field in columns_to_keep])

print('Finished') # 11min05s

In [None]:
%%time
types_col = {    
    'categories': 'string', 
    'channel_id': 'string', 
    'crawl_date': 'string', 
    'dislike_count': 'Int32',
    'display_id': 'string', 
    'duration': 'string', 
    'like_count': 'Int64',
    'upload_date': 'string', 
    'view_count': 'Int64'
}
v = pd.read_csv(
    'generated/videos_few_columns.csv', 
    sep=',', 
    header=0, 
    dtype=types_col, 
    na_values=['None'], 
    engine='c'
) # 6min36s [[Djian: It might be faster using pyarrow csv]]

In [None]:
%%time
v.to_parquet('generated/videos_few_columns.parquet', compression=None) # 4min14s

In [None]:
%%time
w = pd.read_parquet('generated/videos_few_columns.parquet', engine='fastparquet') # 37s

## Count negative words

In [None]:
# Helper function to count the number of negative words in a text

# Load the negative-word lexicon into a set for O(1) membership tests.
# The `with` block closes the file handle instead of leaking it.
with open('negative-words.txt', mode='r', encoding='iso-8859-1') as lexicon_file:
    # Blank lines are dropped so the empty string never counts as a negative
    # word. Lines starting with ';' are skipped as well -- assumed to be the
    # lexicon's header comments; TODO confirm against the actual file.
    neg_words = {
        line.strip()
        for line in lexicon_file
        if line.strip() and not line.startswith(';')
    }

def count_neg_words(text, lexicon=None):
    """Count the distinct negative words in a text.

    The text is split on whitespace, lower-cased and de-duplicated, so both
    returned counts refer to unique words, not to occurrences.

    Args:
        text: The string to analyse.
        lexicon: Optional collection of lower-case negative words. When
            omitted, the module-level ``neg_words`` set is used.

    Returns:
        A tuple ``(nb_negative, nb_words)``: the number of distinct words of
        ``text`` found in the lexicon, and the number of distinct words.
    """
    if lexicon is None:
        lexicon = neg_words

    # split() with no argument splits on any run of whitespace and never
    # yields empty tokens, so an empty title counts 0 words (not 1) and
    # repeated spaces do not add a spurious '' word.
    words = {word.lower() for word in text.split()}
    nb_negative = len(words.intersection(lexicon))
    return nb_negative, len(words)

In [None]:
# Count the negative words in titles (output: list of dictionaries, one dict for each video)

list_new_data = []

# Binary mode is what ijson expects, and the `with` block closes the file even
# if a record raises part-way through the (long) loop.
with open(data_path + 'yt_metadata_en.jsonl', 'rb') as f:
    videos = ijson.items(f, '', multiple_values=True)

    for video in tqdm(videos):

        # count_neg_words returns (number of negative words, number of words)
        count_neg_words_title, count_words_title = count_neg_words(video['title'])

        list_new_data.append({
            'count_words_title': count_words_title,
            'count_negative_words_title': count_neg_words_title
        })

print('Finished') # 11min26s

In [None]:
# Convert the list of dicts to a dataframe
%%time
newcols = pd.DataFrame.from_dict(list_new_data) # ~5min

In [None]:
# Join the dataframe of videos with the newly copmuted columns
%%time
joined = w.join(newcols) # 1min53s

In [None]:
# Write the joined dataframe to a file
%%time 
joined.to_parquet('generated/videos_CountNegWordsTitle.parquet', compression=None) # 3min35s

## Sentiment analysis computation

In [None]:
# Compute sentiment analysis in titles (output: list of dictionaries, one dict for each video)

sia = SentimentIntensityAnalyzer()

list_new_data = []

# Binary mode is what ijson expects, and the `with` block closes the file even
# if a record raises part-way through the (long) loop.
with open(data_path + 'yt_metadata_en.jsonl', 'rb') as f:
    videos = ijson.items(f, '', multiple_values=True)

    for video in tqdm(videos):

        # Compute sentiment for the title. Scores are read by key rather than
        # by unpacking .values(), so the mapping to columns does not depend on
        # the order in which the analyzer happens to build its result dict.
        scores = sia.polarity_scores(video['title'])

        list_new_data.append({
            'sia_negative': scores['neg'],
            'sia_neutral': scores['neu'],
            'sia_positive': scores['pos'],
            'sia_compound': scores['compound']
        })

print('Finished') # [[Djian: It will probably take a bit less than 2 hours]]

In [None]:
# [[...]]