# Data processing

In [None]:
import csv
import os
from contextlib import ExitStack

import ijson
import pandas as pd
import pyarrow.feather as feather
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm import tqdm

The file `yt_metadata_en.jsonl.gz` should be decompressed (~98 GB) into the `data/` directory.

In [None]:
# Directory holding the raw input data. Paths are built by string concatenation,
# so the trailing slash is required.
data_path = 'data/'

### Split the big JSON file into small CSVs (one per year), then convert them to Parquet

Store the columns we need from `yt_metadata_en.jsonl` (all but `title`, `tags`, `description`) in a separate CSV file for each year.

In [None]:
# Years to process, as strings (they are used as dictionary keys and in the output paths).

# Full range, from 2005 to 2019 (included):
#years = [str(year) for year in range(2005, 2020)]
years = ['2019'] # We only use 2019 at the moment, for simplicity.

In [None]:
# Create CSVs with only the columns to keep (drop title, description and tags)

# Fields of each record that are deliberately left out of the small CSVs
columns_to_drop = [
    'description',
    'tags',
    'title',
]
# Fields copied to the small CSVs, in column order
columns_to_keep = [
    'categories',
    'channel_id',
    'crawl_date',
    'dislike_count',
    'display_id',
    'duration',
    'like_count',
    'upload_date',
    'view_count',
]

# ExitStack closes the input file and every per-year output file, even if an
# error interrupts the (long) loop below.
with ExitStack() as stack:
    # ijson parses bytes, so the input is opened in binary mode
    f = stack.enter_context(open(data_path + 'yt_metadata_en.jsonl', 'rb'))
    videos = ijson.items(f, '', multiple_values=True)

    output = {}
    writers = {}
    for year in years:
        os.makedirs(f'generated/{year}', exist_ok=True)
        output[year] = stack.enter_context(
            open(f'generated/{year}/{year}_videos_few_columns.csv', 'w', newline='', encoding='utf-8')
        )
        # csv.writer quotes a value only when it contains a comma, a quote or a
        # line break, so such values can no longer corrupt a row.
        writers[year] = csv.writer(output[year], lineterminator='\n')
        # Write the header in each CSV
        writers[year].writerow(columns_to_keep)

    # Put each video from the json file in the CSV with the correct year
    for video in tqdm(videos):
        # The year is the first four characters of the upload date
        year_video = str(video['upload_date'])[:4]
        year_writer = writers.get(year_video)
        if year_writer is None:
            # Video uploaded in a year we do not process
            continue
        # str() keeps missing values as the literal 'None'
        year_writer.writerow([str(video[field]) for field in columns_to_keep])

print('Finished') # 9min50s

In [None]:
%%time
# Convert each CSV file to a parquet file (faster to read)
# Explicit dtype for every column that is not parsed as a date
types_col = dict(
    categories='string',
    channel_id='string',
    dislike_count='Int32',
    display_id='string',
    duration='string',
    like_count='Int64',
    view_count='Int64',
)
for year in tqdm(years):
    csv_path = f'generated/{year}/{year}_videos_few_columns.csv'
    parquet_path = f'generated/{year}/{year}_videos_few_columns.parquet'
    # 6min36s [[Djian: It might be faster using pyarrow csv]]
    v = pd.read_csv(
        csv_path,
        sep=',',
        header=0,
        dtype=types_col,
        parse_dates=['crawl_date', 'upload_date'],
        na_values=['None'],  # the literal 'None' is read back as a missing value
        engine='c',
    )
    v.to_parquet(parquet_path, compression=None)

In [None]:
# One dataframe per processed year, keyed by the year string
df_videos = {}
for year in years:
    df_videos[year] = pd.read_parquet(
        f'generated/{year}/{year}_videos_few_columns.parquet',
        engine='fastparquet',
    )

### Split the big json into CSV files (one for each year, keeping all columns)

In [None]:
# Create CSVs that split the big json file into years

# Large text fields of each record
columns_to_drop = [
    'description',
    'tags',
    'title',
]
# Lightweight fields of each record
columns_to_keep = [
    'categories',
    'channel_id',
    'crawl_date',
    'dislike_count',
    'display_id',
    'duration',
    'like_count',
    'upload_date',
    'view_count',
]

# Column order of the CSVs: lightweight fields first, then the text fields
columns = columns_to_keep + columns_to_drop

# ExitStack closes the input file and every per-year output file, even if an
# error interrupts the (long) loop below.
with ExitStack() as stack:
    # ijson parses bytes, so the input is opened in binary mode
    f = stack.enter_context(open(data_path + 'yt_metadata_en.jsonl', 'rb'))
    videos = ijson.items(f, '', multiple_values=True)

    output = {}
    writer = {}
    for year in years:
        os.makedirs(f'generated/{year}', exist_ok=True)
        # newline='' is required by the csv module so that it controls the line
        # endings itself (the text fields can contain line breaks).
        output[year] = stack.enter_context(
            open(f'generated/{year}/{year}_videos.csv', 'w', newline='', encoding='utf-8')
        )
        writer[year] = csv.DictWriter(output[year], fieldnames=columns)
        writer[year].writeheader()

    # Put each video from the json file in the CSV with the correct year
    for video in tqdm(videos):
        # The year is the first four characters of the upload date
        year_video = str(video['upload_date'])[:4]
        year_writer = writer.get(year_video)
        if year_writer is not None:
            year_writer.writerow(video)

print('Finished') # 9min50s

### Some helper functions for feature extraction

In [None]:
# Helper function to count the number of negative words in a text

# Load the dataset of negative words (one entry per line of the file)
with open('negative-words.txt', mode='r', encoding='iso-8859-1') as neg_words_file:
    # The context manager closes the file as soon as its content has been read
    neg_words = set(neg_words_file.read().strip().split("\n"))

def count_neg_words(text, fieldname='', negative_words=None):
    ''' Count the distinct words and the distinct negative words in the text

        Words are the whitespace-separated tokens of the text, lowercased. Each
        distinct word is counted once, however many times it occurs.

        :param text: a string
        :param fieldname: the name of the field, used as suffix of the feature names
        :param negative_words: optional collection of negative words to match against;
            defaults to the module-level `neg_words` set

        :return: dictionary of features (count_words_<fieldname>, count_negative_words_<fieldname>)
    '''
    if negative_words is None:
        negative_words = neg_words
    # split() without argument splits on any run of whitespace (spaces, tabs,
    # line breaks) and never yields empty tokens, so an empty text has 0 words.
    words = {word.lower() for word in text.split()}
    nb_negative = len(words.intersection(negative_words))
    nb_words = len(words)
    return {
        f'count_words_{fieldname}': nb_words,
        f'count_negative_words_{fieldname}': nb_negative,
    }

In [None]:
# Helper function for sentiment analysis

# Load VADER from nltk (a single analyzer instance shared by all calls)
# NOTE(review): presumably requires the VADER lexicon resource to be available locally - confirm
sia = SentimentIntensityAnalyzer()

def sentiment(text, fieldname='', analyzer=None):
    ''' Perform sentiment analysis

        :param text: a string
        :param fieldname: the name of the field, used as suffix of the feature names
        :param analyzer: optional object with a `polarity_scores(text)` method returning a
            dictionary with the keys 'neg', 'neu', 'pos' and 'compound'; defaults to the
            module-level `sia` analyzer

        :return: dictionary of features (negative, neutral, positive, compound)
    '''
    if analyzer is None:
        analyzer = sia
    scores = analyzer.polarity_scores(text)
    # Read every score by key so the result does not depend on the order in
    # which the analyzer happens to insert them in its dictionary.
    return {
        f'sia_negative_{fieldname}': scores['neg'],
        f'sia_neutral_{fieldname}': scores['neu'],
        f'sia_positive_{fieldname}': scores['pos'],
        f'sia_compound_{fieldname}': scores['compound'],
    }

In [None]:
def extract_features(text_to_features, year, field):
    ''' Extract the video features according to a specified function, on a given year, on a given field.

        The videos are read from `generated/<year>/<year>_videos.csv`.

        :param text_to_features: function that maps a string to a dictionary of features. The prototype must be
            text_to_features(text, fieldname) -> dict of features.
        :param year: string of the year
        :param field: name of the video field to analyse (string)

        :return: DataFrame with the features (each row corresponds to a video, the features are columns)
    '''

    print('Computing features')
    # newline='' lets the csv module interpret the line breaks embedded in
    # quoted fields (e.g. multi-line descriptions) by itself.
    with open(f'generated/{year}/{year}_videos.csv', "r", newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f, delimiter=",")
        # One dictionary of features per video, in file order
        features_list = [
            text_to_features(video[field], fieldname=field)
            for video in tqdm(reader)
        ]
    print('...done.')

    print('Converting features to dataframe...')
    features = pd.DataFrame(features_list)
    print('...done.')

    return features

### Extract the features

In [None]:
# Count the words and the negative words in the titles of the 2019 videos (one row per video)
df_neg_words_title = extract_features(text_to_features=count_neg_words, year='2019', field='title') # 3min22s

In [None]:
# Count the words and the negative words in the descriptions of the 2019 videos (one row per video)
df_neg_words_desc = extract_features(text_to_features=count_neg_words, year='2019', field='description') # 6min30s

In [None]:
# Sentiment analysis (VADER scores) on the titles of the 2019 videos (one row per video)
df_sia = extract_features(text_to_features=sentiment, year='2019', field='title') # 20min

In [None]:
# Sentiment analysis on descriptions in 2019
#df_sia_desc = extract_features(text_to_features=sentiment, year='2019', field='description') # ~3h (projection) 

In [None]:
# Join all the dataframes of features side by side.
# The join aligns the rows on the index; the three frames are built by extract_features
# from the same 2019 CSV, so row i is expected to describe the same video in each of them.
df_features = df_neg_words_title.join(df_neg_words_desc).join(df_sia)

In [None]:
# Read the dataframe of the 2019 videos (without titles, tags and descriptions, to have a small file)
videos_few_cols = pd.read_parquet('generated/2019/2019_videos_few_columns.parquet', engine='fastparquet')

In [None]:
# Join the features to the videos (rows are aligned on the index).
# NOTE(review): assumes the few-columns file and the full CSV contain the same videos
# in the same order - confirm.
videos_features = videos_few_cols.join(df_features)

In [None]:
# Store the dataframe of videos with their features in an uncompressed parquet file
videos_features.to_parquet('generated/2019/2019_videos_CountNegWords_Sentiment.parquet', compression=None) # 3s

### How to load the dataframe with videos and features (from 2019)

In [None]:
# Run this line to load the 2019 videos with their features, without recomputing anything:
videos_features = pd.read_parquet('generated/2019/2019_videos_CountNegWords_Sentiment.parquet', engine='fastparquet')