In [None]:
import pandas as pd
from tqdm import tqdm

# Path to the line-delimited JSON metadata file; it is streamed in chunks
# further down with pd.read_json(..., lines=True).
METADATA_FILENAME = 'data/yt_metadata_en.jsonl'

# Table of decline events. Later cells read its 'Channel', 'Start',
# 'Duration' and 'End' columns.
decline_events = pd.read_csv('data/SAMPLE.csv')

# Show the loaded table as the cell output.
decline_events

In [None]:
# Distinct values of the 'Channel' column, in order of first appearance;
# used to pre-filter the metadata chunks by channel_id.
channels = pd.unique(decline_events['Channel'])

In [None]:
def map_column_to_week(df, column_name):
    """
    Replace a date column by an integer week index.

    Week 0 is the week starting on 2015-01-05. This is a fixed reference date,
    not the earliest date present in `df`; dates before it get negative
    indices. When `column_name` is 'upload_date', rows dated before the
    reference are dropped so the result only covers weeks >= 0.

    Parameters:
    df  (pd.DataFrame): the dataframe in which to replace the date column (it is not modified)
    column_name (str): the column to replace (must be a date: 'datetime' for timeseries, 'upload_date' for metadata)
    Return:
    df_week (pd.DataFrame): a new dataframe without `column_name` and with a 'week' column
    """

    # Fixed origin of the week index (per the original note: the first date
    # in the metadata, chosen so that metadata and timeseries line up).
    first_date = pd.to_datetime('2015-01-05 00:00:00')

    # Coerce to datetime64 so the vectorised arithmetic below also works when
    # the column holds Timestamp objects in an object-dtype column.
    dates = pd.to_datetime(df[column_name])

    if column_name == 'upload_date':
        # Drop all rows uploaded before the reference date
        on_or_after_reference = dates >= first_date
        df = df[on_or_after_reference]
        dates = dates[on_or_after_reference]

    # Whole weeks elapsed since the reference date (floor division, so dates
    # before the reference map to negative weeks).
    weeks = (dates - first_date).dt.days // 7

    # assign() builds a new frame, so the caller's dataframe is never mutated;
    # the original date column is then removed.
    df_week = df.assign(week=weeks).drop(columns=column_name)

    return df_week

In [None]:
# Stream the metadata file (METADATA_FILENAME) in chunks of CHUNK_SIZE rows and
# append to data/categories.csv every video whose upload week falls inside one
# of its channel's decline windows, i.e. Start - Duration <= week <= End.
# NOTE: rows are appended without a header, so re-running this cell adds the
# same rows to the file again unless it is removed first.
CHUNK_SIZE = 5000
OUTPUT_FILENAME = 'data/categories.csv'
cols_of_interest = ['channel_id', 'week', 'tags', 'duration']

# remove SettingWithCopyWarning:
pd.options.mode.chained_assignment = None  # default='warn'

# Build the (first week, last week) windows of every channel once, up front,
# instead of rescanning the whole decline_events table for each video.
windows_by_channel = {
    channel: list(zip(events['Start'] - events['Duration'], events['End']))
    for channel, events in decline_events.groupby('Channel')
}

# enumerate() keeps the chunk counter correct even when a chunk raises.
for i, chunk in enumerate(pd.read_json(METADATA_FILENAME, lines=True, chunksize=CHUNK_SIZE)):

    try:
        init_shape = int(chunk.shape[0])

        # Only videos of channels that appear in decline_events can match
        chunk = chunk[chunk['channel_id'].isin(channels)]

        # assign() replaces the column, so it really becomes datetime64
        chunk = chunk.assign(upload_date=pd.to_datetime(chunk['upload_date']))

        chunk = map_column_to_week(chunk, 'upload_date')

        # keep the videos that are in the time frame for the respective channel
        mask = [
            any(first_week <= week <= last_week
                for first_week, last_week in windows_by_channel.get(channel_id, ()))
            for channel_id, week in zip(chunk['channel_id'], chunk['week'])
        ]

        # .loc always selects rows, even when the mask is an empty list
        kept = chunk.loc[mask]

        if kept.shape[0] > 0:
            kept = kept[cols_of_interest]
            kept.to_csv(OUTPUT_FILENAME, index=False, mode='a', header=False)

        print(f'Chunk {i} (lines {i*CHUNK_SIZE} to {(i+1)*CHUNK_SIZE}): {kept.shape[0]}/{init_shape} videos kept')

    except Exception as e:
        # Best effort: report the failing chunk and carry on with the next one
        print(f'Error in chunk {i} (lines {i*CHUNK_SIZE} to {(i+1)*CHUNK_SIZE}): {e}')