# Imports

In [28]:
import gzip
import json
import matplotlib.pyplot as plt
import os
import pandas as pd
import pyarrow.ipc as ipc
import pyarrow.feather as feather

In [30]:
# Directory (relative path) that holds the input data files read below.
DATA_PATH = "../data/"

# Default value for the `category` argument of the loading functions.
# NOTE: it is bound as a default argument when those functions are defined,
# so re-run their defining cell after changing this value.
CATEGORY = ""

# Loading functions

In [31]:
def load_and_filter_jsonl_gz_file_by_chunks(filepath, category=CATEGORY, chunk_size=10000, exclude_fields=None):
    '''
    Load a gzip-compressed JSON-lines file chunk by chunk and keep one category.

    Each non-blank line is parsed as a JSON object. Keys listed in
    `exclude_fields` are dropped from every record before any DataFrame is
    built. Records are grouped into chunks of `chunk_size`; each chunk is
    filtered on its 'categories' column and the filtered chunks are
    concatenated at the end.

    Parameters
    ----------
    filepath : path of the .jsonl.gz file, opened as UTF-8 text.
    category : value that the 'categories' column must equal. The default is
        the module-level CATEGORY as it was when this function was defined.
    chunk_size : number of records converted to a DataFrame at once.
    exclude_fields : field names to drop from every record; defaults to
        ['title', 'description'].

    Returns
    -------
    pandas.DataFrame with the matching rows and a fresh 0..n-1 index. An empty
    DataFrame (no columns) is returned when the file contains no records.

    Raises
    ------
    KeyError if a chunk has no 'categories' column (for example when
    'categories' is listed in `exclude_fields`).
    '''
    if exclude_fields is None:
        exclude_fields = ['title', 'description']

    def _filter_chunk(chunk_records):
        # Build one DataFrame per chunk and keep only the requested category.
        df_chunk = pd.DataFrame(chunk_records)
        return df_chunk[df_chunk['categories'] == category]

    dataframe_chunks = []
    records = []

    with gzip.open(filepath, 'rt', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a trailing newline) are not valid JSON; skip them.
            if not line.strip():
                continue

            # Unwanted fields are removed here, so no column drop is needed later.
            record = {k: v for k, v in json.loads(line).items() if k not in exclude_fields}
            records.append(record)

            if len(records) >= chunk_size:
                dataframe_chunks.append(_filter_chunk(records))
                records = []

    # Process the last, possibly partial, chunk.
    if records:
        dataframe_chunks.append(_filter_chunk(records))

    # pd.concat raises ValueError on an empty list, which happens for an empty file.
    if not dataframe_chunks:
        return pd.DataFrame()

    return pd.concat(dataframe_chunks, ignore_index=True)

def read_and_filter_feather(path, category=CATEGORY):
    '''
    Read a feather file batch by batch and keep the rows of one category.

    Every record batch is converted to a pandas DataFrame and filtered on its
    'categories' column, so only one batch is held unfiltered at a time.

    Parameters
    ----------
    path : source handed to pyarrow.ipc.RecordBatchFileReader.
    category : value that the 'categories' column must equal. The default is
        the module-level CATEGORY as it was when this function was defined.

    Returns
    -------
    pandas.DataFrame with the matching rows and a fresh 0..n-1 index. When no
    row matches, an empty DataFrame is returned; it keeps the columns of the
    file if at least one batch was read.

    Raises
    ------
    KeyError if a batch has no 'categories' column.
    '''
    reader = ipc.RecordBatchFileReader(path)

    filtered_dfs = []
    # Zero-row frame from the first batch without matches; it preserves the
    # columns so an all-empty result still has the expected layout.
    empty_template = None

    for i in range(reader.num_record_batches):
        df = reader.get_batch(i).to_pandas()
        filtered_df = df[df['categories'] == category]

        if not filtered_df.empty:
            filtered_dfs.append(filtered_df)
        elif empty_template is None:
            empty_template = filtered_df

    # pd.concat raises ValueError on an empty list, so handle "no match" explicitly.
    if not filtered_dfs:
        if empty_template is None:
            return pd.DataFrame()
        return empty_template.reset_index(drop=True)

    return pd.concat(filtered_dfs, ignore_index=True)

# Data loading

In [32]:
# Tab-separated, gzip-compressed tables. Paths are built with os.path.join
# (like the feather load) so DATA_PATH works with or without a trailing slash.
df_channels_en = pd.read_csv(os.path.join(DATA_PATH, "df_channels_en.tsv.gz"), compression="infer", sep="\t")
df_timeseries_en = pd.read_csv(os.path.join(DATA_PATH, "df_timeseries_en.tsv.gz"), compression="infer", sep="\t")  # 20s

In [24]:
# Load the whole feather file into memory as a single DataFrame.
df_feather = feather.read_feather(
    os.path.join(DATA_PATH, 'yt_metadata_helper.feather')
)

In [26]:
# Ascending sort on like_count; rows with a missing like_count are listed last.
df_feather.sort_values(by='like_count', ascending=True, na_position='last')

Unnamed: 0,categories,channel_id,dislike_count,display_id,duration,like_count,upload_date,view_count
26399482,Pets & Animals,UCXnvLHEf5t4ncz2DCLRMmpQ,0.0,mN88k1HrcYM,29,0.0,2017-04-19,26.0
65183210,Travel & Events,UC3KfELiIm0fy_VhiE37xKWQ,0.0,-TTYpmTHTJ8,125,0.0,2012-05-16,116.0
65183202,Travel & Events,UC3KfELiIm0fy_VhiE37xKWQ,1.0,9NqUZO0P6Sc,307,0.0,2012-06-04,396.0
65183201,News & Politics,UC3KfELiIm0fy_VhiE37xKWQ,0.0,DL-vCHMJVT8,1686,0.0,2012-06-05,68.0
65183200,News & Politics,UC3KfELiIm0fy_VhiE37xKWQ,0.0,c82Mux9ed-g,1677,0.0,2012-06-05,228.0
...,...,...,...,...,...,...,...,...
72899529,People & Blogs,UCrx5pVziMo1qzF8isicR5SQ,,59LrsaH6ZaA,24,,2014-08-27,5403358.0
72899530,People & Blogs,UCrx5pVziMo1qzF8isicR5SQ,,nBwIwOPCsNk,14,,2014-08-27,217454.0
72899584,People & Blogs,UCrx5pVziMo1qzF8isicR5SQ,,fSYOvDbTZrQ,19,,2012-09-07,6707.0
72899634,Film & Animation,UCrx3p5YDUOsD_RuKhfrXEIQ,,W-HTE1P5z4g,145,,2015-10-06,41657.0
