# Run notebook to save dataframes containing rows with video metadata for a specific category

In [2]:
import random
import os
import json
import gzip
import pandas as pd
import numpy as np
from config import DRIVE_PATH_VIVA

In [None]:
def filter_jsonl(input_path, category, batch_size, save_path, verbose = False):
    """Stream a gzipped JSONL file and save the rows of one category as batched CSV files.

    The file is read line by line. Every entry whose 'categories' value equals
    `category` is collected; each time `batch_size` entries have been collected
    they are written to `<save_path>/<category>_videos_<batch_index>.csv`, with
    `batch_index` starting at 0. Entries left over at the end of the file are
    written as a final, smaller batch. Empty batches are never written, so an
    input without any matching entry produces no file.

    Args:
        input_path (str): path to the gzipped JSONL file (e.g. yt_metadata_en.jsonl.gz)
        category (str): category to keep, compared against each entry's 'categories' value
        batch_size (int): maximum number of videos per batch, must be >= 1
        save_path (str): path to folder where you want the batch dataframes to be saved
        verbose (bool, optional): print progress info. Defaults to False.

    Raises:
        ValueError: if `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f'batch_size must be a positive integer, got {batch_size}')

    def _save_batch(rows, index):
        # One CSV per batch, named after the category and the 0-based batch index
        pd.DataFrame(rows).to_csv(os.path.join(save_path, f'{category}_videos_{index}.csv'))

    batch_index = 0
    category_counter = 0
    filtered_data = []  # defined up front so an empty input cannot hit an unbound name
    with gzip.open(input_path, 'rt', encoding='utf-8') as f:
        for line_counter, line in enumerate(f, start=1):
            entry = json.loads(line)
            if entry.get('categories') != category:
                continue

            # A batch only "starts" once it receives its first row; this avoids
            # opening (and later writing) an empty batch after the last full one.
            if verbose and not filtered_data:
                print(f'======== Batch {batch_index} - started at {line_counter} ========')

            category_counter += 1
            filtered_data.append(entry)

            if verbose and category_counter % 100000 == 0:
                print(f'Filtered {category_counter} {category} videos out of {line_counter} so far')

            if len(filtered_data) == batch_size:  # batch is full: save it and start over
                _save_batch(filtered_data, batch_index)
                if verbose:
                    print(f"We filtered a total of {category_counter} videos in the {category} category!")
                filtered_data = []
                batch_index += 1

    # Save the last, partially filled batch (skipped when nothing is left over)
    if filtered_data:
        _save_batch(filtered_data, batch_index)

    print(f"We filtered a total of {category_counter} videos in the {category} category!")

In [None]:
# Category to extract, where its raw metadata lives, and where the batches are saved
category = 'Education'
save_path = DRIVE_PATH_VIVA  # change so you save it wherever you like
video_file_path = os.path.join(
    DRIVE_PATH_VIVA,
    f'extracted_{category}',
    'yt_metadata_en.jsonl.gz',
)  # change so it is your path (cannot do absolute)

n_batches = 8
total_len = 3795564  # 3'795'564 (we knew from Gongon's notebook)
# Round the known total to the nearest 100'000 so the batch size is a round number
approx_length = round(total_len / 1000000, 1) * 1000000

# for 8 batches should end up with 475'000
batch_size = int(approx_length // n_batches)

filter_jsonl(video_file_path, category, batch_size, save_path, verbose=True)

In [None]:
# Earlier version of filter_jsonl, edited from Gonçalo's code; kept commented out for reference.
# def filter_jsonl(input, category, batch_size, start = 0, all = False, randomness = True, random_seed = 0, verbose = False):
#     filtered_data = []
#     random.seed(random_seed)
#     line_counter = 0
#     counter = 0
    
#     with gzip.open(input, 'rt', encoding='utf-8') as f:
#         for line in f:
#             entry = json.loads(line)
#             line_counter +=1

#             if line_counter >= start:
#                 #print(f'{line_counter} >= {start}')
                
#                 if entry.get('categories') == category:
#                     counter +=1
#                     if verbose:
#                         if counter != 0 and counter % 10000 == 0:
#                             print(f'Filtered {counter} {category} videos out of {line_counter} so far') 
#                     if len(filtered_data) < batch_size or all:
#                         filtered_data.append(entry)
                        
#                     elif randomness:
#                         index_to_replace = random.randint(0, len(filtered_data) - 1)
#                         if index_to_replace < batch_size:
#                             filtered_data[index_to_replace] = entry
#                     else:
                        
#                         break
      
#     if randomness:           
#         print(f"There are {counter} videos in the {category} category!")
#     else:
#         print(f"We filtered {len(filtered_data)} videos in the {category} category!")

#     return pd.DataFrame(filtered_data)