# Generate subdatasets from the original dataset

This notebook extracts small subsets of the original dataset so that we can test our implementations quickly, without waiting for the whole dataset to be processed.

In [1]:
import os

import dask.dataframe as dd
import pandas as pd

## Metadata

In [2]:
# Number of rows (one video per JSON line) kept in the metadata subset
NUM_METADATA_ROWS = 100_000

# Base data folder, and the folder that receives the generated subsets
PATH_DATA = '../data/'
PATH_SUBDATA = f'{PATH_DATA}subdata/'

# Metadata input (gzipped JSON lines) and its reduced Parquet output
PATH_METADATA_SRC = f'{PATH_DATA}yt_metadata_en.jsonl.gz'
PATH_METADATA_DST = f'{PATH_SUBDATA}yt_metadata_en.parquet'

### Write to file

In [3]:
# Take a subset of videos: only the first NUM_METADATA_ROWS records of the source.
# With lines=True, `nrows` makes pandas stop after that many JSON lines, so there
# is no need to open a chunk iterator and break out of it after the first chunk.
# The output folder is created first so the write also succeeds on a first run.
os.makedirs(os.path.dirname(PATH_METADATA_DST), exist_ok=True)
df_metadata = pd.read_json(PATH_METADATA_SRC, compression="infer", lines=True, nrows=NUM_METADATA_ROWS)
df_metadata.to_parquet(PATH_METADATA_DST, engine="pyarrow", compression="gzip", index=False)

### Read from file

In [4]:
# Load the subset lazily with dask. No compression argument is passed: the codec
# is recorded inside the Parquet file, so the reader does not need to be told.
# index=False keeps every column as a regular column instead of using one as index.
sub_metadata = dd.read_parquet(PATH_METADATA_DST, engine="pyarrow", index=False)
sub_metadata.head(10)

Unnamed: 0,categories,channel_id,crawl_date,description,dislike_count,display_id,duration,like_count,tags,title,upload_date,view_count
0,Film & Animation,UCzWrhkg9eK5I8Bm3HfV-unA,2019-10-31 20:19:26.270363,Lego City Police Lego Firetruck Cartoons about...,1.0,SBqSc91Hn9g,1159,8.0,"lego city,lego police,lego city police,lego ci...",Lego City Police Lego Firetruck Cartoons about...,2016-09-28 00:00:00,1057
1,Film & Animation,UCzWrhkg9eK5I8Bm3HfV-unA,2019-10-31 20:19:26.914516,Lego Marvel SuperHeroes Lego Hulk Smash Iron-M...,1.0,UuugEl86ESY,2681,23.0,"Lego superheroes,lego hulk,hulk smash,lego mar...",Lego Marvel SuperHeroes Lego Hulk Smash Iron-M...,2016-09-28 00:00:00,12894
2,Film & Animation,UCzWrhkg9eK5I8Bm3HfV-unA,2019-10-31 20:19:26.531203,Lego City Police Lego Fireman Cartoons about L...,779.0,oB4c-yvnbjs,1394,1607.0,"lego city,lego police,lego city police,lego fi...",Lego City Police Lego Fireman Cartoons about L...,2016-09-28 00:00:00,1800602
3,Film & Animation,UCzWrhkg9eK5I8Bm3HfV-unA,2019-10-31 20:19:28.335329,Lego Harry Potter Complete Lego New Movie for ...,24.0,ZaV-gTCMV8E,5064,227.0,"Lego harry potter,new harry potter,harry potte...",Lego Harry Potter Complete Lego New Movie for ...,2016-09-28 00:00:00,57640
4,Film & Animation,UCzWrhkg9eK5I8Bm3HfV-unA,2019-10-31 20:19:30.328487,Lego City Police LONG VIDEO for kids Lego Fire...,13.0,cGvL7AvMfM0,3554,105.0,"lego city,lego police,lego city police,lego fi...",Lego City Police 1 HOUR LONG VIDEO for kids Le...,2016-09-28 00:00:00,86368
5,Film & Animation,UCzWrhkg9eK5I8Bm3HfV-unA,2019-10-31 20:19:32.141314,Lego Marvel SuperHeroes Hulk Smash Iron-Man Le...,1.0,iVADSRjaLtQ,1066,11.0,"lego marvel,lego marvel superheroes,lego super...",Lego Marvel SuperHeroes Hulk Smash Iron-Man Le...,2016-09-27 00:00:00,3426
6,Film & Animation,UCzWrhkg9eK5I8Bm3HfV-unA,2019-10-31 20:19:25.218397,Lego City Fireman Lego Police Firetruck Cartoo...,2.0,ypVcK9mldPc,2450,11.0,"lego city,lego police,lego city police,lego fi...",Lego City Police LONG VIDEO Lego Firetruck Car...,2016-09-27 00:00:00,7652
7,Film & Animation,UCzWrhkg9eK5I8Bm3HfV-unA,2019-10-31 20:19:25.792452,Lego City Police Lego Firetruck Long Movie Car...,0.0,qA1NayP9cNY,1539,9.0,"Lego city,lego police,lego city police,lego fi...",Lego City Police Lego Firetruck Long Movie Car...,2016-09-27 00:00:00,479
8,Film & Animation,UCzWrhkg9eK5I8Bm3HfV-unA,2019-10-31 20:19:26.355550,Lego City Police Lego Fireman Firetruck Cartoo...,1.0,CWdSl9ta4Rg,2123,10.0,"lego city,lego police,lego city police,lego fi...",Lego City Police Lego Fireman Firetruck Cartoo...,2016-09-27 00:00:00,3856
9,Film & Animation,UCzWrhkg9eK5I8Bm3HfV-unA,2019-10-31 20:19:27.027045,Lego City Police Lego Firetruck Movie Cartoons...,0.0,64dCTkxCHpY,1130,7.0,"lego city,lego police,lego city police,lego fi...",Lego City Police Lego Firetruck Movie Cartoons...,2016-09-27 00:00:00,1730
