# Generate subdatasets from the original dataset

This notebook generates subdatasets from the original dataset, to be used to test our implementations without having to wait for the whole dataset to be processed.

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

In [2]:
PATH_DATA = '../data/'
PATH_SUBDATA = '../data/subdata/'

## Metadata

In [3]:
METADATAS_SAMPLE_RATIO = 0.0001
METADATAS_SAMPLE_SEED = 0

METADATAS_FILE = 'yt_metadata_en'
PATH_METADATAS_SRC = PATH_DATA + METADATAS_FILE + '.jsonl.gz'
PATH_METADATAS_DST = PATH_SUBDATA + METADATAS_FILE + '_sub'

### Write to file

In [35]:
metadatas = spark.read.json(PATH_METADATAS_SRC)

72924794

In [36]:
sub_metadatas = metadatas.sample(False, METADATAS_SAMPLE_RATIO, seed=METADATAS_SAMPLE_SEED)
sub_metadatas.write.json(PATH_METADATAS_DST)

### Read from file

In [37]:
spark.read.json(PATH_METADATAS_DST).count()

7042

## Channels

In [38]:
CHANNELS_FILE = 'df_channels_en'
PATH_CHANNELS_SRC = PATH_DATA + CHANNELS_FILE + '.tsv.gz'
PATH_CHANNELS_DST = PATH_SUBDATA + CHANNELS_FILE + '_sub'

### Write to file

In [39]:
channels = spark.read \
    .option('header', 'true') \
    .option('sep', '\t') \
    .csv(PATH_CHANNELS_SRC)

136470

In [40]:
sub_channels_metadata = sub_metadatas.select('channel_id').distinct()
sub_channels = channels.join(sub_channels_metadata, channels.channel == sub_channels_metadata.channel_id, how='inner')

In [41]:
sub_channels.write \
    .option('header', 'true') \
    .option('sep', '\t') \
    .csv(PATH_CHANNELS_DST)

### Read from file

In [42]:
spark.read.json(PATH_CHANNELS_DST).count()

5699

# Time Series

### Write to file

In [43]:
TIMESERIES_FILE = 'df_timeseries_en'
PATH_TIMESERIES_SRC = PATH_DATA + TIMESERIES_FILE + '.tsv.gz'
PATH_TIMESERIES_DST = PATH_SUBDATA + TIMESERIES_FILE + '_sub'

In [44]:
time_series = spark.read \
    .option('header', 'true') \
    .option('sep', '\t') \
    .csv(PATH_TIMESERIES_SRC)

18872499

In [45]:
sub_time_series = time_series.join(sub_channels_metadata, time_series.channel == sub_channels_metadata.channel_id, how='inner')

In [46]:
sub_time_series.write \
    .option('header', 'true') \
    .option('sep', '\t') \
    .json(PATH_TIMESERIES_DST)

### Read from file

In [47]:
spark.read.json(PATH_TIMESERIES_DST).count()

868191