In [None]:
''' Dataset information
| Data File                   | Size     | Comment                                    |
|:----------------------------|:---------|:-------------------------------------------|
| _raw_df_channels.tsv.gz     | 6.4 MB   | See 'create_sample_datasets_tsv.ipynb'     |
| _raw_df_timeseries.tsv.gz   | 653.1 MB | See 'create_sample_datasets_tsv.ipynb'     |
| _raw_yt_metadata.jsonl.zst  | 14.7 GB  | Sample dataset created.                    |
| df_channels_en.tsv.gz       | 6.0 MB   | See 'create_sample_datasets_tsv.ipynb'     |
| df_timeseries_en.tsv.gz     | 571.1 MB | See 'create_sample_datasets_tsv.ipynb'     |
| num_comments.tsv.gz         | 754.6 MB | See 'create_sample_datasets_tsv.ipynb'     |
| num_comments_authors.tsv.gz | 1.4 GB   | See 'create_sample_datasets_tsv.ipynb'     |
| youtube_comments.tsv.gz     | 77.2 GB  | See 'create_sample_datasets_tsv.ipynb'     |
| yt_metadata_en.jsonl.gz     | 13.6 GB  | Sample dataset created.                    |
| yt_metadata_helper.feather  | 2.8 GB   | See 'create_sample_datasets_feather.ipynb' |
'''

In [None]:
import os
import gzip
import json
import pandas as pd
import random
import io
import zstandard as zstd

# Show where the kernel is running; the data paths in this notebook are relative.
print("Current working directory: " + os.getcwd())

In [None]:
# Parameters shared by the .jsonl.gz and .jsonl.zst sampling sections
NUM_ITEMS = 85_000_000   # upper bound on the number of lines read from each raw file
SAMPLE_SIZE = 50_000     # number of records drawn into each sample

# Seed Python's global RNG so that, for the same input files and the same cell
# order, random.sample() selects the same records on every Restart & Run All.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

JSON_FILE_NAME = 'yt_metadata_en'
JSON_PATH = f'../../RawData/{JSON_FILE_NAME}.jsonl.gz'

ZST_FILE_NAME = '_raw_yt_metadata'
ZST_PATH = f'../../RawData/{ZST_FILE_NAME}.jsonl.zst'

### Read .jsonl.gz files

In [None]:
# Load up to NUM_ITEMS records from the gzipped JSONL file and preview a few.
# NOTE: every parsed record is kept in the list, so memory use grows with NUM_ITEMS.
records_json = []
with gzip.open(JSON_PATH, 'rt', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i >= NUM_ITEMS:
            break
        # A blank line (e.g. a stray newline at end of file) is not a JSON
        # document and would make json.loads raise, so skip it.
        if not line.strip():
            continue
        records_json.append(json.loads(line))

print(f"Read {len(records_json)} items from {JSON_PATH}")
if records_json:
    print("\nExample record:")
    print(json.dumps(records_json[0], indent=2))
    print("\nSample of 5 records:")
    for record in records_json[:5]:
        print(record)
else:
    # Guard: indexing records_json[0] on an empty list would raise IndexError.
    print("\nNo records were read; nothing to preview.")

In [None]:
# Draw a random sample (without replacement) of up to SAMPLE_SIZE records
# into a pandas DataFrame.
# random.sample raises ValueError when asked for more items than the population
# holds, so cap the request at the number of records actually read.
n_sample_json = min(SAMPLE_SIZE, len(records_json))
if n_sample_json < SAMPLE_SIZE:
    print(f"Only {len(records_json):,} records available; sampling all of them.")
sample_records = random.sample(records_json, n_sample_json)
df_sample_json = pd.DataFrame(sample_records)
print(f"Sampled dataset shape: {df_sample_json.shape}")
df_sample_json.head()

In [None]:
# Write the JSON sample to CSV without the DataFrame index.
# The destination is built as a single f-string so the file name reads in one piece.
json_output_path = f"../SampleData/{JSON_FILE_NAME}_sample.csv"
df_sample_json.to_csv(json_output_path, index=False)

### Read .jsonl.zst files

In [None]:
# Stream-decompress the .jsonl.zst file and parse up to NUM_ITEMS lines.
# This cell only reads; sampling and saving are done in separate cells.
# NOTE: every parsed record is kept in the list, so memory use grows with NUM_ITEMS.
records_zst = []
print(f"Reading up to {NUM_ITEMS:,} lines from {ZST_PATH}")
with open(ZST_PATH, 'rb') as fh:
    dctx = zstd.ZstdDecompressor()
    with dctx.stream_reader(fh) as reader:
        # Wrap the binary decompression stream so it can be iterated line by
        # line as UTF-8 text.
        text_stream = io.TextIOWrapper(reader, encoding='utf-8')
        for i, line in enumerate(text_stream):
            if i >= NUM_ITEMS:
                break
            # A blank line is not a JSON document and would make json.loads
            # raise, so skip it.
            if not line.strip():
                continue
            records_zst.append(json.loads(line))

print(f"Total records read: {len(records_zst):,}")
if records_zst:
    print("Example record (first):")
    print(json.dumps(records_zst[0], indent=2))
else:
    # Guard: indexing records_zst[0] on an empty list would raise IndexError.
    print("No records were read; nothing to preview.")

In [None]:
# Draw a random sample (without replacement) of up to SAMPLE_SIZE records
# into a pandas DataFrame.
# random.sample raises ValueError when asked for more items than the population
# holds, so cap the request at the number of records actually read.
n_sample_zst = min(SAMPLE_SIZE, len(records_zst))
if n_sample_zst < SAMPLE_SIZE:
    print(f"Only {len(records_zst):,} records available; sampling all of them.")
sample_zst_records = random.sample(records_zst, n_sample_zst)
df_sample_zst = pd.DataFrame(sample_zst_records)
print(f"Sampled DataFrame shape: {df_sample_zst.shape}")
df_sample_zst.head()

In [None]:
# Persist the ZST sample as CSV; the DataFrame index is not written.
output_path = f"../SampleData/{ZST_FILE_NAME}_sample.csv"
df_sample_zst.to_csv(path_or_buf=output_path, index=False)

### Verify the written sample files

In [None]:
# Reload both sample CSVs from disk:
# df1 <- sample taken from the .jsonl.gz file, df2 <- sample taken from the .jsonl.zst file.
df1 = pd.read_csv(filepath_or_buffer=f"../SampleData/{JSON_FILE_NAME}_sample.csv")
df2 = pd.read_csv(filepath_or_buffer=f"../SampleData/{ZST_FILE_NAME}_sample.csv")

In [None]:
# Report the (rows, columns) shape of df1, then display its first five rows.
print(df1.shape)
df1.head(n=5)

In [None]:
# Report the (rows, columns) shape of df2, then display its first five rows.
print(df2.shape)
df2.head(n=5)