In [1]:
# Reference inventory of the dataset files: file name, on-disk size, and a
# comment naming the notebook associated with each file.
# This is a bare string literal, so running the cell only echoes the text back
# as the cell output; nothing is assigned or loaded here.
''' Dataset information
| Data File                   | Size     | Comment                                    |
|:----------------------------|:---------|:-------------------------------------------|
| _raw_df_channels.tsv.gz     | 6.4 MB   | See 'create_sample_datasets_tsv.ipynb'     |
| _raw_df_timeseries.tsv.gz   | 653.1 MB | See 'create_sample_datasets_tsv.ipynb'     |
| _raw_yt_metadata.jsonl.zst  | 14.7 GB  | Sample dataset created.                    |
| df_channels_en.tsv.gz       | 6.0 MB   | See 'create_sample_datasets_tsv.ipynb'     |
| df_timeseries_en.tsv.gz     | 571.1 MB | See 'create_sample_datasets_tsv.ipynb'     |
| num_comments.tsv.gz         | 754.6 MB | See 'create_sample_datasets_tsv.ipynb'     |
| num_comments_authors.tsv.gz | 1.4 GB   | See 'create_sample_datasets_tsv.ipynb'     |
| youtube_comments.tsv.gz     | 77.2 GB  | See 'create_sample_datasets_tsv.ipynb'     |
| yt_metadata_en.jsonl.gz     | 13.6 GB  | Sample dataset created.                    |
| yt_metadata_helper.feather  | 2.8 GB   | See 'create_sample_datasets_feather.ipynb' |
'''

" Dataset information\n| Data File                   | Size     | Comment                                    |\n|:----------------------------|:---------|:-------------------------------------------|\n| _raw_df_channels.tsv.gz     | 6.4 MB   | See 'create_sample_datasets_tsv.ipynb'     |\n| _raw_df_timeseries.tsv.gz   | 653.1 MB | See 'create_sample_datasets_tsv.ipynb'     |\n| _raw_yt_metadata.jsonl.zst  | 14.7 GB  | Sample dataset created.                    |\n| df_channels_en.tsv.gz       | 6.0 MB   | See 'create_sample_datasets_tsv.ipynb'     |\n| df_timeseries_en.tsv.gz     | 571.1 MB | See 'create_sample_datasets_tsv.ipynb'     |\n| num_comments.tsv.gz         | 754.6 MB | See 'create_sample_datasets_tsv.ipynb'     |\n| num_comments_authors.tsv.gz | 1.4 GB   | See 'create_sample_datasets_tsv.ipynb'     |\n| youtube_comments.tsv.gz     | 77.2 GB  | See 'create_sample_datasets_tsv.ipynb'     |\n| yt_metadata_en.jsonl.gz     | 13.6 GB  | Sample dataset created.                    |

In [2]:
import os
import gzip
import json
import pandas as pd
import random
import io
import zstandard as zstd

# Show the directory the kernel is running in; the relative ../RawData and
# ../SampleData paths used in this notebook are resolved against it.
current_dir = os.getcwd()
print(f"Current working directory: {current_dir}")

Current working directory: d:\_edd\_local_repos\cs513_data_mining\trendy-tube\Preprocessing


### Read .jsonl.gz files

In [3]:
# Parameters shared by the .jsonl.gz and .jsonl.zst sampling cells
NUM_ITEMS = 1_000_000   # maximum number of lines read from the start of each file
SAMPLE_SIZE = 50_000    # number of records drawn from the lines that were read

JSON_FILE_NAME = 'yt_metadata_en'
JSON_PATH = f'../RawData/{JSON_FILE_NAME}.jsonl.gz'

# Read the gzip-compressed JSONL file line by line and stop after NUM_ITEMS lines.
# NOTE: only the head of the file is read, so anything sampled from
# `records_json` is a sample of the first NUM_ITEMS lines, not of the full file.
records_json = []
with gzip.open(JSON_PATH, 'rt', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i >= NUM_ITEMS:
            break
        if not line.strip():
            # Blank lines (e.g. a trailing newline) are not valid JSON; skip them
            continue
        records_json.append(json.loads(line))

print(f"Read {len(records_json)} items from {JSON_PATH}")
if records_json:
    print("\nExample record:")
    print(json.dumps(records_json[0], indent=2))
    print("\nSample of 5 records:")
    for record in records_json[:5]:
        print(record)
else:
    # records_json[0] would raise IndexError on an empty list
    print("No records were read; nothing to preview.")

Read 1000000 items from ../RawData/yt_metadata_en.jsonl.gz

Example record:
{
  "categories": "Film & Animation",
  "channel_id": "UCzWrhkg9eK5I8Bm3HfV-unA",
  "crawl_date": "2019-10-31 20:19:26.270363",
  "description": "Lego City Police Lego Firetruck Cartoons about Lego City Movie for kids with a nice long video to keep the kids entertained while learning and having fun with the Lego City characters from Lego City undercover. Thanks for watching!",
  "dislike_count": 1.0,
  "display_id": "SBqSc91Hn9g",
  "duration": 1159,
  "like_count": 8.0,
  "tags": "lego city,lego police,lego city police,lego city episodes,videos de lego city,lego policia,lego bomberos,lego fire truck,lego firetruck,lego police chase,lego robbers,lego cartoons,lego movies,lego videos for kids",
  "title": "Lego City Police Lego Firetruck Cartoons about Lego City Movie for kids Episodes",
  "upload_date": "2016-09-28 00:00:00",
  "view_count": 1057.0
}

Sample of 5 records:
{'categories': 'Film & Animation', 'cha

In [4]:
# Draw up to SAMPLE_SIZE records (without replacement) into a pandas DataFrame.
# A dedicated, seeded generator makes the sample identical on every
# Restart & Run All and leaves the global `random` state untouched.
SAMPLE_SEED = 42
# random.sample raises ValueError when asked for more items than exist,
# so cap the request at the number of records actually read.
n_sample_json = min(SAMPLE_SIZE, len(records_json))
sample_records = random.Random(SAMPLE_SEED).sample(records_json, n_sample_json)
df_sample_json = pd.DataFrame(sample_records)
print(f"Sampled dataset shape: {df_sample_json.shape}")
df_sample_json.head()

Sampled dataset shape: (50000, 12)


Unnamed: 0,categories,channel_id,crawl_date,description,dislike_count,display_id,duration,like_count,tags,title,upload_date,view_count
0,Education,UCrjuXJV0UuvaeYvXjQdWDMw,2019-11-13 12:32:11.623277,"Past tense of 'Bite' and other Forms of Verb ""...",8.0,0cal500DZRE,240,60.0,"english academy,the english academy,learn engl...","Past tense of BITE and other Forms of Verb ""BI...",2018-08-24 00:00:00,6270.0
1,Gaming,UCrstvVjN1FYwuSm8IPSuyow,2019-11-01 11:19:03.289504,"With Update 12 and One Tamriel, important chan...",21.0,NylOXO-H6nE,115,135.0,"The Elder Scrolls Online,Craglorn,ZeniMax Medi...",The Elder Scrolls Online Guide: Return to Crag...,2016-09-15 00:00:00,11740.0
2,Nonprofits & Activism,UCrcrDXK620kopfSffPxrdTA,2019-11-02 09:31:38.375019,Website: http://www.sermonindex.net | Twitter:...,3.0,eFap1L1Pe0Q,1548,53.0,"Chuck,Smith,Revival,Repent,God,Sermons,Jesus,S...","Our Gracious, Compassionate, Merciful God by C...",2011-05-08 00:00:00,5210.0
3,News & Politics,UCruQg25yVBppUWjza8AlyZA,2019-10-30 19:48:25.790147,http://www.myfoxdfw.com\n\nFOX 4 News is a FOX...,0.0,-M7RuWz1oyk,131,0.0,"FOX 4,KDFW,Dallas,Fort Worth,Good Day",Simply Grace,2013-04-24 00:00:00,94.0
4,Music,UCriIdX-phM0jQB3VhL4NaLQ,2019-11-02 14:16:32.434657,http://musicfog.com Melissa McClelland & Luke ...,0.0,otWBw5pFigk,318,107.0,"Melissa McClelland,Luke Doucet,Glen Rio,Glenri...","Melissa McClelland & Luke Doucet ""Glenrio""",2011-07-23 00:00:00,16824.0


In [5]:
# Write the .jsonl.gz sample to ../SampleData/<JSON_FILE_NAME>_sample.csv
json_sample_path = f'../SampleData/{JSON_FILE_NAME}_sample.csv'
# to_csv does not create missing directories, so make sure the target folder exists
os.makedirs(os.path.dirname(json_sample_path), exist_ok=True)
# index=False: the positional row index is not written as a column
df_sample_json.to_csv(json_sample_path, index=False)
print(f"Saved sampled CSV to: {json_sample_path}")

### Read .jsonl.zst files

In [6]:
# Stream-read the first NUM_ITEMS lines of the .jsonl.zst file into a list.
# Sampling and saving to CSV are done in the cells that follow.
ZST_FILE_NAME = '_raw_yt_metadata'
ZST_PATH = f"../RawData/{ZST_FILE_NAME}.jsonl.zst"

records_zst = []
print(f"Reading up to {NUM_ITEMS:,} lines from {ZST_PATH}")
with open(ZST_PATH, 'rb') as fh:
    dctx = zstd.ZstdDecompressor()
    with dctx.stream_reader(fh) as reader:
        # Wrap the binary decompression stream so it can be iterated as UTF-8 text lines
        text_stream = io.TextIOWrapper(reader, encoding='utf-8')
        for i, line in enumerate(text_stream):
            if i >= NUM_ITEMS:
                break
            if not line.strip():
                # Blank lines (e.g. a trailing newline) are not valid JSON; skip them
                continue
            records_zst.append(json.loads(line))

print(f"Total records read: {len(records_zst):,}")
if records_zst:
    print("Example record (first):")
    print(json.dumps(records_zst[0], indent=2))
else:
    # records_zst[0] would raise IndexError on an empty list
    print("No records were read; nothing to preview.")

Reading up to 1,000,000 lines from ../RawData/_raw_yt_metadata.jsonl.zst
Total records read: 1,000,000
Example record (first):
{
  "categories": "Entertainment",
  "channel_id": "UCzzzrOhp92PkGrIwGH3_EEg",
  "crawl_date": "2019-11-22 18:08:02.988358",
  "description": "\u041c\u043e\u0439 \u0412\u041a: https://vk.com/holdik_clash\n\u041c\u043e\u0439 \u0438\u043d\u0441\u0442\u0430\u0433\u0440\u0430\u043c - https://www.instagram.com/vanya.holdik/\n\u041c\u043e\u0438 \u0432\u043b\u043e\u0433\u0438 - https://goo.gl/cFofas",
  "dislike_count": 48,
  "display_id": "e3qGZIDcMK4",
  "duration": 495,
  "like_count": 5138,
  "tags": "\u0445\u043e\u043b\u0434\u0438\u043a,\u0445\u043e\u043b\u0434\u0438\u043a \u0431\u043e\u043c\u0431\u0438\u0442,\u0445\u043e\u043b\u0434\u0438\u043a \u043d\u0430\u0440\u0435\u0437\u043a\u0430,\u043d\u0430\u0440\u0435\u0437\u043a\u0430 \u0441\u043e \u0441\u0442\u0440\u0438\u043c\u0430,\u043d\u0430\u0440\u0435\u0437\u043a\u0430 \u0441\u043e \u0441\u0442\u0440\u0438\u043

In [7]:
# Draw up to SAMPLE_SIZE records (without replacement) from the .jsonl.zst lines.
# A dedicated, seeded generator makes the sample identical on every
# Restart & Run All and leaves the global `random` state untouched.
SAMPLE_SEED = 42
# random.sample raises ValueError when asked for more items than exist,
# so cap the request at the number of records actually read.
n_sample_zst = min(SAMPLE_SIZE, len(records_zst))
sample_zst_records = random.Random(SAMPLE_SEED).sample(records_zst, n_sample_zst)
df_sample_zst = pd.DataFrame(sample_zst_records)
print(f"Sampled DataFrame shape: {df_sample_zst.shape}")
df_sample_zst.head()

Sampled DataFrame shape: (50000, 12)


Unnamed: 0,categories,channel_id,crawl_date,description,dislike_count,display_id,duration,like_count,tags,title,upload_date,view_count
0,Entertainment,UCzIXwi5CdyEyNp6MHCTpZWA,2019-11-07 20:52:53.865564,Copyright: Gold Music & Mirnes Dudic\n\nhttp:/...,0.0,7WLQSnS47jI,158,0.0,"Gold Music,tv,muza,televizija,emisija,show,sho...",Mirnes Dudic - Sto se igras - Gold Music - ( T...,2015-10-09 00:00:00,9
1,Gaming,UCzPKqMawrz7ov21YBEXAZzg,2019-11-14 16:40:13.257589,RESOURCEFUL! - Fallout Origins #1.5 (Minecraft...,0.0,IVcrLxBDDlo,4756,116.0,"minecraft post apocalypse mod,minecraft post a...",RESOURCEFUL! - Fallout Origins #1.5 (Minecraft...,2018-12-16 00:00:00,3985
2,Entertainment,UCzR7770PbrKcG9OYGzBep9w,2019-10-30 08:29:06.136159,"Also, do not forget to subscribe to Bollywood ...",5.0,ql_jOUP5xHM,874,156.0,"kajol movie interview,incredible 2 movie inter...",Kajol's AMAZING full interview on Incredibles ...,2018-06-20 00:00:00,15884
3,Entertainment,UCzo4OXE8JxogJHWJ2SypiNg,2019-10-31 23:42:58.138880,"NEW ""Strike Out"" HATS AND SHIRTS! http://www.O...",30.0,IPfHRerNhIg,567,1136.0,"mornin oats,weight loss,john glaude,how to los...",Mornin' Oats: Do You NEED to Track Macros to L...,2016-07-17 00:00:00,35630
4,Entertainment,UCzsvSXSk01GJc0QDUt7eYiA,2019-10-30 01:40:35.396941,Disney Cinemagic Germany - LILO & STITCH - Pro...,4.0,Ucw9tia5a9I,30,13.0,"bumper,animation,scandinavia,skandinavien,sver...",Disney Cinemagic Germany - LILO & STITCH - Promo,2011-12-23 00:00:00,9204


In [None]:
# Write the .jsonl.zst sample to ../SampleData/<ZST_FILE_NAME>_sample.csv
output_path = f"../SampleData/{ZST_FILE_NAME}_sample.csv"
# to_csv does not create missing directories, so make sure the target folder exists
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# index=False: the positional row index is not written as a column
df_sample_zst.to_csv(output_path, index=False)
# Confirmation message; the recorded cell output shows this line, but the
# print call itself was missing from the cell.
print(f"Saved sampled CSV to: {output_path}")

Saved sampled CSV to: ../SampleData/_raw_yt_metadata_sample.csv


### Test files

In [9]:
# Reload both sample CSVs from disk so the written files themselves are inspected.
# df1 <- sample taken from the .jsonl.gz file, df2 <- sample taken from the .jsonl.zst file
json_sample_csv = f"../SampleData/{JSON_FILE_NAME}_sample.csv"
zst_sample_csv = f"../SampleData/{ZST_FILE_NAME}_sample.csv"
df1 = pd.read_csv(json_sample_csv)
df2 = pd.read_csv(zst_sample_csv)

In [10]:
# Reloaded .jsonl.gz sample: print (rows, columns), then preview the leading rows
shape_df1 = df1.shape
print(shape_df1)
df1.head(n=5)

(50000, 12)


Unnamed: 0,categories,channel_id,crawl_date,description,dislike_count,display_id,duration,like_count,tags,title,upload_date,view_count
0,Education,UCrjuXJV0UuvaeYvXjQdWDMw,2019-11-13 12:32:11.623277,"Past tense of 'Bite' and other Forms of Verb ""...",8.0,0cal500DZRE,240,60.0,"english academy,the english academy,learn engl...","Past tense of BITE and other Forms of Verb ""BI...",2018-08-24 00:00:00,6270.0
1,Gaming,UCrstvVjN1FYwuSm8IPSuyow,2019-11-01 11:19:03.289504,"With Update 12 and One Tamriel, important chan...",21.0,NylOXO-H6nE,115,135.0,"The Elder Scrolls Online,Craglorn,ZeniMax Medi...",The Elder Scrolls Online Guide: Return to Crag...,2016-09-15 00:00:00,11740.0
2,Nonprofits & Activism,UCrcrDXK620kopfSffPxrdTA,2019-11-02 09:31:38.375019,Website: http://www.sermonindex.net | Twitter:...,3.0,eFap1L1Pe0Q,1548,53.0,"Chuck,Smith,Revival,Repent,God,Sermons,Jesus,S...","Our Gracious, Compassionate, Merciful God by C...",2011-05-08 00:00:00,5210.0
3,News & Politics,UCruQg25yVBppUWjza8AlyZA,2019-10-30 19:48:25.790147,http://www.myfoxdfw.com\n\nFOX 4 News is a FOX...,0.0,-M7RuWz1oyk,131,0.0,"FOX 4,KDFW,Dallas,Fort Worth,Good Day",Simply Grace,2013-04-24 00:00:00,94.0
4,Music,UCriIdX-phM0jQB3VhL4NaLQ,2019-11-02 14:16:32.434657,http://musicfog.com Melissa McClelland & Luke ...,0.0,otWBw5pFigk,318,107.0,"Melissa McClelland,Luke Doucet,Glen Rio,Glenri...","Melissa McClelland & Luke Doucet ""Glenrio""",2011-07-23 00:00:00,16824.0


In [11]:
# Reloaded .jsonl.zst sample: print (rows, columns), then preview the leading rows
shape_df2 = df2.shape
print(shape_df2)
df2.head(n=5)

(50000, 12)


Unnamed: 0,categories,channel_id,crawl_date,description,dislike_count,display_id,duration,like_count,tags,title,upload_date,view_count
0,Entertainment,UCzIXwi5CdyEyNp6MHCTpZWA,2019-11-07 20:52:53.865564,Copyright: Gold Music & Mirnes Dudic\n\nhttp:/...,0.0,7WLQSnS47jI,158,0.0,"Gold Music,tv,muza,televizija,emisija,show,sho...",Mirnes Dudic - Sto se igras - Gold Music - ( T...,2015-10-09 00:00:00,9
1,Gaming,UCzPKqMawrz7ov21YBEXAZzg,2019-11-14 16:40:13.257589,RESOURCEFUL! - Fallout Origins #1.5 (Minecraft...,0.0,IVcrLxBDDlo,4756,116.0,"minecraft post apocalypse mod,minecraft post a...",RESOURCEFUL! - Fallout Origins #1.5 (Minecraft...,2018-12-16 00:00:00,3985
2,Entertainment,UCzR7770PbrKcG9OYGzBep9w,2019-10-30 08:29:06.136159,"Also, do not forget to subscribe to Bollywood ...",5.0,ql_jOUP5xHM,874,156.0,"kajol movie interview,incredible 2 movie inter...",Kajol's AMAZING full interview on Incredibles ...,2018-06-20 00:00:00,15884
3,Entertainment,UCzo4OXE8JxogJHWJ2SypiNg,2019-10-31 23:42:58.138880,"NEW ""Strike Out"" HATS AND SHIRTS! http://www.O...",30.0,IPfHRerNhIg,567,1136.0,"mornin oats,weight loss,john glaude,how to los...",Mornin' Oats: Do You NEED to Track Macros to L...,2016-07-17 00:00:00,35630
4,Entertainment,UCzsvSXSk01GJc0QDUt7eYiA,2019-10-30 01:40:35.396941,Disney Cinemagic Germany - LILO & STITCH - Pro...,4.0,Ucw9tia5a9I,30,13.0,"bumper,animation,scandinavia,skandinavien,sver...",Disney Cinemagic Germany - LILO & STITCH - Promo,2011-12-23 00:00:00,9204
