# Imports

In [4]:
import pandas as pd
import numpy as np



# Set path to data files

In [5]:
dataset_root_path = "/media/andreas/Backup Plus/youniverse_dataset/"

In [6]:
# load channel data
df_channels = pd.read_csv(dataset_root_path + "df_channels_en.tsv.gz", compression="infer", sep="\t")

# load (first 100000 rows of) video data
df_videos = pd.read_json(dataset_root_path + "yt_metadata_en.jsonl.gz", compression="infer", lines=True, nrows=100000)

# load (first 1000000 rows of) comment data
df_comments = pd.read_csv(dataset_root_path + "youtube_comments.tsv.gz", compression="infer", sep="\t", nrows=1000000)

In [None]:
df_videos.head()

In [7]:
df_comments.head()

Unnamed: 0,author,video_id,likes,replies
0,1,Gkb1QMHrGvA,2,0
1,1,CNtp0xqoods,0,0
2,1,249EEzQmVmQ,1,0
3,1,_U443T2K_Bs,0,0
4,1,rJbjhm0weYc,0,0


In [22]:
def videos_in_chunks(chunksize: int = 100000):
    return pd.read_json(dataset_root_path + "yt_metadata_en.jsonl.gz", 
                        compression="infer", lines=True, chunksize=chunksize,)

def comments_in_chunks(chunksize: int = 1000000):
    return pd.read_csv(dataset_root_path + "youtube_comments.tsv.gz", 
                       compression="infer", sep="\t", chunksize=chunksize, )

In [21]:
df_videos.head()

Unnamed: 0,categories,channel_id,crawl_date,description,dislike_count,display_id,duration,like_count,tags,title,upload_date,view_count
0,Film & Animation,UCzWrhkg9eK5I8Bm3HfV-unA,2019-10-31 20:19:26.270363,Lego City Police Lego Firetruck Cartoons about...,1.0,SBqSc91Hn9g,1159,8.0,"lego city,lego police,lego city police,lego ci...",Lego City Police Lego Firetruck Cartoons about...,2016-09-28 00:00:00,1057
1,Film & Animation,UCzWrhkg9eK5I8Bm3HfV-unA,2019-10-31 20:19:26.914516,Lego Marvel SuperHeroes Lego Hulk Smash Iron-M...,1.0,UuugEl86ESY,2681,23.0,"Lego superheroes,lego hulk,hulk smash,lego mar...",Lego Marvel SuperHeroes Lego Hulk Smash Iron-M...,2016-09-28 00:00:00,12894
2,Film & Animation,UCzWrhkg9eK5I8Bm3HfV-unA,2019-10-31 20:19:26.531203,Lego City Police Lego Fireman Cartoons about L...,779.0,oB4c-yvnbjs,1394,1607.0,"lego city,lego police,lego city police,lego fi...",Lego City Police Lego Fireman Cartoons about L...,2016-09-28 00:00:00,1800602
3,Film & Animation,UCzWrhkg9eK5I8Bm3HfV-unA,2019-10-31 20:19:28.335329,Lego Harry Potter Complete Lego New Movie for ...,24.0,ZaV-gTCMV8E,5064,227.0,"Lego harry potter,new harry potter,harry potte...",Lego Harry Potter Complete Lego New Movie for ...,2016-09-28 00:00:00,57640
4,Film & Animation,UCzWrhkg9eK5I8Bm3HfV-unA,2019-10-31 20:19:30.328487,Lego City Police LONG VIDEO for kids Lego Fire...,13.0,cGvL7AvMfM0,3554,105.0,"lego city,lego police,lego city police,lego fi...",Lego City Police 1 HOUR LONG VIDEO for kids Le...,2016-09-28 00:00:00,86368


In [23]:
def filter_vid_by_channel(channel_id: str, video_chunksize: int = 100000) -> pd.DataFrame :
    videos_filtered = pd.DataFrame(dict())
    with videos_in_chunks(video_chunksize) as reader:
        for i, video_chunk in enumerate(reader):

            print(f"Going through video chunk {i}...")
            videos_filtered = pd.concat([videos_filtered, video_chunk.loc[video_chunk.channel_id == channel_id]])
            print(f"The first {(i+1) * video_chunksize} videos have been processed.")
    return videos_filtered

def filter_comment_by_channel(channel_id: str,video_chunksize: int = 100000, comment_chunksize: int = 1000000) -> pd.DataFrame :
    comments_filtered = pd.DataFrame(dict())
    filtered_videos = filter_vid_by_channel(channel_id, video_chunksize=video_chunksize)
    print(f"Videos have been filtered by channel, {len(filtered_videos)} videos found. \nNow going through comments....")
    with comments_in_chunks(comment_chunksize) as reader:
        for i, comment_chunk in enumerate(reader):
            print(f"Going through comment chunk {i}...")
            comments_filtered = pd.concat([comments_filtered, 
                                comment_chunk.loc[comment_chunk.video_id.isin(filtered_videos)]])
            print(f"The first {(i+1) * comment_chunksize} comments have been processed")
    return comments_filtered

In [None]:
filtered_comments_test = filter_comment_by_channel("UCzWrhkg9eK5I8Bm3HfV-unA")

Going through video chunk 0...
The first 100000 videos have been processed.
Going through video chunk 1...
The first 200000 videos have been processed.
Going through video chunk 2...
The first 300000 videos have been processed.
Going through video chunk 3...
The first 400000 videos have been processed.
Going through video chunk 4...
The first 500000 videos have been processed.
Going through video chunk 5...
The first 600000 videos have been processed.
Going through video chunk 6...
The first 700000 videos have been processed.
Going through video chunk 7...
The first 800000 videos have been processed.
Going through video chunk 8...
The first 900000 videos have been processed.
Going through video chunk 9...
The first 1000000 videos have been processed.
Going through video chunk 10...
The first 1100000 videos have been processed.
Going through video chunk 11...
The first 1200000 videos have been processed.
Going through video chunk 12...
The first 1300000 videos have been processed.
Going

In [15]:
filtered_comments_test.head()

Unnamed: 0,author,video_id,likes,replies
