# Imports

In [83]:
import pandas as pd
import numpy as np

import time


import data_processing as dp  # own functions and logic will

# Set path to data files

In [84]:
# Define the path to the folder where the YouNiverse dataset is stored here.
# The path must end with a path separator ("/"), because the file names are
# appended to it by plain string concatenation further down in this notebook.

# When adding your own path, don't remove the existing path, just comment it out.
# In this way, everyone can quickly uncomment their own path.
dataset_root_path = "/media/andreas/Backup Plus/youniverse_dataset/"

In [85]:
# This is only needed if we want to test something on the dataset without using chunks. Otherwise, keep commented


# # load channel data
# df_channels = pd.read_csv(dataset_root_path + "df_channels_en.tsv.gz", compression="infer", sep="\t")

# # load (first 100000 rows of) video data
# df_videos = pd.read_json(dataset_root_path + "yt_metadata_en.jsonl.gz", compression="infer", lines=True, nrows=100000)

# # load (first 1000000 rows of) comment data
# df_comments = pd.read_csv(dataset_root_path + "youtube_comments.tsv.gz", compression="infer", sep="\t", nrows=1000000)

In [86]:
# definition of "readers", i.e., objects that we can iterate through 
# and always get a chunk of the dataframe in each iteration

def videos_in_chunks(chunksize: int = 100000, nrows: int = None):
    """
    Returns a reader over the video metadata file ("yt_metadata_en.jsonl.gz").
    Iterating over the reader yields one dataframe chunk per iteration.

    Args:
        chunksize :  number of videos (lines of the jsonl file) per chunk. Default is 100 000.
        nrows :  if given, only the first nrows lines of the file are read, e.g.,
                 nrows=1000000 to test something on the first million videos.
                 Default is None, i.e., the whole file is read.

    Returns:
        The reader object created by pd.read_json (usable in a with-statement).
    """
    # nrows replaces the former "uncomment this line and move the parenthesis" workflow
    return pd.read_json(dataset_root_path + "yt_metadata_en.jsonl.gz",
                        compression="infer", lines=True, chunksize=chunksize, nrows=nrows)

def comments_in_chunks(chunksize: int = 1000000, nrows: int = None):
    """
    Returns a reader over the comment file ("youtube_comments.tsv.gz").
    Iterating over the reader yields one dataframe chunk per iteration.

    Args:
        chunksize :  number of comments (rows of the tsv file) per chunk. Default is 1 000 000.
        nrows :  if given, only the first nrows rows of the file are read, e.g.,
                 nrows=10000000 to test something on the first 10 million comments.
                 Default is None, i.e., the whole file is read.

    Returns:
        The reader object created by pd.read_csv (usable in a with-statement).
    """
    # nrows replaces the former "uncomment this line and move the parenthesis" workflow
    return pd.read_csv(dataset_root_path + "youtube_comments.tsv.gz",
                       compression="infer", sep="\t", chunksize=chunksize, nrows=nrows)

In [87]:
def filter_vid_by_channel(channel_id: str, video_chunksize: int = 100000,
                          total_videos: int = 72924794) -> pd.DataFrame :
    """
    Returns a dataframe which is the video metadata dataframe, filtered so that only videos from the 
    given channel remain.

    The function does this by going through the dataset in chunks of a specified size.
    After every chunk, it prints the progress and an estimate of the remaining time.

    Args:
        channel_id :  id of the channel whose videos are kept
        video_chunksize :  number of entries in each chunk. Default is 100 000.
        total_videos :  assumed total number of videos in the dataset. It is only used
                        for the printed time estimate. Default is 72 924 794.

    Returns:
        The filtered dataframe. If the reader yields no chunk at all, an empty dataframe 
        without columns is returned.
    """

    # Matches are collected per chunk and concatenated once at the end. Concatenating
    # inside the loop would copy all previously found rows again for every chunk.
    matching_chunks = []

    with videos_in_chunks(video_chunksize) as reader:
        time_start_global = time.time()
        for i, video_chunk in enumerate(reader):
            print(f"Going through video chunk {i}...")
            matching_chunks.append(video_chunk.loc[video_chunk.channel_id == channel_id])
            print(f"The first {(i+1) * video_chunksize} videos have been processed.")

            # time estimate: average duration per chunk, extrapolated to the assumed dataset size
            elapsed_secs = time.time() - time_start_global
            secs_per_chunk = elapsed_secs / (i+1)
            total_mins = total_videos / video_chunksize * secs_per_chunk / 60
            print(f"{secs_per_chunk:.3f} secs per chunk on average. Meaning {total_mins:.3f} mins for entire dataset, i.e., {total_mins - elapsed_secs/60:.3f} mins left.")

    # pd.concat raises a ValueError for an empty list, so this case is handled separately
    if not matching_chunks:
        return pd.DataFrame(dict())
    return pd.concat(matching_chunks)

def filter_comment_by_channel(channel_id: str,video_chunksize: int = 100000, comment_chunksize: int = 1000000,
                              total_comments: int = 8600000000) -> pd.DataFrame :
    """
    Returns a dataframe which is the comment data, but filtered so that only comments
    made on videos which were published by a certain channel are left.

    The function first collects the videos of the channel (see filter_vid_by_channel) and
    then goes through the comment dataset in chunks of a defined size. After every chunk,
    it prints the progress and an estimate of the remaining time.

    Args:
        channel_id: id of the channel to be filtered by
        video_chunksize: number of entries per chunk when going through the videos (to find the videos uploaded by a certain channel)
        comment_chunksize: number of entries per chunk when going through the comments
        total_comments: assumed total number of comments in the dataset. It is only used
                        for the printed time estimate. Default is 8 600 000 000.

    Returns:
        The filtered dataframe. If the reader yields no chunk at all, an empty dataframe
        without columns is returned.
    """

    filtered_videos = filter_vid_by_channel(channel_id, video_chunksize=video_chunksize)
    print(f"Videos have been filtered by channel, {len(filtered_videos)} videos found. \nNow going through comments....")

    # The ids to look for are the same for every comment chunk, so they are extracted once
    channel_video_ids = filtered_videos.display_id.unique()

    # Matches are collected per chunk and concatenated once at the end. Concatenating
    # inside the loop would copy all previously found rows again for every chunk.
    matching_chunks = []

    with comments_in_chunks(comment_chunksize) as reader:
        time_start_global = time.time()
        for i, comment_chunk in enumerate(reader):
            print(f"Going through comment chunk {i}...")
            matching_chunks.append(comment_chunk.loc[comment_chunk.video_id.isin(channel_video_ids)])
            print(f"The first {(i+1) * comment_chunksize} comments have been processed")

            # time estimate: average duration per chunk, extrapolated to the assumed dataset size
            elapsed_secs = time.time() - time_start_global
            secs_per_chunk = elapsed_secs / (i+1)
            total_mins = total_comments / comment_chunksize * secs_per_chunk / 60
            print(f"{secs_per_chunk:.3f} secs per chunk on average. Meaning {total_mins:.3f} mins for entire dataset, i.e., {total_mins - elapsed_secs/60:.3f} mins left.")

    # pd.concat raises a ValueError for an empty list, so this case is handled separately
    if not matching_chunks:
        return pd.DataFrame(dict())
    return pd.concat(matching_chunks)

In [88]:
# smoke test of the two filter functions: collect all comments on the videos of one channel
filtered_comments_test = filter_comment_by_channel(
    "UCzWrhkg9eK5I8Bm3HfV-unA",
    video_chunksize=100000,
    comment_chunksize=1000000,
)

Going through video chunk 0...
The first 100000 videos have been processed.
2.307 secs per chunk on average. Meaning 28.041 mins for entire dataset, i.e., 28.003 mins left.
Going through video chunk 1...
The first 200000 videos have been processed.
3.337 secs per chunk on average. Meaning 40.560 mins for entire dataset, i.e., 40.449 mins left.
Going through video chunk 2...
The first 300000 videos have been processed.
2.968 secs per chunk on average. Meaning 36.068 mins for entire dataset, i.e., 35.919 mins left.


KeyboardInterrupt: 

In [None]:
# Show the filtered comments (as the last expression of the cell, the dataframe is rendered as output).
# NOTE(review): the preceding cell ends in a KeyboardInterrupt in the saved run, so the output
# shown for this cell presumably stems from an earlier execution -- re-run to confirm.
filtered_comments_test

Unnamed: 0,author,video_id,likes,replies
132678,9352,qr9Hm1pTZKA,6,0
895322,58129,zj5TOsMZ-a4,0,0
6868268,453667,3vQK78eUg2A,2,1
7094579,468696,SWZG-ba1qDk,15,18
8912192,594074,hn2zYwqSINY,0,1
...,...,...,...,...
8593459329,575741522,kYkokQgnu20,6,8
8594971150,575837890,qr9Hm1pTZKA,1,0
8595593308,575877790,ObYU8s2psvQ,0,0
8604498138,576474588,ikPgqOPXiAw,0,0
