# Imports

In [189]:
import pandas as pd
import numpy as np

import time
import importlib

import data_processing as dp  # own functions and logic
importlib.reload(dp)

<module 'data_processing' from '/home/andreas/Nextcloud/Dokumente/Uni/Module/3sem-EPFL/ada/Project/ada-2024-project-thedataminions/preprocessing_tests/data_processing.py'>

# Set path to data files

In [179]:
# Path to the folder where the YouNiverse dataset is stored.
#
# When adding your own path, do not remove the existing one; just comment it out.
# That way, everyone can quickly uncomment their own path.
dataset_root_path = "/media/andreas/Backup Plus/youniverse_dataset/"

In [180]:
# Optional cell: loads (parts of) the dataset directly into memory, for quick tests
# which do not go through the data in chunks. Keep it commented out when not needed.

# channel data (the whole file)
df_channels = pd.read_csv(
    dataset_root_path + "df_channels_en.tsv.gz",
    compression="infer",
    sep="\t",
)

# video metadata (only the first 100000 rows)
df_videos = pd.read_json(
    dataset_root_path + "yt_metadata_en.jsonl.gz",
    compression="infer",
    lines=True,
    nrows=100000,
)

# comment data (only the first 1000000 rows)
df_comments = pd.read_csv(
    dataset_root_path + "youtube_comments.tsv.gz",
    compression="infer",
    sep="\t",
    nrows=1000000,
)

In [None]:
# definition of "readers", i.e., objects that we can iterate through 
# and always get a chunk of the dataframe in each iteration

def videos_in_chunks(chunksize: int = 100000, nrows: "int | None" = None):
    """
    Returns a reader for the video metadata file, which yields one dataframe chunk per iteration.

    Args:
        chunksize: number of videos (lines of the jsonl file) in each chunk. Default is 100 000.
        nrows: if given, only the first nrows videos are read (e.g., 1000000 for quick tests).
            Default is None, meaning that the whole file is read.

    Returns:
        The reader object returned by pd.read_json. It can be iterated over, and it can be
        used in a "with" statement so that the file is closed afterwards.
    """
    return pd.read_json(dataset_root_path + "yt_metadata_en.jsonl.gz",
                        compression="infer", lines=True, chunksize=chunksize, nrows=nrows)

def comments_in_chunks(chunksize: int = 1000000, nrows: "int | None" = None):
    """
    Returns a reader for the comment data file, which yields one dataframe chunk per iteration.

    Args:
        chunksize: number of comments (rows of the tsv file) in each chunk. Default is 1 000 000.
        nrows: if given, only the first nrows comments are read (e.g., 10000000 for quick tests).
            Default is None, meaning that the whole file is read.

    Returns:
        The reader object returned by pd.read_csv. It can be iterated over, and it can be
        used in a "with" statement so that the file is closed afterwards.
    """
    return pd.read_csv(dataset_root_path + "youtube_comments.tsv.gz",
                       compression="infer", sep="\t", chunksize=chunksize, nrows=nrows)

In [182]:
def filter_vid_by_channel(channel_id: str, video_chunksize: int = 100000) -> pd.DataFrame :
    """
    Returns a dataframe which is the video metadata dataframe, filtered so that only videos from the 
    given channel remain.

    The function does this by going through the dataset in chunks of a specified size.
    After each chunk, the progress and an estimate of the remaining time are printed.

    Args:
        channel_id :  id of the channel which the videos will be filtered by
        video_chunksize :  number of entries in each chunk. Default is 100 000.

    Returns:
        The filtered dataframe. If the reader does not yield any chunk, an empty dataframe
        (without columns) is returned.
    """

    # assumed total number of videos in the dataset; only used for the time estimate
    total_videos = 72924794

    # the matching rows of each chunk are collected here and concatenated once at the end,
    # instead of copying the growing result again for every chunk
    matching_parts = []
    videos_processed = 0

    with videos_in_chunks(video_chunksize) as reader:
        time_start_global = time.time()
        for i, video_chunk in enumerate(reader):
            print(f"Going through video chunk {i}...")
            matching_parts.append(video_chunk.loc[video_chunk.channel_id == channel_id])

            # count the actual rows, since the last chunk can be shorter than video_chunksize
            videos_processed += len(video_chunk)
            print(f"The first {videos_processed} videos have been processed.")

            elapsed_secs = time.time() - time_start_global
            secs_per_chunk = elapsed_secs / (i + 1)
            # (number of chunks in the dataset) * (average time per chunk), in minutes
            mins_total = total_videos / video_chunksize * secs_per_chunk / 60
            mins_left = mins_total - elapsed_secs / 60
            print(f"{secs_per_chunk:.3f} secs per chunk on average. Meaning {mins_total:.3f} mins for entire dataset, i.e., {mins_left:.3f} mins left.")

    if not matching_parts:
        # pd.concat raises an error for an empty list
        return pd.DataFrame(dict())
    return pd.concat(matching_parts)

def filter_comment_by_channel(channel_id: str,video_chunksize: int = 100000, comment_chunksize: int = 1000000) -> pd.DataFrame :
    """
    Returns a dataframe which is the comment data, but filtered so that only comments
    made on videos which were published by a certain channel are left.

    The function first finds the videos of the channel (using filter_vid_by_channel), and then
    goes through the comment dataset in chunks of a defined size.
    After each chunk, the progress and an estimate of the remaining time are printed.

    Args:
        channel_id: id of the channel to be filtered by
        video_chunksize: number of entries per chunk when going through the videos (to find the videos uploaded by a certain channel)
        comment_chunksize: number of entries per chunk when going through the comments

    Returns:
        The filtered dataframe. If the reader does not yield any chunk, an empty dataframe
        (without columns) is returned.
    """

    # assumed total number of comments in the dataset; only used for the time estimate
    total_comments = 8600000000

    filtered_videos = filter_vid_by_channel(channel_id, video_chunksize=video_chunksize)
    print(f"Videos have been filtered by channel, {len(filtered_videos)} videos found. \nNow going through comments....")

    # ids of the videos of the channel; these do not change while going through the comments
    wanted_video_ids = filtered_videos.display_id

    # the matching rows of each chunk are collected here and concatenated once at the end,
    # instead of copying the growing result again for every chunk
    matching_parts = []
    comments_processed = 0

    with comments_in_chunks(comment_chunksize) as reader:
        time_start_global = time.time()
        for i, comment_chunk in enumerate(reader):
            print(f"Going through comment chunk {i}...")
            matching_parts.append(comment_chunk.loc[comment_chunk.video_id.isin(wanted_video_ids)])

            # count the actual rows, since the last chunk can be shorter than comment_chunksize
            comments_processed += len(comment_chunk)
            print(f"The first {comments_processed} comments have been processed")

            elapsed_secs = time.time() - time_start_global
            secs_per_chunk = elapsed_secs / (i + 1)
            # (number of chunks in the dataset) * (average time per chunk), in minutes
            mins_total = total_comments / comment_chunksize * secs_per_chunk / 60
            mins_left = mins_total - elapsed_secs / 60
            print(f"{secs_per_chunk:.3f} secs per chunk on average.Meaning {mins_total:.3f} mins for entire dataset, i.e., {mins_left:.3f} mins left.")

    if not matching_parts:
        # pd.concat raises an error for an empty list
        return pd.DataFrame(dict())
    return pd.concat(matching_parts)

In [183]:
# test the above functions by collecting all comments made on the videos of a certain channel
# filtered_comments_test = filter_comment_by_channel("UCzWrhkg9eK5I8Bm3HfV-unA", video_chunksize=100000, comment_chunksize=1000000)

In [184]:
# filtered_comments_test

In [None]:
# Apply dp.get_na_entries to every chunk of the comment data and keep the combined result.
# NOTE(review): print_time looks like (entries per chunk, total number of entries), judging
# from the progress messages it produces - confirm in data_processing.py.
nans = dp.run_simple_function_on_chunks(
    comments_in_chunks(),
    lambda chunk: dp.get_na_entries(chunk, "any", False),
    print_time=(1000000, 8600000000),
)



Going through chunk 0...
The first 1000000 entries have been processed. 8599000000 left.
0.677 secs per chunk on average. Meaning  97.075 minutes left.
Going through chunk 1...
The first 2000000 entries have been processed. 8598000000 left.
0.726 secs per chunk on average. Meaning  104.056 minutes left.
Going through chunk 2...
The first 3000000 entries have been processed. 8597000000 left.
0.748 secs per chunk on average. Meaning  107.159 minutes left.
Going through chunk 3...
The first 4000000 entries have been processed. 8596000000 left.
1.065 secs per chunk on average. Meaning  152.523 minutes left.
Going through chunk 4...
The first 5000000 entries have been processed. 8595000000 left.
1.354 secs per chunk on average. Meaning  193.941 minutes left.
Going through chunk 5...
The first 6000000 entries have been processed. 8594000000 left.
1.783 secs per chunk on average. Meaning  255.335 minutes left.
Going through chunk 6...
The first 7000000 entries have been processed. 8593000000 

In [186]:
# show what the chunk-wise dp.get_na_entries call returned
display(nans)

Unnamed: 0,author,video_id,likes,replies
0,1,Gkb1QMHrGvA,2,0
1,1,CNtp0xqoods,0,0
2,1,249EEzQmVmQ,1,0
3,1,_U443T2K_Bs,0,0
4,1,rJbjhm0weYc,0,0
...,...,...,...,...
9999995,664459,GC3gqIbrK7c,9,1
9999996,664459,GC3gqIbrK7c,1,0
9999997,664459,i9VRGaoFw8k,1,1
9999998,664459,-JLWZ1jz3FY,0,2


In [192]:
# Apply dp.count_na_entries to every chunk of the comment data, then sum the
# per-chunk results over all chunks (column-wise).
# NOTE(review): print_time looks like (entries per chunk, total number of entries), judging
# from the progress messages it produces - confirm in data_processing.py.
counts_per_chunk = dp.run_simple_function_on_chunks(
    comments_in_chunks(),
    lambda chunk: dp.count_na_entries(chunk, "any", False),
    print_time=(1000000, 8600000000),
)
counted_nans = counts_per_chunk.sum(axis=0)



Going through chunk 0...
The first 1000000 entries have been processed. 8599000000 left.
0.630 secs per chunk on average. Meaning  90.327 minutes left.
Going through chunk 1...
The first 2000000 entries have been processed. 8598000000 left.
0.647 secs per chunk on average. Meaning  92.675 minutes left.
Going through chunk 2...
The first 3000000 entries have been processed. 8597000000 left.
0.652 secs per chunk on average. Meaning  93.408 minutes left.
Going through chunk 3...
The first 4000000 entries have been processed. 8596000000 left.
0.655 secs per chunk on average. Meaning  93.778 minutes left.
Going through chunk 4...
The first 5000000 entries have been processed. 8595000000 left.
0.653 secs per chunk on average. Meaning  93.605 minutes left.
Going through chunk 5...
The first 6000000 entries have been processed. 8594000000 left.
0.652 secs per chunk on average. Meaning  93.418 minutes left.
Going through chunk 6...
The first 7000000 entries have been processed. 8593000000 left.

In [None]:
# show the counts summed over all processed chunks
display(counted_nans)

na rows              0
total rows    10000000
dtype: int64