# Imports

In [189]:
import pandas as pd
import numpy as np

import time
import importlib

import data_processing as dp  # own functions and logic
importlib.reload(dp)

<module 'data_processing' from '/home/andreas/Nextcloud/Dokumente/Uni/Module/3sem-EPFL/ada/Project/ada-2024-project-thedataminions/preprocessing_tests/data_processing.py'>

# Set path to data files

In [179]:
# Path to the folder where the YouNiverse dataset is stored.
# The path has to end with a path separator, because the file names are appended
# to it by plain string concatenation in the cells below.

# When adding your own path, don't remove the existing path, just comment it out.
# In this way, everyone can quickly uncomment their own path.
dataset_root_path = "/media/andreas/Backup Plus/youniverse_dataset/"

In [180]:
# Optional convenience cell: loads (parts of) the dataset without chunking, for quick tests.
# It can stay commented out when only the chunked readers are used; note, however, that
# df_channels and df_videos are referenced directly by other cells of this notebook.

# channel data (complete file)
df_channels = pd.read_csv(
    dataset_root_path + "df_channels_en.tsv.gz",
    sep="\t",
    compression="infer",
)

# video metadata, limited to the first 100000 rows
df_videos = pd.read_json(
    dataset_root_path + "yt_metadata_en.jsonl.gz",
    lines=True,
    nrows=100000,
    compression="infer",
)

# comment data, limited to the first 1000000 rows
df_comments = pd.read_csv(
    dataset_root_path + "youtube_comments.tsv.gz",
    sep="\t",
    nrows=1000000,
    compression="infer",
)

In [203]:
# show the channel dataframe loaded above
df_channels

Unnamed: 0,category_cc,join_date,channel,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,Gaming,2010-04-29,UC-lHJZR3Gqxm24_Vd_AJ5Yw,PewDiePie,101000000,3956,3.0,2.0870
1,Education,2006-09-01,UCbCmjCuTUZos6Inko4u57UQ,Cocomelon - Nursery ...,60100000,458,7.0,2.0870
2,Entertainment,2006-09-20,UCpEhnqL0y41EpW2TvWAHD7Q,SET India,56018869,32661,8.0,2.0870
3,Howto & Style,2016-11-15,UC295-Dw_tDNtZXFeAPAW6Aw,5-Minute Crafts,60600000,3591,9.0,2.0870
4,Sports,2007-05-11,UCJ5v_MCY6GNUBTO8-D3XoAg,WWE,48400000,43421,11.0,2.0870
...,...,...,...,...,...,...,...,...
136465,Music,2016-10-06,UCuM-9AajUOwKw6ipOzu2DRQ,GONE.Fludd - Topic,10128,105,1008139.0,53.1435
136466,People & Blogs,2013-10-17,UCtW9jp5TH0YrgYpwiRf9t-Q,saidthestory,10100,352,1008644.0,53.1435
136467,Gaming,2015-05-08,UCTsxFTIUs8vFDzGccDm6i7Q,Omni H,10000,475,1009505.0,53.1435
136468,Music,2011-04-13,UC1HOArgRCMGPjlcmkThERwA,TĀLĀ,10000,15,1025119.0,53.1435


In [195]:
# number of videos per category among the video rows loaded into df_videos
df_videos.categories.value_counts()

categories
Entertainment            22685
Gaming                   21057
Music                    13442
People & Blogs            8910
News & Politics           7347
Howto & Style             4857
Travel & Events           4746
Science & Technology      4340
Comedy                    4197
Education                 3326
Sports                    1270
Film & Animation          1225
Pets & Animals            1128
Autos & Vehicles           738
Nonprofits & Activism      730
                             2
Name: count, dtype: int64

In [None]:
# definition of "readers", i.e., objects that we can iterate through 
# and always get a chunk of the dataframe in each iteration

def videos_in_chunks(chunksize: int = 100000, nrows=None):
    """
    Creates a "reader" for the video metadata file, i.e., an object that we can
    iterate through and that gives a chunk of the dataframe in each iteration.

    Args:
        chunksize: number of entries in each chunk. Default is 100 000.
        nrows: optional limit on the number of rows that are read, for testing
            (e.g. 1000000 to only use the first million videos). The value is passed
            on unchanged to pandas.read_json; the default None reads the whole file.
            NOTE(review): confirm how pandas applies this limit when it is not a
            multiple of chunksize.

    Returns:
        The reader object returned by pandas.read_json
    """
    return pd.read_json(dataset_root_path + "yt_metadata_en.jsonl.gz",
                        compression="infer", lines=True, chunksize=chunksize, nrows=nrows)
def comments_in_chunks(chunksize: int = 1000000, nrows=None):
    """
    Creates a "reader" for the comment data file, i.e., an object that we can
    iterate through and that gives a chunk of the dataframe in each iteration.

    Args:
        chunksize: number of entries in each chunk. Default is 1 000 000.
        nrows: optional limit on the number of rows that are read, for testing
            (e.g. 10000000 to only use the first 10 million comments). The value is
            passed on unchanged to pandas.read_csv; the default None reads the whole file.

    Returns:
        The reader object returned by pandas.read_csv
    """
    return pd.read_csv(dataset_root_path + "youtube_comments.tsv.gz",
                       compression="infer", sep="\t", chunksize=chunksize, nrows=nrows)

In [182]:
def filter_vid_by_channel(channel_id: str, video_chunksize: int = 100000) -> pd.DataFrame :
    """
    Returns a dataframe which is the video metadata dataframe, filtered so that only videos from the 
    given channel remain.

    The function does this by going through the dataset in chunks of a specified size.
    After every chunk, the progress and an estimate of the remaining time are printed.

    Args:
        channel_id :  id of the channel which the videos will be filtered by
        video_chunksize :  number of entries in each chunk. Default is 100 000.

    Returns:
        The filtered dataframe (an empty dataframe if the reader did not give any chunk)
    """

    # total number of videos that is assumed for the time estimate
    total_videos = 72924794

    # The matching rows of every chunk are collected in a list and concatenated once at
    # the end. Concatenating onto a growing dataframe in every iteration would copy all
    # rows found so far again for each chunk.
    matches_per_chunk = []

    with videos_in_chunks(video_chunksize) as reader:
        time_start_global = time.time()
        for i, video_chunk in enumerate(reader):
            print(f"Going through video chunk {i}...")
            matches_per_chunk.append(video_chunk.loc[video_chunk.channel_id == channel_id])
            print(f"The first {(i+1) * video_chunksize} videos have been processed.")
            elapsed = time.time() - time_start_global
            # extrapolate the average time per chunk to the whole dataset (in minutes)
            mins_total = total_videos / video_chunksize * elapsed/((i+1)*60)
            print(f"{elapsed/(i+1):.3f} secs per chunk on average. Meaning {mins_total:.3f} mins for entire dataset, i.e., {mins_total - elapsed/60:.3f} mins left.")

    if not matches_per_chunk:
        # pd.concat does not accept an empty list
        return pd.DataFrame(dict())
    return pd.concat(matches_per_chunk)

def filter_comment_by_channel(channel_id: str,video_chunksize: int = 100000, comment_chunksize: int = 1000000) -> pd.DataFrame :
    """
    Returns a dataframe which is the comment data, but filtered so that only comments
    made on videos which were published by a certain channel are left.

    The function first finds the videos of the channel with filter_vid_by_channel and then
    goes through the comment dataset in chunks of a defined size. After every chunk, the
    progress and an estimate of the remaining time are printed.

    Args:
        channel_id: id of the channel to be filtered by
        video_chunksize: number of entries per chunk when going through the videos (to find the videos uploaded by a certain channel)
        comment_chunksize: number of entries per chunk when going through the comments

    Returns:
        The filtered dataframe (an empty dataframe if the comment reader did not give any chunk)
    """

    # total number of comments that is assumed for the time estimate
    total_comments = 8600000000

    filtered_videos = filter_vid_by_channel(channel_id, video_chunksize=video_chunksize)
    print(f"Videos have been filtered by channel, {len(filtered_videos)} videos found. \nNow going through comments....")

    # The matching rows of every chunk are collected in a list and concatenated once at
    # the end. Concatenating onto a growing dataframe in every iteration would copy all
    # rows found so far again for each chunk.
    matches_per_chunk = []

    with comments_in_chunks(comment_chunksize) as reader:
        time_start_global = time.time()
        for i, comment_chunk in enumerate(reader):
            print(f"Going through comment chunk {i}...")
            # keep the comments whose video_id belongs to one of the channel's videos
            matches_per_chunk.append(
                comment_chunk.loc[comment_chunk.video_id.isin(filtered_videos.display_id)])
            print(f"The first {(i+1) * comment_chunksize} comments have been processed")
            elapsed = time.time() - time_start_global
            # extrapolate the average time per chunk to the whole dataset (in minutes)
            mins_total = total_comments / comment_chunksize * elapsed/((i+1)*60)
            print(f"{elapsed/(i+1):.3f} secs per chunk on average.Meaning {mins_total:.3f} mins for entire dataset, i.e., {mins_total - elapsed/60:.3f} mins left.")

    if not matches_per_chunk:
        # pd.concat does not accept an empty list
        return pd.DataFrame(dict())
    return pd.concat(matches_per_chunk)

In [183]:
# test the above functions by searching for comments from a certain channel
# filtered_comments_test = filter_comment_by_channel("UCzWrhkg9eK5I8Bm3HfV-unA", video_chunksize=100000, comment_chunksize=1000000)

In [184]:
# filtered_comments_test

In [None]:
# Collect the entries of the comment dataframe which have a na value in any column,
# going through the comment data chunk by chunk.
nans = dp.run_simple_function_on_chunks(
    comments_in_chunks(),
    lambda chunk: dp.get_na_entries(chunk, "any", False),
    print_time=(1000000, 8600000000),
)



Going through chunk 0...
The first 1000000 entries have been processed. 8599000000 left.
0.677 secs per chunk on average. Meaning  97.075 minutes left.
Going through chunk 1...
The first 2000000 entries have been processed. 8598000000 left.
0.726 secs per chunk on average. Meaning  104.056 minutes left.
Going through chunk 2...
The first 3000000 entries have been processed. 8597000000 left.
0.748 secs per chunk on average. Meaning  107.159 minutes left.
Going through chunk 3...
The first 4000000 entries have been processed. 8596000000 left.
1.065 secs per chunk on average. Meaning  152.523 minutes left.
Going through chunk 4...
The first 5000000 entries have been processed. 8595000000 left.
1.354 secs per chunk on average. Meaning  193.941 minutes left.
Going through chunk 5...
The first 6000000 entries have been processed. 8594000000 left.
1.783 secs per chunk on average. Meaning  255.335 minutes left.
Going through chunk 6...
The first 7000000 entries have been processed. 8593000000 

In [186]:
# show the entries collected in `nans`
# NOTE(review): the cell that computes `nans` has no execution count, so the output shown
# here may stem from an earlier run - confirm by re-running both cells in order
display(nans)

Unnamed: 0,author,video_id,likes,replies
0,1,Gkb1QMHrGvA,2,0
1,1,CNtp0xqoods,0,0
2,1,249EEzQmVmQ,1,0
3,1,_U443T2K_Bs,0,0
4,1,rJbjhm0weYc,0,0
...,...,...,...,...
9999995,664459,GC3gqIbrK7c,9,1
9999996,664459,GC3gqIbrK7c,1,0
9999997,664459,i9VRGaoFw8k,1,1
9999998,664459,-JLWZ1jz3FY,0,2


In [None]:
# Count the entries of the comment dataframe which have a na value in any column,
# going through the comment data chunk by chunk; the result returned for the chunks
# is then summed along axis 0.
counted_nans = dp.run_simple_function_on_chunks(
    comments_in_chunks(),
    lambda chunk: dp.count_na_entries(chunk, "any", False),
    print_time=(1000000, 8600000000),
).sum(axis=0)



Going through chunk 0...
The first 1000000 entries have been processed. 8599000000 left.
0.630 secs per chunk on average. Meaning  90.327 minutes left.
Going through chunk 1...
The first 2000000 entries have been processed. 8598000000 left.
0.647 secs per chunk on average. Meaning  92.675 minutes left.
Going through chunk 2...
The first 3000000 entries have been processed. 8597000000 left.
0.652 secs per chunk on average. Meaning  93.408 minutes left.
Going through chunk 3...
The first 4000000 entries have been processed. 8596000000 left.
0.655 secs per chunk on average. Meaning  93.778 minutes left.
Going through chunk 4...
The first 5000000 entries have been processed. 8595000000 left.
0.653 secs per chunk on average. Meaning  93.605 minutes left.
Going through chunk 5...
The first 6000000 entries have been processed. 8594000000 left.
0.652 secs per chunk on average. Meaning  93.418 minutes left.
Going through chunk 6...
The first 7000000 entries have been processed. 8593000000 left.

In [194]:
# show the summed na counts
display(counted_nans)

na rows              0
total rows    10000000
dtype: int64

In [200]:
# Keep only the videos of the "News & Politics" category, going through the video
# metadata chunk by chunk.

df_videos_news_pol = dp.run_simple_function_on_chunks(
    videos_in_chunks(chunksize=100000),
    lambda chunk: chunk.loc[chunk.categories == "News & Politics"],
    print_time=(100000, 72924794),
)

Going through chunk 0...
The first 100000 entries have been processed. 72824794 left.
5.211 secs per chunk on average. Meaning  63.244 minutes left.
Going through chunk 1...
The first 200000 entries have been processed. 72724794 left.
3.801 secs per chunk on average. Meaning  46.075 minutes left.
Going through chunk 2...
The first 300000 entries have been processed. 72624794 left.
3.651 secs per chunk on average. Meaning  44.187 minutes left.
Going through chunk 3...
The first 400000 entries have been processed. 72524794 left.
3.300 secs per chunk on average. Meaning  39.889 minutes left.
Going through chunk 4...
The first 500000 entries have been processed. 72424794 left.
3.056 secs per chunk on average. Meaning  36.894 minutes left.
Going through chunk 5...
The first 600000 entries have been processed. 72324794 left.
2.854 secs per chunk on average. Meaning  34.408 minutes left.
Going through chunk 6...
The first 700000 entries have been processed. 72224794 left.
2.727 secs per chunk

In [204]:
# keep only the channels whose category is "News & Politics"
df_channels_news_pol = df_channels.loc[df_channels["category_cc"] == "News & Politics"]

In [207]:
# show the videos that were kept by the "News & Politics" filter
display(df_videos_news_pol)

Unnamed: 0,categories,channel_id,crawl_date,description,dislike_count,display_id,duration,like_count,tags,title,upload_date,view_count
1827,News & Politics,UCzWm1-4XF7AHxVUTkHCM1uw,2019-11-17 06:28:42.593675,retrogamer3.com,16.0,dfa8RRkKoa4,9251,25.0,"RetroGamer3,Live Stream,politics,Trump",Retrogamer3 Political Stream,2018-08-23 00:00:00,478.0
7605,News & Politics,UCzWLsxDD373D4tY8kN-0LGQ,2019-11-05 00:42:33.012228,What are the forces at work that have created ...,0.0,_dIIEMvH86k,309,9.0,"NWO,Ebola,Ukraine,Mainstream,Media,Pyschology",Adam Curtis describes the Surkow Strategy of M...,2015-01-04 00:00:00,865.0
18005,News & Politics,UCzVBu6oqlrAix0oq9T2rBFg,2019-11-19 20:40:22.403775,Social Media:\n\nFacebook.com/thebookoflaura\n...,89.0,eWXefhNB2po,707,625.0,"michael jackson,lyrics,music video,court,child...",my thoughts on the michael jackson documentary.,2019-04-24 00:00:00,12780.0
24361,News & Politics,UCzUV5283-l5c0oKRtyenj6Q,2019-11-22 08:47:10.520209,👕 Order your shirts here: https://Teespring.co...,195.0,MBgzne7djFU,378,47027.0,"Funny,Entertainment,Fun,Laughing,Educational,L...",Elizabeth Warren Gets a Big Surprise at the Ai...,2019-10-03 00:00:00,374711.0
24362,News & Politics,UCzUV5283-l5c0oKRtyenj6Q,2019-11-22 08:46:16.481889,👕 Order your shirts here: https://Teespring.co...,114.0,AbH3pJnFgY8,278,36384.0,"Funny,Entertainment,Fun,Laughing,Educational,L...",No More Twitter? 😂,2019-10-02 00:00:00,245617.0
...,...,...,...,...,...,...,...,...,...,...,...,...
999870,News & Politics,UCrUkx0UAxgybbbMvWphd62Q,2019-11-10 14:27:26.687460,The Young Turks recently posted a video entitl...,2.0,Rmq0JmUbt8k,857,25.0,"American Joe,American Joe Show,The Young Turks...",Young Turks Caught Lying and Race Baiting.... ...,2018-11-17 00:00:00,273.0
999871,News & Politics,UCrUkx0UAxgybbbMvWphd62Q,2019-11-10 14:27:27.273595,Patriots I need your help growing the American...,0.0,ts__Orp310M,49,34.0,"American Joe,American Joe Show",President says he will send migrant Children B...,2018-11-15 00:00:00,353.0
999872,News & Politics,UCrUkx0UAxgybbbMvWphd62Q,2019-11-10 14:27:27.847348,Patriots I need your help growing the American...,1.0,bQ3_ZMVpiio,298,6.0,"American Joe,American Joe Show,Michael Avenatt...","Creepy Porn Lawyer, and Woman Beater Michael A...",2018-11-14 00:00:00,76.0
999873,News & Politics,UCrUkx0UAxgybbbMvWphd62Q,2019-11-10 14:27:28.400609,Patriots I need your help growing the American...,2.0,q92A939Nyj8,388,2.0,"American Joe,American Joe Show,Midterm Electio...",Midterm Fallout - How Bad is it For Trump?,2018-11-14 00:00:00,38.0


In [206]:
# show the channels that were kept by the "News & Politics" filter
display(df_channels_news_pol)

Unnamed: 0,category_cc,join_date,channel,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
129,News & Politics,2006-08-26,UCttspZesZIDEwwpVIgoZtWQ,IndiaTV,15177282,139814,199.0,2.0870
133,News & Politics,2012-06-01,UCRWFSbif-RFENbBrSiez1DA,ABP NEWS,16274836,129027,207.0,2.0870
212,News & Politics,2017-03-03,UCmphdqZNmqL72WJ2uyiNw5w,ABP NEWS HINDI,10800000,51298,340.0,2.0870
268,News & Politics,2015-03-23,UCx8Z14PpntdaxCt2hakbQLQ,The Lallantop,9120000,9423,438.0,2.0870
337,News & Politics,2007-06-19,UCIvaYmXn910QMdemBG3v1pQ,Zee News,9280000,102648,549.0,2.0870
...,...,...,...,...,...,...,...,...
135820,News & Politics,2010-08-07,UC5rxiCGcNunIi5zI1hMYLMg,Salman Akhtar,10400,40,962468.0,53.1435
135825,News & Politics,2013-02-01,UCLSEJQ8TWtlEkaytaa4Y7lw,WingsOfChrist,10420,61,962547.0,53.1435
135901,News & Politics,2012-10-19,UCnkG_c5cyemVVsgCDoHiXew,The American Mirror,10500,329,963417.0,53.1435
136231,News & Politics,2017-11-25,UC69lWS7UMbBQc-9yqp4nGjA,Patriotism Show,10320,46,975448.0,53.1435
