# Imports

In [265]:
import importlib
import os
import time

import numpy as np
import pandas as pd

import data_processing as dp  # own functions and logic
importlib.reload(dp)

<module 'data_processing' from '/home/andreas/Nextcloud/Dokumente/Uni/Module/3sem-EPFL/ada/Project/ada-2024-project-thedataminions/preprocessing_tests/data_processing.py'>

# Set path to data files

In [179]:
# define the path to the folder where the YouNiverse dataset is stored here

# when adding your own path, don't remove the existing path, just comment it
# in this way, everyone can quickly uncomment their own path
#
# If the environment variable YOUNIVERSE_DATASET_ROOT is set, it takes precedence over the
# path written below, so the notebook can run on another machine without editing this cell.
# os.path.join(..., "") guarantees a trailing separator, because the loading cells build
# file paths by plain string concatenation (dataset_root_path + "<file name>").
dataset_root_path = os.path.join(
    os.environ.get("YOUNIVERSE_DATASET_ROOT", "/media/andreas/Backup Plus/youniverse_dataset/"),
    "",
)

In [None]:
# read the channel table (tab-separated file) into memory
df_channels = pd.read_csv(
    dataset_root_path + "df_channels_en.tsv.gz",
    sep="\t",
    compression="infer",
)


In [259]:
# Only needed for quick experiments on the data without the chunked readers;
# comment this cell out otherwise.

# first 100000 rows of the video metadata
df_videos = pd.read_json(
    dataset_root_path + "yt_metadata_en.jsonl.gz",
    lines=True,
    compression="infer",
    nrows=100000,
)

# first 1000000 rows of the comment data
df_comments = pd.read_csv(
    dataset_root_path + "youtube_comments.tsv.gz",
    sep="\t",
    compression="infer",
    nrows=1000000,
)

In [203]:
df_channels

Unnamed: 0,category_cc,join_date,channel,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,Gaming,2010-04-29,UC-lHJZR3Gqxm24_Vd_AJ5Yw,PewDiePie,101000000,3956,3.0,2.0870
1,Education,2006-09-01,UCbCmjCuTUZos6Inko4u57UQ,Cocomelon - Nursery ...,60100000,458,7.0,2.0870
2,Entertainment,2006-09-20,UCpEhnqL0y41EpW2TvWAHD7Q,SET India,56018869,32661,8.0,2.0870
3,Howto & Style,2016-11-15,UC295-Dw_tDNtZXFeAPAW6Aw,5-Minute Crafts,60600000,3591,9.0,2.0870
4,Sports,2007-05-11,UCJ5v_MCY6GNUBTO8-D3XoAg,WWE,48400000,43421,11.0,2.0870
...,...,...,...,...,...,...,...,...
136465,Music,2016-10-06,UCuM-9AajUOwKw6ipOzu2DRQ,GONE.Fludd - Topic,10128,105,1008139.0,53.1435
136466,People & Blogs,2013-10-17,UCtW9jp5TH0YrgYpwiRf9t-Q,saidthestory,10100,352,1008644.0,53.1435
136467,Gaming,2015-05-08,UCTsxFTIUs8vFDzGccDm6i7Q,Omni H,10000,475,1009505.0,53.1435
136468,Music,2011-04-13,UC1HOArgRCMGPjlcmkThERwA,TĀLĀ,10000,15,1025119.0,53.1435


In [None]:
# definition of "readers", i.e., objects that we can iterate through 
# and always get a chunk of the dataframe in each iteration

def videos_in_chunks(chunksize: int = 100000, nrows: "int | None" = 1000000):
    """
    Returns a reader over the video metadata file which yields one dataframe
    of up to `chunksize` rows per iteration.

    Args:
        chunksize: number of rows per chunk. Default is 100 000.
        nrows: upper limit on the number of rows read from the file. The default of
            1 000 000 keeps test runs short; pass None to go through the entire file.

    Returns:
        The reader object created by pd.read_json (it can be used in a `with` statement).
    """
    return pd.read_json(dataset_root_path + "yt_metadata_en.jsonl.gz",
                        compression="infer", lines=True, chunksize=chunksize,
                        nrows=nrows)

def comments_in_chunks(chunksize: int = 1000000, nrows: "int | None" = 10000000):
    """
    Returns a reader over the comment file which yields one dataframe
    of up to `chunksize` rows per iteration.

    Args:
        chunksize: number of rows per chunk. Default is 1 000 000.
        nrows: upper limit on the number of rows read from the file. The default of
            10 000 000 keeps test runs short; pass None to go through the entire file.

    Returns:
        The reader object created by pd.read_csv (it can be used in a `with` statement).
    """
    return pd.read_csv(dataset_root_path + "youtube_comments.tsv.gz",
                       compression="infer", sep="\t", chunksize=chunksize,
                       nrows=nrows)

In [182]:
def filter_vid_by_channel(channel_id: str, video_chunksize: int = 100000) -> pd.DataFrame :
    """
    Returns the video metadata dataframe, filtered so that only videos from the
    given channel remain.

    The function goes through the dataset in chunks of a specified size and prints
    the progress and a running time estimate after each chunk.

    Args:
        channel_id :  id of the channel whose videos are kept
        video_chunksize :  number of entries in each chunk. Default is 100 000.

    Returns:
        The filtered dataframe (an empty dataframe if the reader yields no chunk at all)
    """

    # row count assumed for the entire video dataset; only used for the time estimate
    total_videos = 72924794

    # The matching rows of each chunk are collected in a list and concatenated once at
    # the end, instead of re-copying the growing result on every iteration.
    matches_per_chunk = []
    videos_processed = 0

    with videos_in_chunks(video_chunksize) as reader:
        time_start_global = time.time()
        for i, video_chunk in enumerate(reader):
            print(f"Going through video chunk {i}...")
            matches_per_chunk.append(video_chunk.loc[video_chunk.channel_id == channel_id])
            # count the actual rows, so that a shorter last chunk is reported correctly
            videos_processed += len(video_chunk)
            print(f"The first {videos_processed} videos have been processed.")
            elapsed = time.time() - time_start_global
            secs_per_chunk = elapsed / (i+1)
            mins_total = total_videos / video_chunksize * elapsed / ((i+1)*60)
            print(f"{secs_per_chunk:.3f} secs per chunk on average. Meaning {mins_total:.3f} mins for entire dataset, i.e., {mins_total - elapsed/60:.3f} mins left.")

    # pd.concat raises on an empty list, so the "no chunks" case is handled explicitly
    if not matches_per_chunk:
        return pd.DataFrame(dict())
    return pd.concat(matches_per_chunk)

def filter_comment_by_channel(channel_id: str,video_chunksize: int = 100000, comment_chunksize: int = 1000000) -> pd.DataFrame :
    """
    Returns the comment data, filtered so that only comments made on videos
    which were published by a certain channel are left.

    The videos of the channel are determined first (with filter_vid_by_channel); then the
    comment data is gone through in chunks of a defined size, and progress and a running
    time estimate are printed after each chunk.

    Args:
        channel_id: id of the channel to be filtered by
        video_chunksize: number of entries per chunk when going through the videos (to find the videos uploaded by a certain channel)
        comment_chunksize: number of entries per chunk when going through the comments

    Returns:
        The filtered dataframe (an empty dataframe if the reader yields no chunk at all)
    """

    # row count assumed for the entire comment dataset; only used for the time estimate
    total_comments = 8600000000

    filtered_videos = filter_vid_by_channel(channel_id, video_chunksize=video_chunksize)
    print(f"Videos have been filtered by channel, {len(filtered_videos)} videos found. \nNow going through comments....")

    # The video ids are extracted once, outside of the loop. An empty video dataframe
    # may not have a "display_id" column at all, in which case no comment can match.
    if "display_id" in filtered_videos.columns:
        video_ids = filtered_videos["display_id"].unique()
    else:
        video_ids = []

    # The matching rows of each chunk are collected in a list and concatenated once at
    # the end, instead of re-copying the growing result on every iteration.
    matches_per_chunk = []

    with comments_in_chunks(comment_chunksize) as reader:
        time_start_global = time.time()
        for i, comment_chunk in enumerate(reader):
            print(f"Going through comment chunk {i}...")
            matches_per_chunk.append(comment_chunk.loc[comment_chunk.video_id.isin(video_ids)])
            print(f"The first {(i+1) * comment_chunksize} comments have been processed")
            elapsed = time.time() - time_start_global
            secs_per_chunk = elapsed / (i+1)
            mins_total = total_comments / comment_chunksize * elapsed / ((i+1)*60)
            print(f"{secs_per_chunk:.3f} secs per chunk on average.Meaning {mins_total:.3f} mins for entire dataset, i.e., {mins_total - elapsed/60:.3f} mins left.")

    # pd.concat raises on an empty list, so the "no chunks" case is handled explicitly
    if not matches_per_chunk:
        return pd.DataFrame(dict())
    return pd.concat(matches_per_chunk)

In [261]:
# try out the functions above: collect the comments made on the videos of one channel
filtered_comments_test = filter_comment_by_channel(
    "UCzWrhkg9eK5I8Bm3HfV-unA",
    video_chunksize=100000,
    comment_chunksize=1000000,
)

Going through video chunk 0...
The first 100000 videos have been processed.
9.979 secs per chunk on average. Meaning 121.287 mins for entire dataset, i.e., 121.121 mins left.
Going through video chunk 1...
The first 200000 videos have been processed.
8.103 secs per chunk on average. Meaning 98.479 mins for entire dataset, i.e., 98.209 mins left.
Going through video chunk 2...
The first 300000 videos have been processed.
7.223 secs per chunk on average. Meaning 87.790 mins for entire dataset, i.e., 87.429 mins left.
Going through video chunk 3...
The first 400000 videos have been processed.
6.482 secs per chunk on average. Meaning 78.782 mins for entire dataset, i.e., 78.350 mins left.
Going through video chunk 4...
The first 500000 videos have been processed.
5.906 secs per chunk on average. Meaning 71.781 mins for entire dataset, i.e., 71.288 mins left.
Going through video chunk 5...
The first 600000 videos have been processed.
5.530 secs per chunk on average. Meaning 67.217 mins for 

KeyboardInterrupt: 

In [None]:
# filtered_comments_test

NameError: name 'filtered_comments_test' is not defined

# Todo: next step

make a function which takes a dataframe such as the above "filtered_comments_test" (a df with comment data only from videos of one specific channel),
and returns the number of comments made by each user (the user id is the "author" column of the comment data)
( e.g. filtered_comments_test.groupby("author").size() )

filter so that only users with at least x comments remain

(flexible function where you give the threshold)


# Todo after that

create dataset of all comments which are under a video in the news and politics category

use this dataset to get the list of videos under which each of the users we found (above) have made a comment

for each pair of users, calculate "number of videos in common (under which both have commented) / the smaller of the two users' numbers of commented videos", i.e. $|V_a \cap V_b| / \min(|V_a|, |V_b|)$, where $V_a$ and $V_b$ are the sets of videos the two users have commented on
(example: mila commented on 10 videos, andreas on 100 videos, they have 8 videos they both commented on, so the value we calculate is 8/10 = 0.8)


In [267]:
# collect the entries of the comment dataframe which have a na value in any column
nans = dp.run_simple_function_on_chunks_concat(
    comments_in_chunks(),
    lambda chunk: dp.get_na_entries(chunk, "any", False),
    print_time=(1000000, 8600000000),
)



Going through chunk 0...
The first 1000000 entries have been processed. 8599000000 left.
1.133 secs per chunk on average. Meaning  162.410 minutes left.
Going through chunk 1...
The first 2000000 entries have been processed. 8598000000 left.
1.161 secs per chunk on average. Meaning  166.386 minutes left.
Going through chunk 2...
The first 3000000 entries have been processed. 8597000000 left.
1.188 secs per chunk on average. Meaning  170.172 minutes left.
Going through chunk 3...
The first 4000000 entries have been processed. 8596000000 left.
1.179 secs per chunk on average. Meaning  168.851 minutes left.
Going through chunk 4...
The first 5000000 entries have been processed. 8595000000 left.
1.175 secs per chunk on average. Meaning  168.353 minutes left.
Going through chunk 5...
The first 6000000 entries have been processed. 8594000000 left.
1.180 secs per chunk on average. Meaning  169.047 minutes left.
Going through chunk 6...
The first 7000000 entries have been processed. 8593000000

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [263]:
display(nans)

Unnamed: 0,author,video_id,likes,replies
0,1,Gkb1QMHrGvA,2,0
1,1,CNtp0xqoods,0,0
2,1,249EEzQmVmQ,1,0
3,1,_U443T2K_Bs,0,0
4,1,rJbjhm0weYc,0,0
...,...,...,...,...
9999995,664459,GC3gqIbrK7c,9,1
9999996,664459,GC3gqIbrK7c,1,0
9999997,664459,i9VRGaoFw8k,1,1
9999998,664459,-JLWZ1jz3FY,0,2


In [None]:
# count the entries of the comment dataframe which have a na value in any column;
# the per-chunk results are summed up column-wise at the end
counted_nans = dp.run_simple_function_on_chunks_concat(
    comments_in_chunks(),
    lambda chunk: dp.count_na_entries(chunk, "any", False),
    print_time=(1000000, 8600000000),
).sum(axis=0)



Going through chunk 0...
The first 1000000 entries have been processed. 8599000000 left.
0.630 secs per chunk on average. Meaning  90.327 minutes left.
Going through chunk 1...
The first 2000000 entries have been processed. 8598000000 left.
0.647 secs per chunk on average. Meaning  92.675 minutes left.
Going through chunk 2...
The first 3000000 entries have been processed. 8597000000 left.
0.652 secs per chunk on average. Meaning  93.408 minutes left.
Going through chunk 3...
The first 4000000 entries have been processed. 8596000000 left.
0.655 secs per chunk on average. Meaning  93.778 minutes left.
Going through chunk 4...
The first 5000000 entries have been processed. 8595000000 left.
0.653 secs per chunk on average. Meaning  93.605 minutes left.
Going through chunk 5...
The first 6000000 entries have been processed. 8594000000 left.
0.652 secs per chunk on average. Meaning  93.418 minutes left.
Going through chunk 6...
The first 7000000 entries have been processed. 8593000000 left.

In [194]:
display(counted_nans)

na rows              0
total rows    10000000
dtype: int64

In [None]:
# keep only the videos whose category is "News & Politics"

df_videos_news_pol = dp.run_simple_function_on_chunks_concat(
    videos_in_chunks(chunksize=100000),
    lambda chunk: chunk[chunk["categories"] == "News & Politics"],
    print_time=(100000, 72924794),
)

Going through chunk 0...
The first 100000 entries have been processed. 72824794 left.
5.211 secs per chunk on average. Meaning  63.244 minutes left.
Going through chunk 1...
The first 200000 entries have been processed. 72724794 left.
3.801 secs per chunk on average. Meaning  46.075 minutes left.
Going through chunk 2...
The first 300000 entries have been processed. 72624794 left.
3.651 secs per chunk on average. Meaning  44.187 minutes left.
Going through chunk 3...
The first 400000 entries have been processed. 72524794 left.
3.300 secs per chunk on average. Meaning  39.889 minutes left.
Going through chunk 4...
The first 500000 entries have been processed. 72424794 left.
3.056 secs per chunk on average. Meaning  36.894 minutes left.
Going through chunk 5...
The first 600000 entries have been processed. 72324794 left.
2.854 secs per chunk on average. Meaning  34.408 minutes left.
Going through chunk 6...
The first 700000 entries have been processed. 72224794 left.
2.727 secs per chunk

In [271]:
# keep only the channels whose channel-level category is "News & Politics"
df_channels_news_pol = df_channels.loc[df_channels["category_cc"] == "News & Politics"]

video 1   title blwe id 34547 comment_user 1234
video 1   title blwe id 34547 comment_user 1234
video 1   title blwe id 34547 comment_user 1235
vide
video 2



comment 1 video id 253465
comment 2 

In [207]:
display(df_videos_news_pol)

Unnamed: 0,categories,channel_id,crawl_date,description,dislike_count,display_id,duration,like_count,tags,title,upload_date,view_count
1827,News & Politics,UCzWm1-4XF7AHxVUTkHCM1uw,2019-11-17 06:28:42.593675,retrogamer3.com,16.0,dfa8RRkKoa4,9251,25.0,"RetroGamer3,Live Stream,politics,Trump",Retrogamer3 Political Stream,2018-08-23 00:00:00,478.0
7605,News & Politics,UCzWLsxDD373D4tY8kN-0LGQ,2019-11-05 00:42:33.012228,What are the forces at work that have created ...,0.0,_dIIEMvH86k,309,9.0,"NWO,Ebola,Ukraine,Mainstream,Media,Pyschology",Adam Curtis describes the Surkow Strategy of M...,2015-01-04 00:00:00,865.0
18005,News & Politics,UCzVBu6oqlrAix0oq9T2rBFg,2019-11-19 20:40:22.403775,Social Media:\n\nFacebook.com/thebookoflaura\n...,89.0,eWXefhNB2po,707,625.0,"michael jackson,lyrics,music video,court,child...",my thoughts on the michael jackson documentary.,2019-04-24 00:00:00,12780.0
24361,News & Politics,UCzUV5283-l5c0oKRtyenj6Q,2019-11-22 08:47:10.520209,👕 Order your shirts here: https://Teespring.co...,195.0,MBgzne7djFU,378,47027.0,"Funny,Entertainment,Fun,Laughing,Educational,L...",Elizabeth Warren Gets a Big Surprise at the Ai...,2019-10-03 00:00:00,374711.0
24362,News & Politics,UCzUV5283-l5c0oKRtyenj6Q,2019-11-22 08:46:16.481889,👕 Order your shirts here: https://Teespring.co...,114.0,AbH3pJnFgY8,278,36384.0,"Funny,Entertainment,Fun,Laughing,Educational,L...",No More Twitter? 😂,2019-10-02 00:00:00,245617.0
...,...,...,...,...,...,...,...,...,...,...,...,...
999870,News & Politics,UCrUkx0UAxgybbbMvWphd62Q,2019-11-10 14:27:26.687460,The Young Turks recently posted a video entitl...,2.0,Rmq0JmUbt8k,857,25.0,"American Joe,American Joe Show,The Young Turks...",Young Turks Caught Lying and Race Baiting.... ...,2018-11-17 00:00:00,273.0
999871,News & Politics,UCrUkx0UAxgybbbMvWphd62Q,2019-11-10 14:27:27.273595,Patriots I need your help growing the American...,0.0,ts__Orp310M,49,34.0,"American Joe,American Joe Show",President says he will send migrant Children B...,2018-11-15 00:00:00,353.0
999872,News & Politics,UCrUkx0UAxgybbbMvWphd62Q,2019-11-10 14:27:27.847348,Patriots I need your help growing the American...,1.0,bQ3_ZMVpiio,298,6.0,"American Joe,American Joe Show,Michael Avenatt...","Creepy Porn Lawyer, and Woman Beater Michael A...",2018-11-14 00:00:00,76.0
999873,News & Politics,UCrUkx0UAxgybbbMvWphd62Q,2019-11-10 14:27:28.400609,Patriots I need your help growing the American...,2.0,q92A939Nyj8,388,2.0,"American Joe,American Joe Show,Midterm Electio...",Midterm Fallout - How Bad is it For Trump?,2018-11-14 00:00:00,38.0


In [272]:
display(df_channels_news_pol)

Unnamed: 0,category_cc,join_date,channel,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
129,News & Politics,2006-08-26,UCttspZesZIDEwwpVIgoZtWQ,IndiaTV,15177282,139814,199.0,2.0870
133,News & Politics,2012-06-01,UCRWFSbif-RFENbBrSiez1DA,ABP NEWS,16274836,129027,207.0,2.0870
212,News & Politics,2017-03-03,UCmphdqZNmqL72WJ2uyiNw5w,ABP NEWS HINDI,10800000,51298,340.0,2.0870
268,News & Politics,2015-03-23,UCx8Z14PpntdaxCt2hakbQLQ,The Lallantop,9120000,9423,438.0,2.0870
337,News & Politics,2007-06-19,UCIvaYmXn910QMdemBG3v1pQ,Zee News,9280000,102648,549.0,2.0870
...,...,...,...,...,...,...,...,...
135820,News & Politics,2010-08-07,UC5rxiCGcNunIi5zI1hMYLMg,Salman Akhtar,10400,40,962468.0,53.1435
135825,News & Politics,2013-02-01,UCLSEJQ8TWtlEkaytaa4Y7lw,WingsOfChrist,10420,61,962547.0,53.1435
135901,News & Politics,2012-10-19,UCnkG_c5cyemVVsgCDoHiXew,The American Mirror,10500,329,963417.0,53.1435
136231,News & Politics,2017-11-25,UC69lWS7UMbBQc-9yqp4nGjA,Patriotism Show,10320,46,975448.0,53.1435


In [None]:
# check whether all videos we found in news&pol are also published by a channel in category news&pol
# (the rows displayed are the videos for which this is NOT the case)

df_videos_news_pol[~df_videos_news_pol["channel_id"].isin(df_channels_news_pol["channel"])]



Unnamed: 0,categories,channel_id,crawl_date,description,dislike_count,display_id,duration,like_count,tags,title,upload_date,view_count
1827,News & Politics,UCzWm1-4XF7AHxVUTkHCM1uw,2019-11-17 06:28:42.593675,retrogamer3.com,16.0,dfa8RRkKoa4,9251,25.0,"RetroGamer3,Live Stream,politics,Trump",Retrogamer3 Political Stream,2018-08-23 00:00:00,478.0
7605,News & Politics,UCzWLsxDD373D4tY8kN-0LGQ,2019-11-05 00:42:33.012228,What are the forces at work that have created ...,0.0,_dIIEMvH86k,309,9.0,"NWO,Ebola,Ukraine,Mainstream,Media,Pyschology",Adam Curtis describes the Surkow Strategy of M...,2015-01-04 00:00:00,865.0
18005,News & Politics,UCzVBu6oqlrAix0oq9T2rBFg,2019-11-19 20:40:22.403775,Social Media:\n\nFacebook.com/thebookoflaura\n...,89.0,eWXefhNB2po,707,625.0,"michael jackson,lyrics,music video,court,child...",my thoughts on the michael jackson documentary.,2019-04-24 00:00:00,12780.0
28840,News & Politics,UCzTmNzBxLEHbpZNOCpUTWbA,2019-11-03 04:38:01.617657,A young man is living a normal life with no ca...,16.0,ck6Yl8TNoWs,1257,452.0,"JoiRida,Cheatham,JoiRidaCheatham,Accepted,Detr...",Accepted - Award Winning Short Film,2013-10-13 00:00:00,27366.0
28860,News & Politics,UCzTmNzBxLEHbpZNOCpUTWbA,2019-11-03 04:38:06.565138,Short Film,1.0,tjUajxZAIZ8,422,15.0,"Joi.Rida,Cheatham,joiridacheatham,dread,loc,up...",JoiRida Twin Visit (Introducing Jive Viper),2010-03-04 00:00:00,987.0
...,...,...,...,...,...,...,...,...,...,...,...,...
970869,News & Politics,UCrXcatz6wlNHjuqgf-tglOA,2019-11-07 00:55:48.241832,"As promised, our Wet Head Challenge using the ...",3.0,lIuK9DGtOx8,321,141.0,"challenge,wet,head,gross,wet head challenge,we...",Gross Smoothie Wet Head Challenge 😕,2016-08-23 00:00:00,8941.0
991815,News & Politics,UCrVnMcE3GIyg2rM4gH34YWg,2019-11-10 10:02:03.075065,More Travel News...\nhttp://www.petergreenberg...,0.0,aSWbywb7SBE,423,1.0,"2008,Travel Inspiration,clinton,Presidential,P...",2008 Presidential Candidates Travel Scorecard,2008-01-25 00:00:00,588.0
998347,News & Politics,UCrV-WEtbXrRIkgWgbXLAvcQ,2019-10-31 15:06:30.119011,© 2012 WMG Webisode by Mutemath from The Blue...,3.0,H8-Al6B_J1g,106,17.0,"mutemath,wbr,INDMUSIC,warner bros records",Mutemath - What Happens Before The Show [Webis...,2006-11-04 00:00:00,3136.0
998349,News & Politics,UCrV-WEtbXrRIkgWgbXLAvcQ,2019-10-31 15:06:31.498209,© 2012 WMG Webisode by Mutemath from Park Wes...,2.0,wYI6dWaEHjk,56,8.0,"INDMUSIC,wbr,mutemath,warner bros records",Mutemath - Built for Destruction [Webisode],2006-11-04 00:00:00,1881.0


We see that indeed, not all videos in the news and politics category belong to a channel in this category!

A google search shows that apparently, you don't have to have the same category for all videos, but you set a "default" channel category which will be used for videos if you don't change it manually. Also, you can probably change the default category after a while if you want.

This is the reason why most of the news&pol videos are uploaded by a news&pol channel, but not all.

In the paper about the dataset, the authors say that the channel category is actually the "most frequent category", so I guess the video categories are the most relevant, as they are the true categories.

We could try to verify this, if we want to.

In [None]:
# restrict the news&pol videos to a hand-picked set of channels

df_videos_news_pol_manually_selected = df_videos_news_pol.loc[
    df_videos_news_pol["channel_id"].isin([
        "UCupvZG-5ko_eiXAupbDfxWw",  # CNN
        "UCXIJgqnII2ZOINSWNOGFThA",  # Fox News
        "UC16niRr50-MSBwiO3YDb3RA",  # BBC News
        "UCaXkIU1QidjPwiAYu6GcHjg",  # MSNBC
    ])
]

In [280]:
df_channels[df_channels.name_cc=="MSNBC"]

Unnamed: 0,category_cc,join_date,channel,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
2604,News & Politics,2011-12-01,UCaXkIU1QidjPwiAYu6GcHjg,MSNBC,1910000,23393,5600.0,2.444


In [235]:
df_videos_news_pol_manually_selected

Unnamed: 0,categories,channel_id,crawl_date,description,dislike_count,display_id,duration,like_count,tags,title,upload_date,view_count


In [273]:
# order the news&pol channels from most to fewest subscribers (subscriber count according to channel crawler)
df_channels_news_pol_sort_subscribers = df_channels_news_pol.sort_values(
    "subscribers_cc",
    ascending=False,
)

In [274]:
display(df_channels_news_pol_sort_subscribers)

Unnamed: 0,category_cc,join_date,channel,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
133,News & Politics,2012-06-01,UCRWFSbif-RFENbBrSiez1DA,ABP NEWS,16274836,129027,207.0,2.087
129,News & Politics,2006-08-26,UCttspZesZIDEwwpVIgoZtWQ,IndiaTV,15177282,139814,199.0,2.087
212,News & Politics,2017-03-03,UCmphdqZNmqL72WJ2uyiNw5w,ABP NEWS HINDI,10800000,51298,340.0,2.087
337,News & Politics,2007-06-19,UCIvaYmXn910QMdemBG3v1pQ,Zee News,9280000,102648,549.0,2.087
268,News & Politics,2015-03-23,UCx8Z14PpntdaxCt2hakbQLQ,The Lallantop,9120000,9423,438.0,2.087
...,...,...,...,...,...,...,...,...
130581,News & Politics,2008-03-26,UC65jmwvHLoCMBJXnzHrGwiA,The Virginian-Pilot,10200,3037,868834.0,14.148
134955,News & Politics,2009-05-17,UCuZTp4-0xPoGUxdKPLiQE6w,Harold Jackson,10133,1330,932456.0,23.492
133413,News & Politics,2016-03-25,UCWtAa1fyxYxR1gAH-IZtaXg,Trumpennials,10000,64,905989.0,14.780
133426,News & Politics,2018-08-31,UCaUCzPRX1bKUwz7a2aJgIDQ,Missing Persons & My...,10000,31,906002.0,14.699
