In [1]:
import pandas as pd
import numpy as np
import sys

In [2]:
raw_comments_filename = 'raw_data/youtube_comments.tsv.gz'
raw_video_metadata_filename = 'raw_data/yt_metadata_en.jsonl.gz'

Since we are studying youtube users, we choose to start the data sampling from them.

What we do is recover a fixed number of users, let's say around 1 million on carry on our analysis on the video they commented on. We assume that with enough users, we will have a large enough sample of videos to be representative of all the videos.

Furthermore, we can take the first chunk of the comment file since the user are ordered in a random way. (**VERIFY THIS INFO, MAYBE IT IS ONLY IN CHRONOLOGICAL ORDER**)

In [9]:
chunksize = 12e6 + 1    # add one so we have a nice round number because we remove the last user 

# get one chunk of comments to recover num_users
for df_comments in pd.read_csv(raw_comments_filename, sep='\t', chunksize=chunksize, compression='infer'):
    
    # Since the last author could be cutoff, we remove the last author
    df_comments = df_comments[df_comments.author != df_comments.iloc[-1]['author']]

    print('Number of users: %d' % len(df_comments.author.unique()))        # should bebetween 500 000 and 1 million

    df_comments.to_feather('comments.feather')

    break

Number of users: 619701


In [10]:
df_comments.head()

Unnamed: 0,author,video_id,likes,replies
0,1,Gkb1QMHrGvA,2,0
1,1,CNtp0xqoods,0,0
2,1,249EEzQmVmQ,1,0
3,1,_U443T2K_Bs,0,0
4,1,rJbjhm0weYc,0,0


In [11]:
# Get unique video ids
video_ids = pd.DataFrame({'video_id': df_comments['video_id'].unique()})
print('Number of videos: %d' % len(video_ids))

Number of videos: 4136765


In [12]:
# clear memory before handling more data
print('Cleared %d bytes of memory' % sys.getsizeof(df_comments))
%xdel df_comments

Cleared 1199999816 bytes of memory


After gathering our user and identifying what videos they commented on, we recover metadata on these video for our analysis by computing the intersection chunkwize of the total video set and the videos our sample of user have commented on.

In [6]:
# They are around 73M videos in this file
chunksize = 500e3
max_num_chunk = 74e6 // chunksize

# Define dtypes to have to most memory optimization possible
# After some tests, using dtypes seems to be a little bit beneficial
# One chunk:  
#   - without dtypes:
#       time: 32.7s
#       space: 62574070 B
#   - with dtypes
#       time: 29.8s
#       space: 56003766 B

dtype = {
    'title': str,
    'display_id': str,
    'description': str,
    'categories': 'category',
    'crawl_date': str,
    'upload_date': str,
    'dislike_count': int,
    'like_count': int,
    'duration': int,
    'view_count': int,
    'tags': object,
}

dfs = []
chunk_num = 1
for chunk in pd.read_json(raw_video_metadata_filename, dtype=dtype, chunksize=chunksize, lines=True):

    # remove unused fields for space optimizations: title, description, crawldate
    chunk.drop(['title', 'description', 'crawl_date'], axis=1 , inplace=True)

    # keep only videos where one of our user in our user pool commented
    # then remove redudant column display_id
    chunk = pd.merge(chunk, video_ids, left_on='display_id', right_on='video_id').drop('display_id', axis=1)

    print('Chunk %d out of %d. Kept %d out of %d videos.' % (chunk_num, max_num_chunk, len(chunk), chunksize))

    dfs.append(chunk)

    chunk_num += 1
    

videos_metadata = pd.concat(dfs).reset_index(drop=True)

# Save videoset
videos_metadata.to_feather('video_metadata.feather')

Chunk 1 out of 148. Kept 34962 out of 500000 videos.
Chunk 2 out of 148. Kept 26111 out of 500000 videos.
Chunk 3 out of 148. Kept 33473 out of 500000 videos.
Chunk 4 out of 148. Kept 28317 out of 500000 videos.
Chunk 5 out of 148. Kept 21542 out of 500000 videos.
Chunk 6 out of 148. Kept 43322 out of 500000 videos.
Chunk 7 out of 148. Kept 29785 out of 500000 videos.
Chunk 8 out of 148. Kept 37594 out of 500000 videos.
Chunk 9 out of 148. Kept 24331 out of 500000 videos.
Chunk 10 out of 148. Kept 30089 out of 500000 videos.
Chunk 11 out of 148. Kept 25326 out of 500000 videos.
Chunk 12 out of 148. Kept 23741 out of 500000 videos.
Chunk 13 out of 148. Kept 21987 out of 500000 videos.
Chunk 14 out of 148. Kept 27249 out of 500000 videos.
Chunk 15 out of 148. Kept 30069 out of 500000 videos.
Chunk 16 out of 148. Kept 26012 out of 500000 videos.
Chunk 17 out of 148. Kept 30458 out of 500000 videos.
Chunk 18 out of 148. Kept 25333 out of 500000 videos.
Chunk 19 out of 148. Kept 31086 out o

In [7]:
videos_metadata.head()

Unnamed: 0,categories,channel_id,dislike_count,duration,like_count,tags,upload_date,view_count,video_id
0,Gaming,UCzWrhkg9eK5I8Bm3HfV-unA,11423.0,1762,29891.0,"lego city police for kids,lego polizi,lego mov...",2017-12-08 00:00:00,23152662.0,mp9gt45aHxY
1,Gaming,UCzWrhkg9eK5I8Bm3HfV-unA,1561.0,1146,6118.0,"lego dinosaurs,lego jurassic world,lego dinosa...",2017-12-07 00:00:00,4028426.0,hn2zYwqSINY
2,Gaming,UCzWrhkg9eK5I8Bm3HfV-unA,206.0,1706,2235.0,"lego marvel super heroes 2,lego super heroes 2...",2017-11-18 00:00:00,585746.0,UTZLSHaE4Sw
3,Gaming,UCzWrhkg9eK5I8Bm3HfV-unA,799.0,2146,2501.0,"lego dinosaurs,lego jurassic world,lego t-rex,...",2017-02-21 00:00:00,1547805.0,SWZG-ba1qDk
4,Film & Animation,UCzWrhkg9eK5I8Bm3HfV-unA,2548.0,2196,8780.0,"lego batman movie,the batman movie,new batman ...",2017-02-13 00:00:00,6674760.0,3vQK78eUg2A
