In [1]:
# import used libraries
import pandas as pd                        # pandas for data analysis
pd.options.mode.chained_assignment = None  # default='warn'
import matplotlib.pyplot as plt            # matplotlib for data visualisation
import json
import zstandard as zstd

In [2]:
### PATHS ###

# Folder (relative to the notebook) that holds every dataset file.
DIR = "data/"

# Inputs: the source tables, read by the import cell.
TIMESERIES_PATH = f"{DIR}df_timeseries_en.tsv.gz"
CHANNELS_PATH = f"{DIR}df_channels_en.tsv.gz"

# Outputs: the filtered tables, written by the export cell.
S_TIMESERIES_PATH = f"{DIR}s_df_timeseries_en.tsv.zip"
S_CHANNELS_PATH = f"{DIR}s_df_channels_en.tsv.zip"

In [3]:
### IMPORTS ###

def _read_tsv(path):
    """Read the tab-separated file at ``path`` into a DataFrame."""
    return pd.read_csv(path, sep='\t')


# Time series first, then the channel table.
timeseries = _read_tsv(TIMESERIES_PATH)
channels = _read_tsv(CHANNELS_PATH)

FileNotFoundError: [Errno 2] No such file or directory: 'data/df_timeseries_en.tsv.gz'

In [None]:
### TREATMENT ###

# A channel is kept when its time series contains at least one row below
# 10k subscribers and at least one row above 500k subscribers.
channels_sub10k = timeseries.loc[timeseries['subs'] < 10e3, 'channel'].drop_duplicates()
channels_sub500k = timeseries.loc[timeseries['subs'] > 500e3, 'channel'].drop_duplicates()

# Joining the two id lists keeps only the channels that appear in both.
s_channels_ids = pd.merge(channels_sub500k, channels_sub10k)

# Restrict both tables to the selected channels; the time series additionally
# loses its 'category' column.
s_channels = channels.merge(s_channels_ids)
s_timeseries = timeseries.merge(s_channels_ids).drop(columns=['category'])

In [None]:
### EXPORTS ###

# Write each filtered table to its zip archive (channels first, then the time
# series). No separator is passed, so pandas writes comma-separated values even
# though the file names end in ".tsv".
for frame, target_path in (
    (s_channels, S_CHANNELS_PATH),
    (s_timeseries, S_TIMESERIES_PATH),
):
    frame.to_csv(target_path, index=False, compression={'method':'zip'})

In [None]:
### PLOT CATEGORIES ###

def _plot_category_bar(values, title):
    """Show a bar chart with one bar per index entry of ``values``.

    Parameters
    ----------
    values : pandas.Series
        Heights of the bars; the index supplies the x tick labels.
    title : str
        Title displayed above the chart.
    """
    plt.bar(values.index, values)
    # Labels are rotated so that long category names do not overlap.
    plt.xticks(rotation=45, ha='right')
    plt.title(title)
    plt.show()


# Number of channels per category, over all channels.
categories = channels.groupby('category_cc').count()[['channel']]

# Same count restricted to the successful channels (`s_channels`; the previous
# code referenced the undefined name `s_channel`). Categories without any
# successful channel are given an explicit 0 instead of NaN.
categories['success_channel'] = (
    s_channels.groupby('category_cc')['channel']
    .count()
    .reindex(categories.index, fill_value=0)
)
categories['success_rate'] = categories['success_channel'] / categories['channel']

_plot_category_bar(categories['channel'],
                   'Number of channels per categories on youtube')
_plot_category_bar(categories['success_channel'],
                   'Number of successful channels per categories on youtube')
_plot_category_bar(categories['success_rate'],
                   'Rate of successful channels in each categories')

In [None]:
##################################################################################################
############################################ METADATA ############################################
##################################################################################################

In [None]:

############################################## PATH ##############################################

# Location of the zstd-compressed JSON-lines metadata file, inside DIR.
METADATA_PATH = f"{DIR}_raw_yt_metadata.jsonl.zst"

In [None]:

############################## READ AND SPLIT INTO SMALLER CSV FILES #############################

class zreader:
    """Stream the lines of a zstd-compressed text file.

    The file is decompressed chunk by chunk and split on newline bytes, so the
    whole content never has to be held in memory. Instances can be used as
    context managers; the underlying file is also closed automatically once
    ``readlines`` has been consumed.
    """

    def __init__(self, file, chunk_size=16384):
        """Open ``file`` in binary mode and prepare the decompression stream.

        Parameters
        ----------
        file : path of the compressed file.
        chunk_size : number of bytes requested from the stream on each read.
        """
        self.fh = open(file, 'rb')
        self.chunk_size = chunk_size
        self.dctx = zstd.ZstdDecompressor()
        self.reader = self.dctx.stream_reader(self.fh)
        # Raw bytes of the line that is still incomplete. Kept as bytes so a
        # multi-byte UTF-8 character cut by a chunk boundary is not lost.
        self.buffer = b''

    def __enter__(self):
        """Return the reader itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, *exc_info):
        """Release the file when leaving a ``with`` block."""
        self.close()

    def close(self):
        """Close the decompression stream and the underlying file."""
        self.reader.close()
        self.fh.close()

    def readlines(self):
        """Yield every line of the file as ``str``, without its newline.

        Lines are decoded as UTF-8 and undecodable bytes are ignored. A final
        line that is not terminated by a newline is yielded as well. Calling
        the method again after the file has been closed yields nothing.
        """
        if self.fh.closed:
            return
        try:
            while True:
                chunk = self.reader.read(self.chunk_size)
                if not chunk:
                    break
                # Split on the newline byte first and decode complete lines
                # only: decoding each chunk on its own would drop characters
                # whose bytes straddle two chunks.
                lines = (self.buffer + chunk).split(b"\n")

                for line in lines[:-1]:
                    yield line.decode("utf-8", errors="ignore")

                self.buffer = lines[-1]

            # Whatever is left is the last line of a file without a trailing
            # newline.
            if self.buffer:
                tail, self.buffer = self.buffer, b''
                yield tail.decode("utf-8", errors="ignore")
        finally:
            self.close()

def _save_metadata_chunk(frames, chunk_number):
    """Concatenate ``frames`` and write them as the chunk ``chunk_number``.

    Returns the path of the archive that was written.
    """
    print(" - SAVE ", chunk_number)
    chunk_path = DIR + "metadata/_raw_yt_metadata" + str(chunk_number) + ".tsv.zip"
    pd.concat(frames).to_csv(chunk_path, index=False, compression={'method':'zip'})
    return chunk_path


ROWS_PER_STORE = 1000000    # records converted to a DataFrame at once
ROWS_PER_SAVE = 10000000    # rows accumulated before a chunk file is written
DROPPED_KEYS = ('description', 'crawl_date', 'categories')

reader = zreader(METADATA_PATH)
metadata = []         # kept records not yet converted to a DataFrame
stored_frames = []    # DataFrames waiting to be written to the next chunk
stored_rows = 0       # total number of rows held in stored_frames

idx = 0               # number of records read so far
store_idx = 0         # number of stores since the last save
save_idx = 0          # number of chunk files written so far

min_upload_date, max_upload_date = pd.to_datetime("01-01-2015"), pd.to_datetime("09-30-2019")
for line in reader.readlines():
    # A blank line carries no record and would make json.loads fail.
    if not line.strip():
        continue
    line_dict = json.loads(line)

    # Parse the date once; both bounds are exclusive.
    upload_date = pd.to_datetime(line_dict["upload_date"])
    if min_upload_date < upload_date < max_upload_date:
        for dropped_key in DROPPED_KEYS:
            line_dict.pop(dropped_key, None)

        metadata.append(line_dict)
    idx += 1
    if idx%100000 == 0:
        print(idx)

    #store in a dataframe every 1 million
    if len(metadata) >= ROWS_PER_STORE:
        if store_idx < 9 : print(" - STORE", store_idx)
        stored_frames.append(pd.DataFrame(metadata))
        stored_rows += len(metadata)
        metadata = []
        store_idx += 1

    #save dataframe every 10 million
    if stored_rows >= ROWS_PER_SAVE:
        S_METADATA_PATH = _save_metadata_chunk(stored_frames, save_idx)
        stored_frames, stored_rows = [], 0
        store_idx = 0
        save_idx += 1

# The file opened by zreader.__init__ is no longer needed.
reader.fh.close()

# Records read since the last store are still in `metadata`; they have to be
# added before the final save or they would be lost.
if metadata:
    stored_frames.append(pd.DataFrame(metadata))
    metadata = []

# Write the last chunk only when it holds data, so that `save_idx` always
# equals the number of chunk files that exist.
if stored_frames:
    S_METADATA_PATH = _save_metadata_chunk(stored_frames, save_idx)
    save_idx += 1

stored_frames, stored_rows = [], 0
df_metadata = pd.DataFrame([])   # name kept defined (and empty) as before
store_idx = 0

In [None]:
# FILTER EVERY RAW METADATA CHUNK (ONE PER SAVED FILE)
for chunk_no in range(save_idx):
    print("Start metadata ", chunk_no)

    # Where the raw chunk is read from and where its filtered version goes.
    METADATA_PATH = f"{DIR}metadata/_raw_yt_metadata{chunk_no}.tsv.zip"
    S_METADATA_PATH = f"{DIR}metadata/s_metadata{chunk_no}.tsv.zip"
    print("Path done - ")

    # Load the raw chunk.
    metadata = pd.read_csv(METADATA_PATH)
    print("Read done - ")

    # Align the key column name with s_channels_ids, then keep only the rows
    # whose channel is one of the selected channels.
    metadata = metadata.rename(columns={'channel_id':'channel'})
    s_metadata = metadata.merge(s_channels_ids)
    print("Treatment done - ")

    # Store the filtered chunk.
    s_metadata.to_csv(S_METADATA_PATH, index=False, compression={'method':'zip'})
    print("Write done -")

    print("Done metadata ", chunk_no)

In [None]:
# STORE ALL DATAFRAMES IN ONLY ONE
# The chunks are collected in a list and concatenated once at the end: growing
# a DataFrame with pd.concat inside the loop copies the accumulated rows again
# on every iteration and starts from an empty frame.
filtered_chunks = []
for i in range(save_idx):
    print("Start metadata ", i)

    # PATH
    METADATA_PATH = DIR + "metadata/s_metadata" + str(i) + ".tsv.zip"
    print("Path done - ")

    # READ
    metadata = pd.read_csv(METADATA_PATH)
    print("Read done - ")

    # CONCAT (the chunk is queued here, the actual concatenation happens below)
    filtered_chunks.append(metadata)
    print("Concat done - ")

# pd.concat raises on an empty list, so fall back to an empty frame when there
# was no chunk to read.
if filtered_chunks:
    s_metadata = pd.concat(filtered_chunks, ignore_index=True)
else:
    s_metadata = pd.DataFrame()

# WRITE
S_METADATA_PATH = DIR + "s_df_metadata_en.tsv.zip"
s_metadata.to_csv(S_METADATA_PATH, index=False, compression={'method':'zip'})
print("Write done -")

In [None]:
# RESTRICT df_channels AND df_timeseries TO THE CHANNELS FOUND IN df_metadata
# df_metadata covers fewer channels than the two other tables
# (possibly channels without any video - not verified here).

# Data folder
DIR = "data/"

# Files to read
TIMESERIES_PATH = f"{DIR}s_df_timeseries_en.tsv.zip"
CHANNELS_PATH = f"{DIR}s_df_channels_en.tsv.zip"
METADATA_PATH = f"{DIR}s_df_metadata_en.tsv.zip"

# Load the three filtered tables
timeseries = pd.read_csv(TIMESERIES_PATH)
channels = pd.read_csv(CHANNELS_PATH)
metadata = pd.read_csv(METADATA_PATH)

# One row per channel present in the metadata, then keep only those channels
# in the two other tables.
channel_ids = metadata.loc[:, ['channel']].drop_duplicates()
s_channels = channels.merge(channel_ids)
s_timeseries = timeseries.merge(channel_ids)

# Files to write - the same archives that were read above, so this cell
# overwrites its own inputs.
S_TIMESERIES_PATH = f"{DIR}s_df_timeseries_en.tsv.zip"
S_CHANNELS_PATH = f"{DIR}s_df_channels_en.tsv.zip"

# Save the restricted tables
s_channels.to_csv(S_CHANNELS_PATH, index=False, compression={'method':'zip'})
s_timeseries.to_csv(S_TIMESERIES_PATH, index=False, compression={'method':'zip'})