In [1]:
### IMPORTS ###

import pandas as pd                        # pandas for data analysis
pd.options.mode.chained_assignment = None  # default='warn'
import matplotlib.pyplot as plt            # matplotlib for data visualisation
import json
import zstandard as zstd

In [2]:
### PATHS ###

# Every file used by this notebook lives under this directory.
DIR = "../data/"

# Inputs
READ_METADATA_PATH = DIR + "original_metadata.jsonl.zst"
READ_CHANNELS_PATH = DIR + "ent_channels_15p.tsv.zip"
READ_TIMESERIES_PATH = DIR + "ent_timeseries_15p.tsv.zip"

# Outputs (the channels and timeseries targets are the same files as the inputs,
# so writing them overwrites what was read)
WRITE_METADATA_PATH = DIR + "ent_metadata_15.tsv.zip"
WRITE_CHANNELS_PATH = DIR + "ent_channels_15p.tsv.zip"
WRITE_TIMESERIES_PATH = DIR + "ent_timeseries_15p.tsv.zip"

In [3]:
### READS ###

# Load through the READ_* constants: they currently point at the same files as
# the WRITE_* ones, but reading via an output constant would silently change
# the inputs if an output path were ever renamed.
timeseries = pd.read_csv(READ_TIMESERIES_PATH)
channels   = pd.read_csv(READ_CHANNELS_PATH)

In [4]:
#### READ AND SPLIT INTO SMALLER CSV FILES ###

class zreader:
    """Line-by-line reader for a zstandard-compressed UTF-8 text file.

    The file is decompressed as a stream, `chunk_size` bytes at a time, so the
    whole content never has to be held in memory. Can be used as a context
    manager to make sure the file handle is closed.
    """

    def __init__(self, file, chunk_size=16384):
        """Open `file` (a path) for streaming decompression.

        `chunk_size` is the number of decompressed bytes requested per read.
        """
        self.fh = open(file, 'rb')
        self.chunk_size = chunk_size
        self.dctx = zstd.ZstdDecompressor()
        self.reader = self.dctx.stream_reader(self.fh)
        self.buffer = ''

    def close(self):
        """Close the underlying file handle."""
        self.fh.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def readlines(self):
        """Yield every line of the decompressed text, without its trailing "\\n".

        Undecodable bytes are dropped (errors="ignore"). The last line is
        yielded even when the file does not end with a newline.
        """
        import codecs

        # An incremental decoder keeps the bytes of a multi-byte character that
        # straddles two chunks; decoding each chunk independently with
        # errors="ignore" would silently delete such characters.
        decoder = codecs.getincrementaldecoder("utf-8")(errors="ignore")

        while True:
            raw = self.reader.read(self.chunk_size)
            # Test the raw bytes: a non-empty chunk may legitimately decode to ''.
            if not raw:
                break
            lines = (self.buffer + decoder.decode(raw)).split("\n")

            for line in lines[:-1]:
                yield line

            # The last piece has no newline yet; keep it for the next chunk.
            self.buffer = lines[-1]

        # Emit the final line when the stream does not end with a newline.
        tail = self.buffer + decoder.decode(b"", final=True)
        self.buffer = ''
        if tail:
            yield tail

reader = zreader(READ_METADATA_PATH)

# Build the lookup once: `x in ndarray` rescans every channel id for each input
# line, whereas a set lookup takes constant time.
kept_channels = set(channels['channel'])

DROPPED_FIELDS = ('crawl_date', 'categories', 'description')
STORE_EVERY = 100_000    # buffered records before they are moved into the dataframe
SAVE_EVERY = 1_000_000   # dataframe rows before a part file is written


def save_part(frame, part_idx):
    """Write `frame` as raw metadata part number `part_idx`; return the path used."""
    print(" - SAVE ", part_idx)
    path = DIR + "metadata/_raw_yt_metadata" + str(part_idx) + ".tsv.zip"
    frame.to_csv(path, index=False, compression={'method':'zip'})
    return path


metadata = []
df_metadata = pd.DataFrame([])

idx = 0        # input lines seen so far
store_idx = 0  # batches moved into df_metadata since the last save
save_idx = 0   # part files written so far (later cells iterate range(save_idx))

for line in reader.readlines():
    # Skip blank lines instead of crashing in json.loads.
    if line.strip():
        line_dict = json.loads(line)

        if line_dict.get('channel_id') in kept_channels:
            # Drop the fields that are not kept; pop() tolerates records
            # where one of them is missing.
            for field in DROPPED_FIELDS:
                line_dict.pop(field, None)

            metadata.append(line_dict)

    idx += 1
    if idx%100000 == 0:
        print(idx)

    # Move the buffered records into the dataframe every STORE_EVERY records.
    if len(metadata) >= STORE_EVERY:
        if store_idx < 9 : print(" - STORE", store_idx)
        df_metadata = pd.concat([df_metadata, pd.DataFrame(metadata)])
        metadata = []
        store_idx += 1

    # Write a part file every SAVE_EVERY rows.
    if len(df_metadata) >= SAVE_EVERY:
        S_METADATA_PATH = save_part(df_metadata, save_idx)
        df_metadata = pd.DataFrame([])
        store_idx = 0
        save_idx += 1

reader.fh.close()

# Flush what is left after the last full batch.
if metadata:
    if store_idx < 9 : print(" - STORE", store_idx)
    df_metadata = pd.concat([df_metadata, pd.DataFrame(metadata)])
    metadata = []
    store_idx += 1

# Only write a final part when there is something in it: an empty part file
# cannot be read back with pd.read_csv by the later cells.
if len(df_metadata) > 0:
    S_METADATA_PATH = save_part(df_metadata, save_idx)
    df_metadata = pd.DataFrame([])
    store_idx = 0
    save_idx += 1

100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
 - STORE 0
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000
2400000
2500000
2600000
2700000
2800000
2900000
 - STORE 1
3000000
3100000
3200000
3300000
3400000
3500000
3600000
3700000
3800000
3900000
4000000
4100000
4200000
4300000
4400000
 - STORE 2
4500000
4600000
4700000
4800000
4900000
5000000
5100000
5200000
5300000
 - STORE 3
5400000
5500000
5600000
5700000
5800000
5900000
6000000
6100000
6200000
6300000
6400000
6500000
6600000
6700000
6800000
6900000
7000000
7100000
 - STORE 4
7200000
7300000
7400000
7500000
7600000
7700000
7800000
7900000
8000000
8100000
8200000
8300000
8400000
8500000
8600000
8700000
8800000
8900000
9000000
9100000
9200000
9300000
 - STORE 5
9400000
9500000
 - STORE 6
9600000
9700000
9800000
9900000
10000000
10100000
10200000
10300000
10400000
10500000
10600000
10700000
10800000
10900000
 - STORE 7
11000000
11100000
11200000
11300000
114

In [None]:
# RENAME THE `channel_id` COLUMN TO `channel` IN EVERY RAW METADATA PART
for i in range(save_idx):
    part_suffix = str(i) + ".tsv.zip"
    print("Start metadata ", i)

    # Source (raw part) and destination (renamed copy) for this part number.
    METADATA_PATH = DIR + "metadata/_raw_yt_metadata" + part_suffix
    ENT_METADATA_PATH = DIR + "metadata/ent_metadata" + part_suffix
    print("Path done - ")

    # Load the raw part.
    metadata = pd.read_csv(METADATA_PATH)
    print("Read done - ")

    # Only the column name changes; the rows are left as they are.
    ent_metadata = metadata.rename(columns={'channel_id':'channel'})
    print("Treatment done - ")

    # Store the renamed part next to the raw one.
    ent_metadata.to_csv(ENT_METADATA_PATH, compression={'method':'zip'}, index=False)
    print("Write done -")

    print("Done metadata ", i)

Start metadata  0
Path done - 
Read done - 
Treatment done - 
Write done -
Done metadata  0


In [None]:
### STORE ALL DATAFRAMES IN ONLY ONE ###

# Collect the parts and concatenate once: calling pd.concat inside the loop
# re-copies every previously loaded row on each iteration.
metadata_parts = []
for i in range(save_idx):
    print("Start metadata ", i)

    # PATH
    METADATA_PATH = DIR + "metadata/ent_metadata" + str(i) + ".tsv.zip"
    print("Path done - ")

    # READ
    metadata = pd.read_csv(METADATA_PATH)
    metadata_parts.append(metadata)
    print("Read done - ")

# CONCAT
# pd.concat raises on an empty list, so fall back to an empty frame when no
# part file exists.
if metadata_parts:
    ent_metadata = pd.concat(metadata_parts, ignore_index=True)
else:
    ent_metadata = pd.DataFrame()
print("Concat done - ")


# WRITE
ent_metadata.to_csv(WRITE_METADATA_PATH, index=False, compression={'method':'zip'})
print("Write done -")

Start metadata  0
Path done - 
Read done - 
Concat done - 
Write done -


In [None]:
### KEEP COMMON CHANNELS BETWEEN ALL DATAFRAMES ###

# Take the channel ids from the combined `ent_metadata`, not from `metadata`:
# `metadata` only holds the last part file that was read, so using it would
# drop every channel that appears solely in an earlier part.
channel_ids = ent_metadata[['channel']].drop_duplicates()

# Inner merges keep only the rows whose channel is present in the metadata.
channels = pd.merge(channels, channel_ids, on='channel')
timeseries = pd.merge(timeseries, channel_ids, on='channel')

# Restrict the metadata in turn to the channels that survived in `channels`.
channel_ids = channels[['channel']].drop_duplicates()
ent_metadata = pd.merge(ent_metadata, channel_ids, on='channel')

In [None]:
### WRITES ###

# Persist the filtered tables, channels first and timeseries second, each as a
# zip-compressed CSV without the index column.
for frame, path in ((channels, WRITE_CHANNELS_PATH), (timeseries, WRITE_TIMESERIES_PATH)):
    frame.to_csv(path, compression={'method':'zip'}, index=False)