# Characterizing Patronage on YouTube

## 0. Files and a brief explanation of each

All data is located in `/dlabdata1/youtube_large/`

**YouNiverse dataset:**

- `df_channels_en.tsv.gz`: channel metadata.
- `df_timeseries_en.tsv.gz`: channel-level time-series.
- `yt_metadata_en.jsonl.gz`: raw video metadata.
- `youtube_comments.tsv.gz`: user-comment matrices.
- `youtube_comments.ndjson.zst`: raw comments — this is a HUGE file.

**Graphtreon dataset:**
- `creators.csv`: list with all creator names.
- `final_processed_file.jsonl.gz`: all Graphtreon time-series.
- `pages.zip`: raw HTML of the pages in Graphtreon.

In [None]:
# !conda list

In [None]:
import os 
import io
import pandas as pd
import json
import re
import tqdm
import zstandard
from tqdm import tqdm

In [None]:
# Root directory that holds every dataset file loaded in this notebook.
# The trailing slash is required: file names are appended directly to this
# string to build the full paths.
DATA_FOLDER = "/dlabdata1/youtube_large/"

In [None]:
# list all files in DATA_FOLDER
# !ls -lh /dlabdata1/youtube_large

## 1. Load data

### 1.1. YouNiverse dataset

#### 1.1.1 Channel metadata

In [None]:
# !ls -lh /dlabdata1/youtube_large/df_channels_en.tsv.gz

In [None]:
# Load the channel metadata table (gzip-compressed, tab-separated) in full:
# no row limit is applied here.
df_yt_channels = pd.read_csv(
    f"{DATA_FOLDER}df_channels_en.tsv.gz",
    compression='gzip',
    sep="\t",
)
df_yt_channels.head()

#### 1.1.2 Youtube channel-level time-series

In [None]:
# !ls -lh /dlabdata1/youtube_large/df_timeseries_en.tsv.gz

In [None]:
# Load a sample of the channel-level time-series (gzip-compressed,
# tab-separated): only the first 100 rows are read.
df_yt_timeseries = pd.read_csv(
    f"{DATA_FOLDER}df_timeseries_en.tsv.gz",
    compression='gzip',
    sep="\t",
    nrows=100,
)
df_yt_timeseries.head()

#### 1.1.3 Raw video metadata

In [None]:
!ls -lh /dlabdata1/youtube_large/yt_metadata_en.jsonl.gz

In [None]:
# ! zcat /dlabdata1/youtube_large/yt_metadata_en.jsonl.gz | head

In [None]:
# Load a sample of the raw video metadata (gzip-compressed JSON Lines):
# only the first 100 records are read.
df_yt_metadata = pd.read_json(
    f"{DATA_FOLDER}yt_metadata_en.jsonl.gz",
    lines=True,
    nrows=100,
    compression='gzip',
)
df_yt_metadata.head(2)

#### 1.1.4 user-comment matrices

In [None]:
# !ls -lh /dlabdata1/youtube_large/youtube_comments.tsv.gz

In [None]:
# Load a sample of the user-comment matrices (gzip-compressed,
# tab-separated): only the first 100 rows are read.
df_yt_comments = pd.read_csv(
    f"{DATA_FOLDER}youtube_comments.tsv.gz",
    compression='gzip',
    sep="\t",
    nrows=100,
)
df_yt_comments.head()

#### 1.1.5 raw comments

In [None]:
# !ls -lh /dlabdata1/youtube_large/youtube_comments.ndjson.zst

In [None]:
def line_jsonify(line):
    """
    Rewrite one raw text line into a bracketed string meant for ``json.loads``.

    The line is wrapped in square brackets and then passed through a fixed,
    ordered series of textual substitutions that strip the quotes around
    embedded ``[{ ... }]`` blocks and collapse doubled double-quotes.

    :param line: string to parse and jsonify
    :return: the rewritten line, enclosed in square brackets
    """
    # Literal substitutions applied before the regex pass. The order is
    # significant: each pattern runs on the output of the previous ones.
    before_regex = (
        # drop the quote in front of "[{" and the one after "}]"
        ("\"[{", "[{"),
        ("}]\"", "}]"),
        # collapse doubled quotes next to braces and colons
        ("{\"\"", "{\""),
        ("\"\"}", "\"}"),
        ("\"\":\"\"", "\":\""),
        (":\"\"", ":\""),
        ("\"\":", "\":"),
        # collapse doubled quotes around a comma; the pattern is applied twice
        # on purpose, since the first pass can expose a new occurrence
        ("\"\",\"\"", "\",\""),
        ("\"\",\"\"", "\",\""),
        # backslash followed by a doubled quote -> backslash + single quote
        ("\\\"\"", "\\\""),
        # backslash + quote right before ",[" gets an extra backslash and space
        ("\\\",[", "\\\\ \",["),
    )
    # Literal substitutions applied after the regex pass.
    after_regex = (
        ("true,\"\"", "true,\""),
        ("false,\"\"", "false,\""),
    )

    text = "[" + line + "]"

    for old, new in before_regex:
        text = text.replace(old, new)

    # a comma followed by a doubled quote loses one quote, unless another
    # comma comes right after the doubled quote
    text = re.sub(r',\"\"(?!\,)', ',\"', text)

    for old, new in after_regex:
        text = text.replace(old, new)

    return text

In [None]:
class Zreader:
    """Stream the lines of a Zstandard-compressed text file.

    The file is decompressed in fixed-size chunks so that it never has to fit
    in memory. Instances can be used as context managers; otherwise call
    :meth:`close` once reading is finished to release the file handle.
    """

    def __init__(self, file, chunk_size=16384):
        """Open *file* and attach a streaming Zstandard decompressor to it.

        :param file: path of the ``.zst`` file to read
        :param chunk_size: number of decompressed bytes requested per read
        """
        self.fh = open(file, 'rb')
        print(f"reading {file} in chunks ...")
        self.chunk_size = chunk_size
        # max_window_size is 2**31 bytes; presumably the archive was written
        # with a large compression window -- confirm before lowering it.
        self.dctx = zstandard.ZstdDecompressor(max_window_size=2147483648)
        self.reader = self.dctx.stream_reader(self.fh)
        # Text received so far that is not yet terminated by a newline.
        self.buffer = ''

    def close(self):
        """Close the decompression stream and the underlying file handle."""
        try:
            self.reader.close()
        finally:
            self.fh.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def readlines(self):
        """Yield the decompressed file one line at a time, without newlines.

        Bytes are decoded as UTF-8 with an incremental decoder, so a
        multi-byte character split across two chunks is decoded correctly
        instead of being turned into replacement characters. Invalid byte
        sequences are still replaced with U+FFFD. A final line that is not
        terminated by a newline is yielded as well.

        :return: generator of ``str`` lines
        """
        import codecs

        decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
        nb_chunk = 0
        while True:
            nb_chunk = nb_chunk + 1
            if nb_chunk % 5000 == 0:
                print("number of chunks read: ", nb_chunk)

            raw = self.reader.read(self.chunk_size)
            # Test the raw bytes for end-of-stream: the decoded text may be
            # empty while the decoder waits for the rest of a character.
            if not raw:
                break

            # Everything before the last "\n" is complete; the remainder is
            # kept and prepended to the next chunk.
            *complete, self.buffer = (self.buffer + decoder.decode(raw)).split("\n")
            yield from complete

        # Flush bytes still held by the decoder and emit the unterminated
        # last line, if there is one.
        tail = self.buffer + decoder.decode(b"", final=True)
        self.buffer = ''
        if tail:
            yield tail

In [None]:
# Cap on the line index: lines with index 0..NB_OF_LINES (inclusive) are kept,
# the first of which is later used as the column header.
NB_OF_LINES = 350000
inp_file = f"{DATA_FOLDER}youtube_comments.ndjson.zst"
# NOTE(review): the reader's file handle stays open after this cell -- confirm
# whether later cells keep reading from it before closing it here.
reader = Zreader(inp_file, chunk_size=4092)
lines_json = []

for i, line in enumerate(reader.readlines()):
    # stop as soon as the index passes the cap
    if i > NB_OF_LINES:
        break
    # each raw line is rewritten into a JSON array and then parsed
    line_json = json.loads(line_jsonify(line))
    lines_json.append(line_json)

print("==> number of lines read:", len(lines_json))

# First parsed line -> column names; every following line -> one row.
df_yt_comments_raw = pd.DataFrame(lines_json[1:], columns=lines_json[0])
df_yt_comments_raw.head()

### 1.2. Graphtreon dataset

#### 1.2.1 List with all creator names.

In [None]:
# !ls -lh /dlabdata1/youtube_large/creators.csv

In [None]:
# Load the CSV listing all creator names (read in full, default separator).
df_gt_creators = pd.read_csv(f"{DATA_FOLDER}creators.csv")
df_gt_creators.head()

#### 1.2.2 All graphtreon time-series

In [None]:
# !ls -lh /dlabdata1/youtube_large/final_processed_file.jsonl.gz

In [None]:
# Load a sample of the Graphtreon time-series (gzip-compressed JSON Lines):
# only the first 100 records of final_processed_file.jsonl.gz are read.
df_gt_timeseries = pd.read_json(
    f"{DATA_FOLDER}final_processed_file.jsonl.gz",
    lines=True,
    nrows=100,
    compression='gzip',
)
df_gt_timeseries.head()

#### 1.2.3 Raw HTML of the pages in Graphtreon

In [None]:
!ls -lh /dlabdata1/youtube_large/pages.zip

In [None]:
# pages.zip: raw HTML of the pages in Graphtreon.

## 2. Merge data

### 2.1. Merge channels data with YouTube timeseries

In [None]:
# Join the time-series rows with the channel metadata. No join key is given,
# so pandas joins on the columns the two frames have in common, using its
# default inner join.
df_yt_timeseries_merged = pd.merge(df_yt_timeseries, df_yt_channels)
df_yt_timeseries_merged.head()

### 2.2. Merge YouTube timeseries and Graphtreon timeseries