In [43]:
# importing libraries
import gzip
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns
import scipy.stats as stats

# Preprocessing data

## Relevant datasets
The datasets that we will work with for our project are: 
- `yt_metadata_en.jsonl.gz`
- `df_channels_en.tsv.gz`
- `df_timeseries_en.csv.gz`
- `youtube_comments.tsv.gz`

## Reading Youtube video metadata 
As we are primarily interested in the collaborations between youtube channels, we will first start with processing the data within the `yt_metadata_en.jsonl.gz` dataset to identify collaborations between youtube channels. 

In [44]:
### import the datasets
data_path = './data/yt_metadata_en.jsonl.gz'
channels_path = './data/df_channels_en.tsv'

### Preparation: data will be loaded and processed in chunks

# size of each chunks
chunk_size = 100000
# limit to number of chunks read
chunks_to_read = 3

# initialise list to store filtered chunks of data
filtered_chunk_list = []

# define category and time to filter according to
filter_category = 'Gaming'
filter_date = '2016-01-01'

# opening .gzip compressed file
with gzip.open(data_path, 'rt', encoding='utf-8') as file:
    # iterate through number of chunks in the file to read each chunk
    for i, chunk in enumerate(pd.read_json(file, lines=True, chunksize=chunk_size, convert_dates=['upload_date'])):
        # only read chunks_to_read many chunks
        if i < chunks_to_read:
            # filter for videos according to category and date conditions defined previously
            filtered_chunk = chunk[
                (chunk['categories'] == filter_category) & 
                (chunk['upload_date'] > pd.Timestamp(filter_date))
            ]
            filtered_chunk_list.append(filtered_chunk)
        else:
            break  # exit loop once enough chunks are read

# concatenate all filtered dataframes into a single dataframe
final_df = pd.concat(filtered_chunk_list, ignore_index=True)

# final_df contains data of first chunks_to_read number of chunks of data filtered according to filter_category and posted after filter_date
#display(final_df)

## Preprocessing video descriptions

With the filtered dataframes, we can search for indications of collaboration between youtube channels by identifying channel links in the video descriptions. 

According to the [official youtube website](https://support.google.com/youtube/answer/6180214?hl=en), youtube channel urls can take the following 4 forms: 
1. Channel url: youtube.com/channel/unique_channel_id
2. Handle url: youtube.com/@youtube_channel
3. Custom url: youtube.com/c/youtube_channel
4. Legacy username url: youtube.com/user/youtube_channel_username

Thus, we will search video descriptions for links with any format from above. Formats 1 and 4 contains the channel ID as part of the link while formats 2 and 3 contain channel names. 

Channel names found will be appended to an additional column while channel IDs found will be appended to a separate column in the dataframe. If more than one channel is mentioned, all channels will be appeneded. 


In [68]:
### Identifying channel collaboration: scraping youtube channel links from the description

data = final_df.copy()

# function to extract all youtube channel IDs and channel names (4.4 seconds to run)
def extract_youtube_channels(text):
    
    # regex for finding YouTube channel URLs
    channel_id_pattern = r'https?://www\.youtube\.com/(?:channel/|user/)([\w-]+)'
    channel_name_pattern = r'https?://www\.youtube\.com/(?:@|c/)([\w-]+)'

    # find all links that match the patterns
    channel_ids = re.findall(channel_id_pattern, text)
    channel_names = re.findall(channel_name_pattern, text)

    # # join the IDs and names found (or None if not found)
    channel_id = channel_ids if channel_ids else None
    channel_name = channel_names if channel_names else None

    return channel_id, channel_name

# apply the function and create two new columns
data[['mentioned_channel_ID', 'mentioned_channel_name']] = data['description'].apply(
    lambda x: pd.Series(extract_youtube_channels(x))
)

In [70]:
data[['channel_id' , 'description', 'mentioned_channel_ID', 'mentioned_channel_name']].dropna().head(20)

Unnamed: 0,channel_id,description,mentioned_channel_ID,mentioned_channel_name
2442,UCzV74pBOfwtAMuOm-Jg8gqg,Welcome to United UHC Season 6! It’s back afte...,"[BrickPlays, xNestorio, BiboyQG, theflyingbows...","[flanke, pikachuplays, RKYmc]"
2443,UCzV74pBOfwtAMuOm-Jg8gqg,Welcome to United UHC Season 6! It’s back afte...,"[BrickPlays, xNestorio, BiboyQG, theflyingbows...","[flanke, pikachuplays, RKYmc]"
2444,UCzV74pBOfwtAMuOm-Jg8gqg,Welcome to United UHC Season 6! It’s back afte...,"[BrickPlays, xNestorio, BiboyQG, theflyingbows...","[flanke, pikachuplays, RKYmc]"
2483,UCzV74pBOfwtAMuOm-Jg8gqg,"In today's video, we're back on Hypixel for so...","[Grapeapplesauce, PrivateFearless, StrauberryJam]",[canadiancraft]
2485,UCzV74pBOfwtAMuOm-Jg8gqg,"In today's video, we're back on Hypixel for so...","[Grapeapplesauce, PrivateFearless, StrauberryJam]",[canadiancraft]
2545,UCzV74pBOfwtAMuOm-Jg8gqg,Hello everyone and welcome to United UHC Seaso...,"[xNestorio, BiboyQG, UCABf02qOye7XYapcK1M45LQ,...","[shutupbrick, Spifey]"
2547,UCzV74pBOfwtAMuOm-Jg8gqg,Hello everyone and welcome to United UHC Seaso...,"[xNestorio, BiboyQG, UCABf02qOye7XYapcK1M45LQ,...","[shutupbrick, Spifey]"
2549,UCzV74pBOfwtAMuOm-Jg8gqg,Hello everyone and welcome to United UHC Seaso...,"[xNestorio, BiboyQG, UCABf02qOye7XYapcK1M45LQ,...","[shutupbrick, Spifey]"
2550,UCzV74pBOfwtAMuOm-Jg8gqg,Hello everyone and welcome to United UHC Seaso...,"[xNestorio, BiboyQG, UCABf02qOye7XYapcK1M45LQ,...","[shutupbrick, Spifey]"
2628,UCzV74pBOfwtAMuOm-Jg8gqg,Hello everyone and welcome to United UHC Seaso...,[dedreviil],[cudss]


In [71]:
### Checking for number of videos without a channel id or without a name mentioned in it's description

data[['channel_id' , 'description', 'mentioned_channel_ID', 'mentioned_channel_name']].isnull().sum() # channel_id and description are included as a sanity check

channel_id                    0
description                   0
mentioned_channel_ID      34245
mentioned_channel_name    41839
dtype: int64

In [72]:
# number of videos without a channel name AND without id mentioned
data[(data['mentioned_channel_ID'].isnull()) & (data['mentioned_channel_name'].isnull())]['title'].count()

32264

In [73]:
# number of videos(rows) in current dataframe
data.shape[0]

44132

## Consolidating youtube metadata

After extracting channels mentioned in the description of each video, video description and title are no longer relevant to our analysis. Thus, the `'description'` and `'title'` columns will be dropped. 

In [74]:
# drop irrelevant columns
data.drop(['description' , 'title'] , axis = 1, inplace= True) # tags are temporarily kept for additional category information

## Constructing Video Collaboration dataset

From the dataset containing the channel IDs/names of channels mentioned in each video's description, we will construct a dataset that contains videos with channel collaboration (represented by the channel id of the collaborating channel). 

To do so, we need the new dataset to satisfy 2 conditions
1. Channel ID/name listed as the collaborating channel must be different from channel ID/name of video creator. (A channel cannot collaborate with itself)
2. Channel ID should not be empty 

Channel names identified in the description in column `'mentioned_channel_name'` (without corresponding channel ids) will be matched to its corresponding channel ID in the `channel_ids_and_names` dataset (constructed from the given `df_channels_en.tsv.gz` dataset). (Missing values will be discussed in a later section)

We chose to keep channel IDs as the identifier for channels because the dataset `df_timeseries_en.csv.gz` that we will subsequently use is based on channel ID.  

In [75]:
# read channels dataset
df_channels = pd.read_csv(channels_path, sep ='\t')

### identify columns to drop
df_channels.columns

# keep and rename channel id and channel name for readability
channel_ids_and_names = df_channels[['channel' , 'name_cc']].copy()
channel_ids_and_names.rename(columns={"channel": "channel_id", "name_cc": "channel_name"} , inplace= True)

In [76]:
## process channel_ids_and_names dataset for better efficiency 
unique_channel_ids = channel_ids_and_names.channel_id.unique()
# convert to a set for faster loop
unique_ids_set = set(unique_channel_ids)
# create a dictionary for faster lookup
channel_name_to_id = pd.Series(channel_ids_and_names.channel_id.values, index=channel_ids_and_names.channel_name.str.lower()).to_dict()

### Missing collaboration data

As we are only able to work on channel IDs that exist in the 

In [97]:
# load youtube timeseries data
timeseries_path = './data/df_timeseries.tsv'

df_timeseries = pd.read_csv(timeseries_path, sep ='\t')

timeseries_channels_and_ids = set(df_timeseries['channel'])

missing = []
for id in unique_ids_set: 
    if id not in timeseries_channels_and_ids: 
        missing.append(id)

display(len(missing))
display(len(timeseries_channels_and_ids))

missing


2954

153550

['UCNv8pE-nHTAAp77nXiAB9AA',
 'UCQF_5AVYaUcnRFJ_UmMgKTw',
 'UCrOsk5nMMB21_9f6YkHsVnw',
 'UCIV98HBmHcD75nlIflN-pBA',
 'UCnWhLFvHOCXs3Zzf5Tl-j1w',
 'UCHPK_NE_YjMQYuiHJOvclxA',
 'UCyFEbj0vRX7ee40sP5AEQ_Q',
 'UCbxyh96etyXVSAoDfs6KCrg',
 'UCRMm8HgS0cjsvi87BBxW5OA',
 'UC1CUhQQjVS13V6cNKJyePDg',
 'UCZNFPjN3ELUM4EkNPhLM1Pg',
 'UC6SIzgo0y7WEqk6xHE9lXgA',
 'UC4YX0qedpTItmlRcD1WCb6g',
 'UC6eyKv82B3XEvAccxVITpFg',
 'UCBUZoe2PYlxOTUmJlwfdqjw',
 'UCFF6_zrxVZcdRPdd_KeAa9A',
 'UC5m2A6iRZHTK7fPFc_VlVVg',
 'UCwsRWmIL5XKqFtdytBfeX0g',
 'UCtdwNey4LZl_BNkzScVgp2w',
 'UCgv1lXQsIDC8vwo47ov_Svg',
 'UCU2ADcnhJ-_X-nP4VhsEQug',
 'UCOmGCUH-iJeunp4ZKugoNmg',
 'UC0QdW-H7_l0zh_CoNhlwoBw',
 'UCAbEBpzn83lFC7REoEdKZTA',
 'UCSZ05t6CcC8sbvPzsoxXcPA',
 'UCDxY7yP73nhobjLiUOm8how',
 'UCOdq4xrTO0mVVAvLL92BcTQ',
 'UCMEZFwfDZ0f4rWo1Sapa64Q',
 'UCWru2VNjmRKKS5n4x6fKxQQ',
 'UCc8SBuV_eV6tkPcYeTUvh0Q',
 'UCSuCGtC11rnbrrKJwGKFCjQ',
 'UCdDDy9gVPy-PHtTEbdFv_nw',
 'UC3tF8DWtB50nQPRLm0H5hIg',
 'UCDKeGgxhX5FkBb7VkTKWlRQ',
 'UCGIGnJQn4EP

In [98]:
missing = []
for id in timeseries_channels_and_ids: 
    if id not in unique_ids_set: 
        missing.append(id)

display(len(missing))
display(len(timeseries_channels_and_ids))

missing

20034

153550

['UCHc8g6lkDJP_9c-M62H--NA',
 'UCHxzqr0KCn6eTZphFXNLJOw',
 'UCjSmS2o58FB1D8MFQvCoQlA',
 'UC-8k5HnsuygWmS6JNixneyw',
 'UCTnkwivZEY1zcMuZURkBejA',
 'UC3J4Q1grz46bdJ7NJLd4DGw',
 'UChh_-u0cSfXC591tv-FaILg',
 'UCsaYTBSz6arpKK86yVCFBRg',
 'UCNd_qzD8PscXFOg34-fNEvA',
 'UC_r9ljm7KNUWllyBj2i9DRQ',
 'UCuzN6cBCw4nu00eu-E1j9Sw',
 'UC7n-IAZYx9RMYlBSQVWmQ9Q',
 'UCVweGcPlJAes8O9QDUbprzg',
 'UCkIXPYb4giFc5v5Jmlgx_Og',
 'UC7gX4DlceCyRoPIPq3VcikA',
 'UCPe20kE4po_Llzm6A9FSygg',
 'UCLv7O2H5fkDW_1fwADs7qXg',
 'UCUDjK7Bi90oOL29IzotiAnQ',
 'UCOU-CW3JPg_mHbggOdWLM7g',
 'UC7_ZuvcqVBLEeWI8bJhpJ_w',
 'UCGPQfi49R5iu0KVgn8RJZTQ',
 'UCxwwaGV0z2bgOanza6m23EQ',
 'UCVNErcOrGeS5YT6ZWq0l1rA',
 'UCnDIscQ8ztUqxmcNZ8VqLJw',
 'UCuQqrXbEJiJg7R2UxsAArJw',
 'UCv5W05ReW4BWbcg1TxOKRdw',
 'UCS2-kZPs6TsBzhhOZrXF81Q',
 'UCwXQr_iHtgzlCoawktySu2w',
 'UCy2y4OLrVEo8gxFds8UwtAw',
 'UCcSpNl-Ff-ktEy-aLP1_NxA',
 'UCxYkZNi6G2EJqwCZTdnrxlA',
 'UC-khv-3jEhk6DN4KVglHYkA',
 'UCf-Szc6YLd8rMnMXGThznFg',
 'UC6WjfF5643almqtPXA-5dqg',
 'UCxomiE1Zu5g

In [None]:
### Identifying missing channel names 

In [77]:
# function to map channel name to channel ID

def get_channel_ids_from_names(mentioned_named, channel_name_to_id):
    def process_entry(names):
        # Handle missing values
        if not names:
            return None
        # Lookup each name in the channel_name_to_id dictionary
        valid_ids = [channel_name_to_id.get(name.strip().lower()) for name in names]

        ## following statements strip out channels with missing IDs 
        # (remove when sure about mapping without filtering out missing channel ids)
        # valid_ids = [channel_name_to_id.get(name.strip().lower()) for name in names if name.strip().lower() in channel_name_to_id]
        # Filter out None values in case a name wasn't found in the dictionary
        # valid_ids = [id for id in valid_ids if id is not None]  ## problematic for processing missing data
        return valid_ids if valid_ids else None

    # list comprehension instead of apply for potentially better performance
    # returns a list of channel IDs for channels names mentioned
    return [process_entry(entry) for entry in mentioned_named]

In [78]:
## when a youtube channel mentions its own channel in the video description, it is not a collaboration. Thus, all such instances will be dropped.

# function to filter out all self-mentioning videos according to channel ID
def filter_ids_by_not_self_mentioning(df, unique_ids_set):
    # Define a vectorized function
    def process_entry(mentioned_ids, channel_id):
        if not mentioned_ids:
            return None
        ids = [id.strip() for id in mentioned_ids]
        valid_ids = [id for id in ids if id != channel_id]
        return valid_ids if valid_ids else None
        # do we need to keep converting channel ids between list and string or can we just keep it as a list

    # Split, process, and rejoin without using apply row-wise
    filtered = [process_entry(m_id, c_id) for m_id, c_id in zip(df['mentioned_channel_ID'], df['channel_id'])]
    
    return filtered

In [79]:
# map channel names to channel ids 
data['ids_from_names'] = get_channel_ids_from_names(data['mentioned_channel_name'], channel_name_to_id)

# filter all self-mentioning channels
data['filtered_mentioned_channel_ID'] = filter_ids_by_not_self_mentioning(data[['mentioned_channel_ID' ,
                                                                                                   'channel_id']], unique_ids_set)

# filter videos with only self-mentioning channels linked in its description
data[['channel_id' , 'filtered_mentioned_channel_ID' ]].dropna().head(20)

# sanity check: size of filtered non-self-mentioning channel id 
data.filtered_mentioned_channel_ID.dropna().shape[0]

8939

In [57]:
# Step 2: Remove redundant IDs
# a) Ensure ids_from_names are not the same as the channel_id
# b) Ensure ids_from_names do not appear in filtered_mentioned_channel_ID

def remove_redundant_ids(row):
    # Split the ids into a list, remove any whitespace, and filter out any empty strings
    valid_ids = list(filter(None, [x.strip() for x in str(row['ids_from_names']).split(',')]))
    mentioned_ids = set(filter(None, [x.strip() for x in str(row['filtered_mentioned_channel_ID']).split(',')]))
    
    # Remove the channel_id from the valid_ids if it's present
    valid_ids = [id for id in valid_ids if id != row['channel_id']]
    
    # Remove any id from valid_ids if it's already present in mentioned_ids
    valid_ids = [id for id in valid_ids if id not in mentioned_ids]
    
    # Join the valid ids back into a string
    return valid_ids if valid_ids else pd.NA

# Apply the function to each row of the dataframe
data['clean_ids_from_names'] = data.apply(remove_redundant_ids, axis=1)


In [58]:
# sanity check: size of filtered channel id extracted from names
len(data['ids_from_names'].dropna())

2293

In [59]:
data[['channel_id' , 'filtered_mentioned_channel_ID' , 'mentioned_channel_name' , 'ids_from_names' ]].dropna()

Unnamed: 0,channel_id,filtered_mentioned_channel_ID,mentioned_channel_name,ids_from_names
2442,UCzV74pBOfwtAMuOm-Jg8gqg,"[UC-vlWJoHuVKm51TEfPDik8A, UC_QHwXH3BX4M2qxnGq...","[flanke, pikachuplays, RKYmc]","[UCJs2lYkiApLerCx1VqZyi7Q, None, None]"
2443,UCzV74pBOfwtAMuOm-Jg8gqg,"[UC-vlWJoHuVKm51TEfPDik8A, UC_QHwXH3BX4M2qxnGq...","[flanke, pikachuplays, RKYmc]","[UCJs2lYkiApLerCx1VqZyi7Q, None, None]"
2444,UCzV74pBOfwtAMuOm-Jg8gqg,"[UC-vlWJoHuVKm51TEfPDik8A, UC_QHwXH3BX4M2qxnGq...","[flanke, pikachuplays, RKYmc]","[UCJs2lYkiApLerCx1VqZyi7Q, None, None]"
2545,UCzV74pBOfwtAMuOm-Jg8gqg,"[UCABf02qOye7XYapcK1M45LQ, UCeFfuIChD6ZQt1OPwM...","[shutupbrick, Spifey]","[UCjOUBkgXwhurLFYfEJAfVKQ, UCaawTX2yTGhd3xKwKf..."
2547,UCzV74pBOfwtAMuOm-Jg8gqg,"[UCABf02qOye7XYapcK1M45LQ, UCeFfuIChD6ZQt1OPwM...","[shutupbrick, Spifey]","[UCjOUBkgXwhurLFYfEJAfVKQ, UCaawTX2yTGhd3xKwKf..."
...,...,...,...,...
43643,UCzGImHRvnk3TIQtjZ7THiig,"[UCAqWWvxaTbWtoNDqny5u3fg, UCLotxgVldHHvGcUXEW...",[GManGaming],[None]
43645,UCzGImHRvnk3TIQtjZ7THiig,[UCNLgp2Ewml3Gp5fT4uh3raw],[GManGaming],[None]
43657,UCzGImHRvnk3TIQtjZ7THiig,[UCimaxNWaP51jV_Rzc976ohQ],[GManGaming],[None]
43659,UCzGImHRvnk3TIQtjZ7THiig,[UCPdvp_45Koo6WDCvFK3G4Dw],[GManGaming],[None]


## Missing collaboration data