# Process links in video descriptions

This notebook processes links in video descriptions. The goal is to extract links that may be related to sponsors.

In [6]:
import re
import os
import bs4
import requests
import pandas as pd
from tqdm import tqdm
import dask.dataframe as dd
from cachetools import cached, TTLCache

from dask.diagnostics import ProgressBar
# Register dask's ProgressBar globally so that dask computations report their progress
ProgressBar().register()

# Set up a cache for the functions that resolve URLs over HTTP:
# at most 1024 entries, each kept for 86400 seconds (24 hours).
# NOTE(review): this one cache object is passed to every @cached(cache) function in
# the notebook; confirm against the cachetools docs that identical arguments given
# to different functions cannot end up sharing a cache entry.
cache = TTLCache(maxsize=1024, ttl=86400)

In [7]:
# Switch for the input location: True reads from '../data/subdata/', False from '../data/'
USE_SUBDATASETS = False

In [8]:
# Root folder of the input data, selected by the USE_SUBDATASETS switch
if USE_SUBDATASETS:
    PATH = '../data/subdata/'
else:
    PATH = '../data/'

# Metadata dump that is read, and CSV file that receives the extracted URLs
PATH_METADATA_SRC = f'{PATH}yt_metadata_en.jsonl.gz'
PATH_SPONSORS_URLS_DST = '../data/generated/yt_sponsors_urls.csv'

## Extract links from video descriptions

First, we extract links using regular expressions. We then apply an initial filter that removes URLs from sites that are generally not related to sponsors, such as YouTube, Twitter, Facebook, Wikipedia, Discord, etc.

In [9]:
# Load the invalid URL patterns: each non-empty line of the file is a regular
# expression, wrapped here into a case-insensitive group.
PATH_INVALID_URLS = '../data/invalid_urls.csv'

invalid_urls_reg = []
with open(PATH_INVALID_URLS, 'r') as f:
    for line in f:
        pattern = line.strip()
        # Skip blank lines: an empty pattern would become "(?i)()", which matches
        # every string and would therefore filter out every single URL.
        if not pattern:
            continue
        invalid_urls_reg.append(fr"(?i)({pattern})")

In [10]:
# Compiled once at definition time instead of once per line of every description.
_URL_REGEX = re.compile(r"(https?:\/\/)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)")


def get_urls(description, invalid_patterns=None):
    """Return the distinct URLs found in a description, minus the blacklisted ones.

    Parameters
    ----------
    description : str
        Raw description text. Any non-string value (e.g. ``None`` or a float
        ``NaN`` standing for a missing description) is treated as containing no link.
    invalid_patterns : sequence of str or compiled patterns, optional
        Regular expressions; a URL for which ``re.search`` succeeds with any of
        them is discarded. Defaults to the notebook-level ``invalid_urls_reg``.

    Returns
    -------
    set of str
        The URLs exactly as written in the text, so ``www.x.com/`` and
        ``https://www.x.com/`` are kept as two different entries.
    """
    # Missing descriptions have no ``split`` method; they simply contain no URL.
    if not isinstance(description, str):
        return set()

    if invalid_patterns is None:
        invalid_patterns = invalid_urls_reg

    urls = set()  # a set removes duplicates as we go
    for line in description.split("\n"):
        for match in _URL_REGEX.finditer(line):
            url = match.group()
            # Keep the URL only if no blacklist pattern matches it
            if not any(re.search(reg, url) for reg in invalid_patterns):
                urls.add(url)
    return urls

# Sanity checks for get_urls on small hand-written descriptions:
# no link at all, a single link, a blacklisted link, and the same site written two ways
assert not get_urls('There is no link.')
assert get_urls('This is a link: www.special.com/') == {'www.special.com/'}
assert get_urls('This is a link: https://www.special.com/ and this is another link: https://www.youtube.com/watch?v=2') == {'https://www.special.com/'}
assert get_urls('This is a link: https://www.special.com/ \n and this is another link: www.special.com/') == {'www.special.com/', 'https://www.special.com/'}

In [11]:
# Make sure the destination folder exists, otherwise the first to_csv call fails
os.makedirs(os.path.dirname(PATH_SPONSORS_URLS_DST) or '.', exist_ok=True)

# Delete the destination file if it already exists: the loop below appends to it
# chunk by chunk, so leftovers from a previous run would be duplicated
if os.path.exists(PATH_SPONSORS_URLS_DST):
    os.remove(PATH_SPONSORS_URLS_DST)

# Stream the metadata in chunks of 100000 lines instead of loading it all at once
for df_metadata in tqdm(pd.read_json(PATH_METADATA_SRC, compression="infer", chunksize=100000, lines=True)):
    # Get urls and count of urls in the description
    df_metadata['urls'] = df_metadata['description'].apply(get_urls)
    df_metadata['urls_cnt'] = df_metadata['urls'].apply(len)

    # Keep only the videos whose description has at least one URL left after filtering
    sponsors = df_metadata.loc[df_metadata['urls_cnt'] > 0, ['display_id', 'urls', 'urls_cnt']]

    # Append to the file (no header row; the columns are display_id, urls, urls_cnt)
    sponsors.to_csv(PATH_SPONSORS_URLS_DST, mode='a', header=False, index=False)

438it [2:50:35, 20.69s/it]

## Resolve Bitly links (TODO)

We resolve Bitly links to get the original URL. This is done using the Bitly API.

In [None]:
# Read the Bitly access token from the BITLY_API_KEY environment variable rather
# than storing it in the notebook; it is empty when the variable is not set.
# NOTE(review): the token that was previously written here in plain text should be
# considered exposed and revoked.
BITLY_API_KEY = os.environ.get('BITLY_API_KEY', '')

In [None]:
# NOTE: `sponsors` is reassigned on every iteration of the extraction loop, so this
# previews only the rows kept from the last chunk that was processed.
sponsors.head()

In [None]:
# The cache object is shared with other resolvers, so the key carries the function
# name: otherwise the same URL passed to two different functions would share an entry.
@cached(cache, key=lambda url, *args, **kwargs: ('resolve_app_adjust_url', url))
def resolve_app_adjust_url(url, debug_mode=False, timeout=10):
    """Try to replace an app.adjust.com tracking link by the link of the page it leads to.

    Parameters
    ----------
    url : str
        URL to resolve. URLs that do not contain 'app.adjust.com' are returned as is.
    debug_mode : bool, optional
        When True, print what is being resolved and the outcome.
    timeout : float, optional
        Seconds to wait for the HTTP response, so a stalled server cannot block the
        whole run.

    Returns
    -------
    str
        The ``href`` of the first ``<a>`` element carrying the classes
        'product-header__title app-header__title' in the fetched page, or ``url``
        unchanged when the request fails, the status is not 200, or no such link
        is found. Results (including fallbacks) are cached per URL.
    """
    if 'app.adjust.com' not in url:
        return url

    if debug_mode: print(f'Resolving app adjust url: {url}')
    try:
        response = requests.get(url, timeout=timeout)

        if response.status_code != 200:
            if debug_mode: print(f'Could not resolve adjust.com url: {url}')
            return url

        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        link = soup.find('a', class_='product-header__title app-header__title')
        # soup.find returns a Tag (or None); callers expect a URL string, so take its href
        new_url = link.get('href') if link is not None else None
        if debug_mode: print(f'Resolved adjust.com url: {new_url}')
        return new_url if new_url else url

    except Exception as e:
        # Best effort: report the problem and keep the unresolved URL
        print(f'Could not resolve url {url}: {e}')
        return url

In [None]:
# The cache object is shared with other resolvers, so the key carries the function
# name: otherwise the same URL passed to two different functions would share an entry.
@cached(cache, key=lambda url, *args, **kwargs: ('resolve_bit_ly_url', url))
def resolve_bit_ly_url(url, debug_mode=False, timeout=10):
    """Expand a bit.ly short link into its long URL through the Bitly API.

    Parameters
    ----------
    url : str
        URL to resolve. URLs that do not contain 'bit.ly' are returned as is.
    debug_mode : bool, optional
        When True, print what is being resolved and the outcome.
    timeout : float, optional
        Seconds to wait for the HTTP response, so a stalled server cannot block the
        whole run.

    Returns
    -------
    str
        The 'long_url' field of the API response when the status is 200; otherwise
        (bad status, network error, unexpected payload) the ``url`` argument exactly
        as it was passed in. Results (including fallbacks) are cached per URL.
    """
    if 'bit.ly' not in url:
        return url

    # The request identifies the link without its scheme; keep `url` itself intact
    # so that a failed lookup returns exactly what the caller passed in.
    bitlink_id = url.replace('http://', '').replace('https://', '')

    if debug_mode: print(f'Resolving bit.ly url {bitlink_id}...')
    try:
        # Post a request via the bit.ly API, authenticated with the notebook-level BITLY_API_KEY
        response = requests.post(
            'https://api-ssl.bitly.com/v4/expand',
            headers={'Authorization': f'Bearer {BITLY_API_KEY}'},
            json={'bitlink_id': bitlink_id},
            timeout=timeout,
        )

        if response.status_code != 200:
            if debug_mode: print(f'\tCould not resolve bit.ly url: {bitlink_id}, status code: {response.status_code}')
            return url

        # Retrieve the long url from the response (a missing key is handled below).
        # It is returned as is, without any further resolution of the target.
        new_url = response.json()['long_url']
        if debug_mode: print(f'\tResolved bit.ly url: {bitlink_id} -> {new_url}')
        return new_url

    except Exception as e:
        # Best effort: report the problem and keep the unresolved URL
        print(f'Could not resolve url {url}: {e}')
        return url