# Process links in video descriptions

This notebook processes links in video descriptions. The goal is to extract links that may be related to sponsors.

In [30]:
import re
import requests
from cachetools import cached, TTLCache
from pyspark.sql.functions import col, udf, explode

from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

# Setup a cache for the requests
cache = TTLCache(maxsize=1024, ttl=86400)

In [2]:
USE_SUBDATASETS = True

PATH_METADATAS_SRC = '../data/subdata/yt_metadata_en_sub' if USE_SUBDATASETS else '../data/yt_metadata_en.tsv.gz'

## Extract links from video descriptions

Firstly, we extract links using regular expressions. We perform a first filtering to remove urls from sites that are generally not related to sponsors such as Youtube, Twitter, Facebook, Wikipedia, Discord, etc...

In [3]:
PATH_METADATAS_URLS_DST = '../data/generated/yt_metadata_en_urls.parquet'
PATH_METADATAS_NO_URLS_DST = '../data/generated/yt_metadata_en_no_urls.parquet'

In [4]:
# Load the invalid URLs
PATH_INVALID_URLS = '../data/invalid_urls.csv'

invalid_urls_reg = []
with open(PATH_INVALID_URLS, 'r') as f:
    for line in f:
        invalid_urls_reg.append(fr"(?i)({line.strip()})")

In [5]:
def get_urls(description):
    url_regex = r"(https?:\/\/)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)"
    urls = []
    for line in description.split("\n"):
        if re.search(url_regex, line):
            urlss = [x.group() for x in re.finditer(url_regex, line)] # Find all urls in the line
            urlss = [x for x in urlss if not any([re.search(reg, x) for reg in invalid_urls_reg])] # Filter out invalid urls
            urls.extend(urlss)
    urls = set(urls) # Remove duplicates
    return urls

# Test the function on basic examples
assert(not get_urls('There is no link.'))
assert(get_urls('This is a link: www.special.com/') == {'www.special.com/'})
assert(get_urls('This is a link: https://www.special.com/ and this is another link: https://www.youtube.com/watch?v=2') == {'https://www.special.com/'})
assert(get_urls('This is a link: https://www.special.com/ \n and this is another link: www.special.com/') == {'www.special.com/', 'https://www.special.com/'})

### Write sponsored videos

In [17]:
metadatas = spark.read.json(PATH_METADATAS_SRC)

# Get urls and count of urls in the description
get_urls_udf = udf(lambda x: get_urls(x))
metadatas = metadatas.withColumn('urls', get_urls_udf(col('description')))
len_udf = udf(lambda x: len(x))
metadatas = metadatas.withColumn('urls_count', len_udf(col('urls')))

# Drop unneeded columns and take only the videos with at least one url
metadatas_urls = metadatas.drop('description').filter(metadatas.urls_count > 0)

metadatas_urls.write.parquet(PATH_METADATAS_URLS_DST, mode="overwrite")

In [23]:
metadatas_urls.select('title', 'urls', 'urls_count').show(5)

+--------------------+--------------------+----------+
|               title|                urls|urls_count|
+--------------------+--------------------+----------+
|Gravity Falls The...|[http://www.zolta...|         8|
|DIY | VALENTINES ...|[iamDonareen@Gmai...|         1|
|Snowboarder hit b...|[http://bit.ly/Ne...|         7|
|Shah Rukh Khan Co...|[http://www.bolly...|         2|
|How Well Do You K...|[http://www.bolly...|         2|
+--------------------+--------------------+----------+
only showing top 5 rows



### Write non-sponsored videos

In [25]:
metadatas = spark.read.json(PATH_METADATAS_SRC)

# Get urls and count of urls in the description
get_urls_udf = udf(lambda x: get_urls(x))
metadatas = metadatas.withColumn('urls', get_urls_udf(col('description')))
len_udf = udf(lambda x: len(x))
metadatas = metadatas.withColumn('urls_count', len_udf(col('urls')))

# Drop unneeded columns and take only the videos with at least one url
metadatas_no_urls = metadatas.drop('description').filter(metadatas.urls_count == 0)

metadatas_no_urls.write.parquet(PATH_METADATAS_NO_URLS_DST, mode="overwrite")

In [26]:
metadatas_no_urls.select('title', 'urls', 'urls_count').show(5)

+--------------------+----+----------+
|               title|urls|urls_count|
+--------------------+----+----------+
|LOLO #finaLDance ...|  []|         0|
|Fresh Revelations...|  []|         0|
|Uncut - Grand Sta...|  []|         0|
|Boardwalk Bar Cra...|  []|         0|
|SKUNK ANANSIE "Be...|  []|         0|
+--------------------+----+----------+
only showing top 5 rows



## Resolve Bitly links (TODO)

We resolve Bitly links to get the original url. This is done using the Bitly API.

In [27]:
BITLY_API_KEY = '6eb2a9c9ec5950c276bf91b89ef2b1f229408807'

PATH_METADATAS_BITLY_DST = '../data/generated/yt_metadata_en_bitly.parquet'

In [None]:
# @cached(cache)
# def resolve_app_adjust_url(url, debug_mode=False):
#     if 'app.adjust.com' not in url:
#         return url
    
#     if debug_mode: print(f'Resolving app adjust url: {url}')
#     try:
#         response = requests.get(url)
        
#         if response.status_code == 200:
#             soup = bs4.BeautifulSoup(response.text, 'html.parser')
#             new_url = soup.find('a', class_='product-header__title app-header__title')
#             if debug_mode: print(f'Resolved adjust.com url: {new_url}')
#             return new_url if new_url is not None else url
#         else:
#             if debug_mode: print(f'Could not resolve adjust.com url: {url}')
#             return url

#     except Exception as e:
#         print(f'Could not resolve url {url}: {e}')
#         return url

In [28]:
@cached(cache)
def resolve_bitly_url(url, debug_mode=False):
    if 'bit.ly' not in url:
        return url

    # Remove http:// or https:// from the url
    url = url.replace('http://', '').replace('https://', '')

    if debug_mode: print(f'Resolving bit.ly url {url}...')
    try:
        # Post a request via the bit.ly API
        response = requests.post('https://api-ssl.bitly.com/v4/expand', headers={'Authorization': f'Bearer {BITLY_API_KEY}'}, json={'bitlink_id': url})

        # Retrieve the long url from the response
        if response.status_code == 200:
            new_url = response.json()['long_url']
            if debug_mode: print(f'\tResolved bit.ly url: {url} -> {new_url}')

            #if re.search(r"app.adjust.com", new_url):
            #    return resolve_app_adjust_url(new_url)
                    
            return new_url
        else:
            if debug_mode: print(f'\tCould not resolve bit.ly url: {url}, status code: {response.status_code}')
            return url

    except Exception as e:
        print(f'Could not resolve url {url}: {e}')
        return url

In [32]:
metadatas_urls = spark.read.parquet(PATH_METADATAS_URLS_DST)
resolve_bitly_url_udf = udf(lambda x: resolve_bitly_url(x))
metadatas_bitly = metadatas_urls.withColumn('urls', explode(col('urls')))
metadatas_bitly.write.parquet(PATH_METADATAS_BITLY_DST, mode="overwrite")

AnalysisException: cannot resolve 'explode(urls)' due to data type mismatch: input to function explode should be array or map type, not string;
'Project [categories#962, channel_id#963, crawl_date#964, dislike_count#965, display_id#966, duration#967L, like_count#968, tags#969, title#970, upload_date#971, view_count#972, explode(urls#973) AS urls#988, urls_count#974]
+- Relation [categories#962,channel_id#963,crawl_date#964,dislike_count#965,display_id#966,duration#967L,like_count#968,tags#969,title#970,upload_date#971,view_count#972,urls#973,urls_count#974] parquet
