# Process links in video descriptions

This notebook processes links in video descriptions. Our goal is to extract URLs that may be related to sponsors, hence giving us a way to identify sponsorships in videos.

In [1]:
import re
from pyspark.sql.functions import col, udf, explode
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, LongType, StringType, DateType, ArrayType, BooleanType

from pyspark.sql import SparkSession
import pyspark as ps
config = ps.SparkConf().setAll([
    ('spark.network.timeout', '3601s'),
    ('spark.executor.heartbeatInterval', '3600s'),
])
sc = ps.SparkContext('local', '', conf=config)
spark = SparkSession(sc)

In [2]:
USE_SUBDATASETS = False

PATH_METADATAS_SRC = '../data/subdata/yt_metadata_en_sub' if USE_SUBDATASETS else '../data/yt_metadata_en.jsonl'

## Extract links from video descriptions

Firstly, we extract links using regular expressions. We perform a first filtering to remove urls from sites that are generally not related to sponsors such as Youtube, Twitter, Facebook, Wikipedia, Discord, etc...

In [3]:
PATH_METADATAS_URLS_DST = '../data/generated/yt_metadata_en_urls.parquet'

In [4]:
# Load the invalid URLs
PATH_INVALID_URLS = '../data/invalid_urls.csv'

invalid_urls_reg = []
with open(PATH_INVALID_URLS, 'r') as f:
    for line in f:
        invalid_urls_reg.append(fr"(?i)({line.strip()})")

In [5]:
def get_urls(description):
    if description is None:
        return []
        
    url_regex = r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)"
    urls = []
    for line in description.split("\n"):
        if re.search(url_regex, line):
            urlss = [x.group() for x in re.finditer(url_regex, line)] # Find all urls in the line
            urlss = [x for x in urlss if not any([re.search(reg, x) for reg in invalid_urls_reg])] # Filter out invalid urls
            urls.extend(urlss)
    urls = list(set(urls)) # Remove duplicates
    return urls

# Test the function on basic examples
assert(get_urls('There is no link.') == [])
assert(get_urls('This is not a valid link: www.special.com/') == [])
assert(get_urls('This is a link: https://www.special.com/ and this is another link: https://www.youtube.com/watch?v=2') == ['https://www.special.com/'])
assert(get_urls('This is a link: https://www.special.com/ \n and this is an invalid link: www.special.com/') == ['https://www.special.com/'])

In [6]:
# Delete the output folder if it already exists
#!rm -f $PATH_METADATAS_URLS_DST # Linux
!PowerShell.exe -Command "Remove-Item -Path $PATH_METADATAS_URLS_DST -Recurse -Force" # Windows

Remove-Item : Impossible de trouver le chemin d'acc�s �
C:\Users\admin\Documents\ADA\Project\data\generated\yt_metadata_en_urls.parquet
�, car il n'existe pas.
Au caract�re Ligne:1 : 1
+ Remove-Item -Path ../data/generated/yt_metadata_en_urls.parquet -Recu ...
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    + CategoryInfo          : ObjectNotFound: (C:\Users\admin\...en_urls.parqu 
   et:String) [Remove-Item], ItemNotFoundException
    + FullyQualifiedErrorId : PathNotFound,Microsoft.PowerShell.Commands.Remov 
   eItemCommand
 


In [7]:
schema = StructType([
    StructField("categories",    StringType(),  True),
    StructField("channel_id",    StringType(),  True),
    StructField("crawl_date",    DateType(),    True),
    StructField("description",   StringType(),  True),
    StructField("dislike_count", DoubleType(),  True), # This field must be specified as a double as it is represented as a floating point number
    StructField("display_id",    StringType(),  True),
    StructField("duration",      IntegerType(), True),
    StructField("like_count",    DoubleType(),  True), # This field must be specified as a double as it is represented as a floating point number
    StructField("tags",          StringType(),  True),
    StructField("title",         StringType(),  True),
    StructField("upload_date",   DateType(),    True),
    StructField("view_count",    DoubleType(),  True)  # This field must be specified as a double as it is represented as a floating point number
])
    
metadatas = spark.read.json(PATH_METADATAS_SRC, schema=schema)

# Cast the dislike_count, like_count and view_count to their respective integer type
metadatas = metadatas \
    .withColumn("dislike_count", metadatas.dislike_count.cast(IntegerType())) \
    .withColumn("like_count", metadatas.like_count.cast(IntegerType())) \
    .withColumn("view_count", metadatas.view_count.cast(LongType()))

# Get urls and count of urls in the description
get_urls_udf = udf(lambda x: get_urls(x), ArrayType(StringType()))
metadatas = metadatas.withColumn('urls', get_urls_udf(col('description')))
len_udf = udf(lambda x: len(x), IntegerType())
metadatas = metadatas.withColumn('urls_count', len_udf(col('urls')))
has_urls_udf = udf(lambda x: x > 0, BooleanType())
metadatas = metadatas.withColumn('has_urls', has_urls_udf(col('urls_count')))

# Drop the description and take only the videos with at least one url
metadatas_urls = metadatas.drop('description')

In [8]:
metadatas_urls.filter(metadatas_urls.has_urls).select('display_id', 'urls', 'urls_count').show(5)

+-----------+--------------------+----------+
| display_id|                urls|urls_count|
+-----------+--------------------+----------+
|VPqJmODeZyk|[https://goo.gl/J...|         1|
|xLYGF-aCEHk|[https://goo.gl/J...|         1|
|TqzfdwSZdRc|[https://goo.gl/J...|         1|
|C-dn3p-ZTrM|[https://goo.gl/J...|         1|
|OP_njme3T84|[https://goo.gl/J...|         1|
+-----------+--------------------+----------+
only showing top 5 rows



In [10]:
metadatas_urls.filter(~metadatas_urls.has_urls).select('display_id', 'urls', 'urls_count').show(5)

+-----------+----+----------+
| display_id|urls|urls_count|
+-----------+----+----------+
|SBqSc91Hn9g|  []|         0|
|UuugEl86ESY|  []|         0|
|oB4c-yvnbjs|  []|         0|
|ZaV-gTCMV8E|  []|         0|
|cGvL7AvMfM0|  []|         0|
+-----------+----+----------+
only showing top 5 rows



## Resolve Bitly URLs

To further enrich our dataset, we write in a separate file all the Bitly URLs found in the dataset. We will then use various tools such as the Bitly API, or simple HTTP requests to resolve the URLs.

In [11]:
PATH_BITLY_URLS = '../data/generated/bitly_urls.tsv'

In [12]:
# Delete the output folder if it already exists
#!rm -f $PATH_BITLY_URLS # Linux
!PowerShell.exe -Command "Remove-Item -Path $PATH_BITLY_URLS -Recurse -Force" # Windows

In [13]:
bitly_urls = metadatas_urls.select('display_id', explode('urls').alias('url'))
bitly_urls = bitly_urls.filter(bitly_urls.url.like('%bit.ly%')).select('url').distinct()

bitly_urls.write.csv(PATH_BITLY_URLS, sep='\t', header=False)

We now assume that the **Bitly URLs have been resolved** and we can replace them by their corresponding URLs.

In [None]:
PATH_BITLY_RESOLVED_URLS = '../data/generated/bitly_resolved_urls.tsv'

In [None]:
def find_resolved_bitly_url(url, resolved_bitly_urls):
    if url in resolved_bitly_urls:
        return resolved_bitly_urls[url]
    else:
        return url



def find_resolved_bitly_urls(urls, resolved_bitly_urls):
    return [find_resolved_bitly_url(url, resolved_bitly_urls) for url in urls]

In [None]:
# Load the resolved bit.ly urls
resolved_bitly_urls = {}
with open(PATH_BITLY_RESOLVED_URLS, 'r') as f:
    for line in f:
        if line.startswith('bitly_url'):
            continue # Skip header

        bitly_url, long_url = line.strip().split('\t')
        if long_url is not None and long_url != '':
            resolved_bitly_urls[bitly_url] = long_url

print(f'Found {len(resolved_bitly_urls)} resolved bit.ly urls.')

In [None]:
# Retrieve the resolved bit.ly urls
find_resolved_bitly_urls_udf = udf(lambda x: find_resolved_bitly_urls(x, resolved_bitly_urls), ArrayType(StringType()))
metadatas_urls = metadatas_urls.withColumn('resolved_urls', find_resolved_bitly_urls_udf(col('urls')))

# Replace the old urls column with the new one
metadatas_urls = metadatas_urls.drop('urls').withColumnRenamed('resolved_urls', 'urls')

In [None]:
metadatas_urls.select('display_id', 'urls', 'urls_count').show(10)

In [None]:
metadatas_urls.write.parquet(PATH_METADATAS_URLS_DST, partitionBy=['has_urls'])