### Adding the sponsors and their categories to our big dataset

In this notebook we add a column called "is_sponsored", which is true if at least one of the domains found in the video's description is a sponsor (based on our classification of the 1000 most referenced domains) and false otherwise. We also add a column "domain_categories", which contains the categories of the sponsors present in the video's description.

In [1]:
import re
import shutil

import numpy as np
import pandas as pd

import pyspark as ps
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, explode, collect_list, collect_set
from pyspark.sql.types import FloatType, BooleanType, StringType
# Spark configuration for a local run; the long timeouts keep heavy jobs from being
# aborted while the driver is busy.
config = ps.SparkConf()
config.setMaster('local[*]') # write 'local' for single-threaded execution and 'local[*]' for multi-threaded execution.
config.set('spark.executor.heartbeatInterval', '3600s')
config.set('spark.network.timeout', '7200s')
config.set('spark.driver.memory', '16g') # Increase the driver memory to avoid HeapOverflow when repartitioning.

# getOrCreate() returns the already-running context when this cell is executed again,
# instead of failing because only one SparkContext may exist per kernel.
# In that case the configuration above is NOT re-applied: restart the kernel to change it.
sc = ps.SparkContext.getOrCreate(conf=config)
spark = SparkSession(sc)

In [2]:
# Input: video metadata with the list of domains found in each description.
PATH_METADATAS_DOMAINS_SRC = "data/generated/yt_metadata_en_domains.parquet"
# Input: classification of the domains (sponsor flag and category).
PATH_DOMAIN_CLASSIFICATION = 'data/domains_classification.csv'

In [3]:
# Video metadata, one row per video, with the `domains` extracted from its description.
metadatas_domains = spark.read.parquet(PATH_METADATAS_DOMAINS_SRC)

# Mapping {sponsored domain -> category}, built from the rows of the classification
# file whose `is_sponsored` flag equals 1.
domain_class = (
    pd.read_csv(PATH_DOMAIN_CLASSIFICATION)
    .loc[lambda classification: classification['is_sponsored'] == 1]
    .set_index('domain')['domain_category']
    .to_dict()
)

In [4]:
# Preview ten videos with their complete (untruncated) list of domains.
metadatas_domains.select('display_id', 'domains').show(n=10, truncate=False)

+-----------+----------------------------------------------------------------------------------------------------------+
|display_id |domains                                                                                                   |
+-----------+----------------------------------------------------------------------------------------------------------+
|Gt_r6SrOxv8|[tipeeestream.com]                                                                                        |
|xNoMrPUlBzw|[creativecommons.org]                                                                                     |
|HBtx33c7WDE|[ebay.com, fxo.co, cafepress.com, ebay.com, tinyurl.com, ebay.com, ebay.com, ebay.com, ebay.com, ebay.com]|
|r5JH6DDTpV8|[ebay.com, fxo.co, cafepress.com, ebay.com, ebay.com, ebay.com, ebay.com, tinyurl.com]                    |
|0ZoegwnAF_w|[whatcar.com, whatcar.com, whatcar.com]                                                                   |
|d2mfhzj1aOI|[whatcar.com, whatc

In [5]:
def domain_is_sponsor(domain):
    """Tell whether `domain` is a key of the module-level `domain_class` mapping.

    `domain_class` only holds the domains classified as sponsors, so membership
    means the domain is a sponsor.
    """
    sponsored_domains = domain_class
    return domain in sponsored_domains

def get_domain_category(domain):
    """Return the category stored in `domain_class` for `domain`, or None if it is absent."""
    return domain_class.get(domain)

# Wrap the Python lookups as Spark UDFs so they can be applied to a column.
domain_is_sponsor_udf = udf(domain_is_sponsor, BooleanType())
get_domain_category_udf = udf(get_domain_category, StringType())

# We first explode the domains (one row per video/domain pair) and add the missing
# information about sponsorships for each domain in each video.
# NOTE(review): explode() produces no row for a video whose `domains` array is null or
# empty, so such videos disappear from the result - confirm this is intended.
metadatas_domains = (
    metadatas_domains
    .withColumn("domain", explode("domains"))
    .drop("domains")
    .withColumn("is_sponsored", domain_is_sponsor_udf("domain"))
    .withColumn("domain_category", get_domain_category_udf("domain"))
)

# We regroup the information back together to have, for each video, the list of its
# domains, the distinct categories of its sponsored domains, and a sponsor flag that is
# true if at least one of the domains is sponsored.
# NOTE(review): after the aggregation only `display_id` and the aggregated columns remain;
# any other column of the input dataset is dropped - confirm this is intended.
metadatas_domains = (
    metadatas_domains
    .groupby("display_id")
    .agg(
        collect_list("domain").alias("domains"),
        # collect_set ignores nulls and de-duplicates, and yields a real array<string>
        # column. The previous Python UDF returned a list while being declared as
        # StringType, which stored the categories as a stringified list.
        collect_set("domain_category").alias("domain_categories"),
        collect_list("is_sponsored").alias("is_sponsoreds"),
    )
    .withColumn("is_sponsored", udf(lambda flags: any(flags), BooleanType())("is_sponsoreds"))
    .drop("is_sponsoreds")
)

In [11]:
# Sanity check: ten videos flagged as sponsored.
(
    metadatas_domains
    .select('display_id', 'domains', 'is_sponsored', 'domain_categories')
    .filter(col('is_sponsored'))
    .show(n=10)
)

+-----------+--------------------+------------+--------------------+
| display_id|             domains|is_sponsored|   domain_categories|
+-----------+--------------------+------------+--------------------+
|---jqfcks4Y|      [gamewisp.com]|        true|            [Agency]|
|--1udHoGWFY|[rstyle.me, rstyl...|        true|[Agency, Beauty, ...|
|--6bIzrgY3I|[play.google.com,...|        true|       [Application]|
|--7kvpZtKDE|    [streamlabs.com]|        true|       [Application]|
|--8bLO31olY|[google.com, even...|        true|            [Agency]|
|--BJEQK3lZ8| [doubtnut.app.link]|        true|       [Application]|
|--Bu6_czcUw|[play.google.com,...|        true|       [Application]|
|--D9SNFXu4k|[apps.apple.com, ...|        true|       [Application]|
|--F-0qN4gXU|[psnprofiles.com,...|        true|              [Shop]|
|--HsbU1M3rY|[play.google.com,...|        true|       [Application]|
+-----------+--------------------+------------+--------------------+
only showing top 10 rows



In [12]:
# Sanity check: ten videos that are NOT flagged as sponsored.
(
    metadatas_domains
    .select('display_id', 'domains', 'is_sponsored', 'domain_categories')
    .filter(~col('is_sponsored'))
    .show(n=10)
)

+-----------+--------------------+------------+-----------------+
| display_id|             domains|is_sponsored|domain_categories|
+-----------+--------------------+------------+-----------------+
|---1Yr75DDo|[pond5.com, foota...|       false|               []|
|---SLoSKUoc|[plus.google.com,...|       false|               []|
|---y_F0JOYY|[worldbossteam.co...|       false|               []|
|--1mZyOoA9o|[TheMelaninCode.c...|       false|               []|
|--1wdNZnvY8|[roblox.com, robl...|       false|               []|
|--2aaCq_Rg8|[store.playstatio...|       false|               []|
|--2lK1tBnY4|         [newsx.com]|       false|               []|
|--2nX2IK1fY|      [musicfog.com]|       false|               []|
|--34oO-c1zA|[wecrafter.com, w...|       false|               []|
|--3XHHGVZTg|[bhaarattoday.com...|       false|               []|
+-----------+--------------------+------------+-----------------+
only showing top 10 rows



In [14]:
# Output: parquet dataset holding the sponsorship information computed in this notebook.
PATH_METADATAS_SPONSOR_SRC = "data/generated/yt_metadata_sponsor_en_domains.parquet"

In [16]:
# Delete the output folder if it already exists.
# Spark writes a parquet dataset as a directory, so it has to be removed recursively;
# shutil.rmtree does this without shelling out to an OS-specific command.
# ignore_errors=True makes this a no-op when the folder does not exist yet.
shutil.rmtree(PATH_METADATAS_SPONSOR_SRC, ignore_errors=True)

In [17]:
# Persist the per-video sponsorship information as a parquet dataset.
sponsor_writer = metadatas_domains.write
sponsor_writer.parquet(PATH_METADATAS_SPONSOR_SRC)