## Add Features to the Metadata Dataset

In this notebook, we add new features to the metadata dataset to support the rest of our analysis. The resulting dataset has the following features:
* `diff_month`: the number of months elapsed between the earliest upload date in the dataset and the upload date of the video.
* `domains`: the domains of the video.
* `domain_categories`: the categories of the sponsored domains of the video.
* `is_sponsored`: whether the video is sponsored or not (i.e. contains sponsored domains).

In [1]:
from pyspark.sql.types import StructField, StructType, FloatType, BooleanType, StringType, DateType, IntegerType, DoubleType, ArrayType, LongType
from pyspark.sql.functions import udf, lit, explode, collect_list
from pyspark.sql import SparkSession
import pyspark as ps

import pandas as pd

from pyspark.sql import SparkSession
import pyspark as ps
# Spark configuration for this notebook.
# - heartbeat interval / network timeout: set to very long values, presumably so that
#   long-running tasks are not reported as lost — TODO confirm.
# - driver memory: increased to avoid heap errors when repartitioning.
config = (
    ps.SparkConf()
    .set('spark.executor.heartbeatInterval', '3600s')
    .set('spark.network.timeout', '7200s')
    .set('spark.driver.memory', '16g')
)

# Use 'local' for single-threaded execution and 'local[*]' for multi-threaded execution.
sc = ps.SparkContext(master='local[*]', appName='', conf=config)
spark = SparkSession(sc)

### Add Sponsored Domains to Videos

In [2]:
# Parquet dataset holding the video metadata together with the domains of each video.
PATH_METADATAS_DOMAINS_SRC = "data/generated/yt_metadata_en_domains.parquet"
# CSV file giving, for each domain, its sponsorship flag and its category.
PATH_DOMAIN_CLASSIFICATION = "data/domains_classification.csv"

In [3]:
# Video metadata, including the list of domains attached to each video.
metadatas_domains = spark.read.parquet(PATH_METADATAS_DOMAINS_SRC)

# Lookup table {domain: domain_category}, restricted to the rows of the
# classification file that are flagged as sponsored (is_sponsored == 1).
domain_class = (
    pd.read_csv(PATH_DOMAIN_CLASSIFICATION)
    .loc[lambda classification: classification['is_sponsored'] == 1]
    .set_index('domain')['domain_category']
    .to_dict()
)

In [4]:
# Preview the raw domain lists without truncating the cell contents.
metadatas_domains.select('display_id', 'domains').show(n=10, truncate=False)

+-----------+----------------------------------------------------------------------------------------------------------+
|display_id |domains                                                                                                   |
+-----------+----------------------------------------------------------------------------------------------------------+
|Gt_r6SrOxv8|[tipeeestream.com]                                                                                        |
|xNoMrPUlBzw|[creativecommons.org]                                                                                     |
|HBtx33c7WDE|[ebay.com, fxo.co, cafepress.com, ebay.com, tinyurl.com, ebay.com, ebay.com, ebay.com, ebay.com, ebay.com]|
|r5JH6DDTpV8|[ebay.com, fxo.co, cafepress.com, ebay.com, ebay.com, ebay.com, ebay.com, tinyurl.com]                    |
|0ZoegwnAF_w|[whatcar.com, whatcar.com, whatcar.com]                                                                   |
|d2mfhzj1aOI|[whatcar.com, whatc

In [5]:
def domain_is_sponsor(domain):
    """Tell whether `domain` is a sponsored domain.

    The check is a key lookup in the module-level `domain_class` mapping,
    which only contains the domains flagged as sponsored.

    Returns:
        True if `domain` is a key of `domain_class`, False otherwise.
    """
    is_known_sponsor = domain in domain_class
    return is_known_sponsor

def get_domain_category(domain):
    """Return the category stored for `domain` in `domain_class`.

    Returns:
        The value mapped to `domain`, or None when `domain` is not a key of
        the module-level `domain_class` mapping.
    """
    return domain_class.get(domain)

# Spark UDF wrappers so that the two lookups can be applied to DataFrame columns.
domain_is_sponsor_udf = udf(domain_is_sponsor, returnType=BooleanType())
get_domain_category_udf = udf(get_domain_category, returnType=StringType())

# Keep the per-video metadata aside: the aggregation below only retains `display_id`
# and the aggregated columns, whereas the following cells still need the other
# columns (e.g. `upload_date`). The columns recomputed here are dropped first so the
# cell can be re-run safely (dropping a missing column is a no-op).
# `dropDuplicates` guarantees one row per video after the join; if a `display_id`
# were repeated in the source, an arbitrary one of its rows is kept.
metadatas_by_video = metadatas_domains \
    .drop("domains", "domain_categories", "is_sponsored") \
    .dropDuplicates(["display_id"])

# We first explode the domains (one row per video and domain) and add the missing
# information about sponsorships for each domain in each video.
domains_exploded = metadatas_domains \
    .select("display_id", explode("domains").alias("domain")) \
    .withColumn("is_sponsored", domain_is_sponsor_udf("domain")) \
    .withColumn("domain_category", get_domain_category_udf("domain"))

# A video is sponsored as soon as one of its domains is sponsored.
any_sponsored_udf = udf(lambda flags: any(flags), BooleanType())
# The function returns a list, so the return type must be an array of strings
# (it used to be declared as a plain StringType). Sorting makes the order deterministic.
distinct_categories_udf = udf(lambda categories: sorted(set(categories)), ArrayType(StringType()))

# We regroup the information back together to have, for each video, the list of its
# domains, the distinct categories of its sponsored domains (`collect_list` skips the
# null categories of non-sponsored domains) and a sponsor flag.
domains_by_video = domains_exploded.groupby("display_id") \
    .agg(
        collect_list("domain").alias("domains"),
        collect_list("domain_category").alias("domain_categories"),
        collect_list("is_sponsored").alias("is_sponsoreds")
    ) \
    .withColumn("is_sponsored", any_sponsored_udf("is_sponsoreds")) \
    .withColumn("domain_categories", distinct_categories_udf("domain_categories")) \
    .drop("is_sponsoreds")

# Attach the aggregated domain features back to the rest of the video metadata.
metadatas_domains = domains_by_video.join(metadatas_by_video, on="display_id", how="inner")

In [11]:
# Preview of videos flagged as sponsored.
(
    metadatas_domains
    .select('display_id', 'domains', 'is_sponsored', 'domain_categories')
    .where(metadatas_domains['is_sponsored'])
    .show(n=10)
)

+-----------+--------------------+------------+--------------------+
| display_id|             domains|is_sponsored|   domain_categories|
+-----------+--------------------+------------+--------------------+
|---jqfcks4Y|      [gamewisp.com]|        true|            [Agency]|
|--1udHoGWFY|[rstyle.me, rstyl...|        true|[Agency, Beauty, ...|
|--6bIzrgY3I|[play.google.com,...|        true|       [Application]|
|--7kvpZtKDE|    [streamlabs.com]|        true|       [Application]|
|--8bLO31olY|[google.com, even...|        true|            [Agency]|
|--BJEQK3lZ8| [doubtnut.app.link]|        true|       [Application]|
|--Bu6_czcUw|[play.google.com,...|        true|       [Application]|
|--D9SNFXu4k|[apps.apple.com, ...|        true|       [Application]|
|--F-0qN4gXU|[psnprofiles.com,...|        true|              [Shop]|
|--HsbU1M3rY|[play.google.com,...|        true|       [Application]|
+-----------+--------------------+------------+--------------------+
only showing top 10 rows



In [12]:
# Preview of videos that are not flagged as sponsored.
(
    metadatas_domains
    .select('display_id', 'domains', 'is_sponsored', 'domain_categories')
    .where(~metadatas_domains['is_sponsored'])
    .show(n=10)
)

+-----------+--------------------+------------+-----------------+
| display_id|             domains|is_sponsored|domain_categories|
+-----------+--------------------+------------+-----------------+
|---1Yr75DDo|[pond5.com, foota...|       false|               []|
|---SLoSKUoc|[plus.google.com,...|       false|               []|
|---y_F0JOYY|[worldbossteam.co...|       false|               []|
|--1mZyOoA9o|[TheMelaninCode.c...|       false|               []|
|--1wdNZnvY8|[roblox.com, robl...|       false|               []|
|--2aaCq_Rg8|[store.playstatio...|       false|               []|
|--2lK1tBnY4|         [newsx.com]|       false|               []|
|--2nX2IK1fY|      [musicfog.com]|       false|               []|
|--34oO-c1zA|[wecrafter.com, w...|       false|               []|
|--3XHHGVZTg|[bhaarattoday.com...|       false|               []|
+-----------+--------------------+------------+-----------------+
only showing top 10 rows



### Add Months to Videos

In [6]:
def diff_month(d1, d2):
    """Return the number of calendar months from `d2` to `d1`.

    Only the year and month components are compared; the day is ignored, so
    2020-02-01 is one month after 2020-01-31.

    Args:
        d1: date of interest (any object exposing `.year` and `.month`), or None.
        d2: reference date (same requirements as `d1`), or None.

    Returns:
        The signed month count (negative when `d1` falls in an earlier month
        than `d2`), or None when either date is None.
    """
    # A null date reaches a Spark UDF as None; propagate it instead of raising
    # an AttributeError.
    if d1 is None or d2 is None:
        return None
    return (d1.year - d2.year) * 12 + (d1.month - d2.month)

# Spark UDF wrapper around `diff_month`, producing an integer column.
diff_month_udf = udf(diff_month, IntegerType())

# Earliest upload date of the dataset, used as the origin of the month counter.
start_date = metadatas_domains.agg({'upload_date': 'min'}).first()[0]

metadatas_domains = metadatas_domains.withColumn(
    'diff_month',
    diff_month_udf('upload_date', lit(start_date)),
)

In [7]:
# Preview of the dataset with the newly added columns.
metadatas_domains.show(n=10)

+-----------+--------------------+-----------------+------------+----------------+--------------------+----------+--------------------+-------------+--------+----------+--------------------+--------------------+-----------+----------+----------+
| display_id|             domains|domain_categories|is_sponsored|      categories|          channel_id|crawl_date|         description|dislike_count|duration|like_count|                tags|               title|upload_date|view_count|diff_month|
+-----------+--------------------+-----------------+------------+----------------+--------------------+----------+--------------------+-------------+--------+----------+--------------------+--------------------+-----------+----------+----------+
|---1Yr75DDo|[pond5.com, foota...|               []|       false|Film & Animation|UCarI394EclOjlRrm...|2019-11-18|http://www.pond5....|            2|      47|        11|free Chroma Key E...|Explosion 1 - gre...| 2012-07-14|      3708|        84|
|---SLoSKUoc|[pl

In [9]:
# Destination of the metadata dataset with the added features.
# NOTE(review): the input paths of this notebook are relative to 'data/', whereas this
# one is relative to '../data/' — confirm which working directory is intended.
PATH_ALL_METADATAS_SRC = '../data/generated/all_metadatas_en.parquet'

# Overwrite any previous export: the default save mode raises when the path already
# exists, which would make the notebook fail on every run after the first one.
metadatas_domains.write.mode('overwrite').parquet(PATH_ALL_METADATAS_SRC)