### Adding the sponsors and their categories to our big dataset

In this notebook we add a column called "is_sponsored", which is 1 if the domains contained in the video's description are sponsors (based on our classification of the 1000 most referenced domains) and 0 if they are not. We also add a column "sponsor_category", which contains the categories of the sponsors present in a video's description.

In [1]:
import re
from pyspark.sql.functions import col, udf, explode
from pyspark.sql.types import FloatType

import numpy as np

from pyspark.sql import SparkSession
import pyspark as ps
import pyspark.sql.functions as F

import math
from statsmodels.stats import diagnostic
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf

from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

import pandas as pd

# Spark settings: the network timeout and the executor heartbeat interval are
# both raised to roughly one hour.
spark_settings = [
    ('spark.network.timeout', '3601s'),
    ('spark.executor.heartbeatInterval', '3600s'),
]
config = ps.SparkConf().setAll(spark_settings)

# Local Spark context (empty application name) wrapped in a SparkSession.
sc = ps.SparkContext(master='local', appName='', conf=config)
spark = SparkSession(sc)

22/12/11 16:14:10 WARN Utils: Your hostname, LAPTOP-8QFB5E0N resolves to a loopback address: 127.0.1.1; using 172.31.43.89 instead (on interface eth0)
22/12/11 16:14:10 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/12/11 16:14:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Input locations, relative to the notebook's working directory.
# Parquet file with the video metadata and the domains of each video.
PATH_METADATAS_DOMAINS_SRC = "data/yt_metadata_en_domains.parquet"
# CSV file with the classification (sponsor flag, category) of the domains.
PATH_DOMAIN_CLASSIFICATION = "data/domains_classification.csv"

In [3]:
# Video metadata (one row per video, with its list of domains) as a Spark DataFrame.
metadatas_domains = spark.read.parquet(PATH_METADATAS_DOMAINS_SRC)
# Domain classification (domain, is_sponsored, domain_category, ...) as a pandas DataFrame.
domain_class = pd.read_csv(PATH_DOMAIN_CLASSIFICATION)

                                                                                

In [4]:
# Explode the "domains" array so that each row holds a single domain
# (a video with k domains becomes k rows).
# NOTE(review): explode produces no row for a null or empty array, so videos
# without any domain would disappear here — confirm the input has none, or
# that dropping them is intended.
metadatas_domains = metadatas_domains.withColumn("domains", F.explode(F.col("domains")))

In [5]:
# Add the column "is_sponsored": 1 when the row's domain is classified as a
# sponsor in domain_class, 0 otherwise.
sponsor_domains = domain_class.loc[domain_class["is_sponsored"] == 1, "domain"].tolist()
is_sponsor_domain = F.col("domains").isin(sponsor_domains)

metadatas_domains = metadatas_domains.withColumn(
    "is_sponsored",
    F.when(is_sponsor_domain, 1).otherwise(0),
)

In [6]:
# Preview the first rows of the domain classification table.
domain_class.head()

Unnamed: 0,domain,count,median_sponsor_score,is_sponsored,domain_category
0,plus.google.com,3786647,3.0,0,Social Media
1,play.google.com,2093786,4.0,1,Application
2,itunes.apple.com,1724603,3.0,0,Music
3,bit.ly,1408414,3.0,0,Shorten
4,tinyurl.com,1380122,4.0,0,Shorten


In [7]:
# Extract the list of distinct domain categories.
# Note: unique() keeps missing values, so the result also contains NaN for
# domains that have no category.
list_cat = domain_class["domain_category"].unique()
list_cat

array(['Social Media', 'Application', 'Music', 'Shorten', 'Agency',
       'Video Game', 'Education', nan, 'Sharing Service', 'Technology',
       'OVP', 'Fashion', 'Media', 'Shop', 'Sport', 'Streaming',
       'Donation', 'Vehicle', 'Book', 'Food Drink', 'Beauty', 'Finance',
       'Real Estate', 'Entertainment', 'Photo', 'Service', 'YCW',
       'Dating', 'Science', 'Movie', 'Radio', 'Forum', 'Crypto',
       'Religion', 'Travel', 'Medical', 'Government', 'TV', 'OVM'],
      dtype=object)

In [8]:
# Chained when (https://stackoverflow.com/questions/72865766/chain-several-when-conditions-in-a-scalable-way-in-pyspark)
# to add the category of each sponsor to our df.
# Each branch maps the sponsored domains of one category to that category's
# name; a domain matched by no branch gets "Not_Sponsor".
# Sponsored domains without a category also end up as "Not_Sponsor", because
# a missing category never produces a branch.

# Filter the sponsored domains once instead of once per category.
sponsored_domains = domain_class[domain_class["is_sponsored"] == 1]

whens = None
for cat in list_cat:
    # list_cat contains NaN (domains without a category): it can never match a
    # domain and would put a float literal in an otherwise string-valued CASE.
    if pd.isna(cat):
        continue
    cat_domains = sponsored_domains.loc[
        sponsored_domains["domain_category"] == cat, "domain"
    ].tolist()
    # A category without any sponsored domain would only add a dead branch.
    if not cat_domains:
        continue
    # The first branch starts from F.when, the following ones chain on the column.
    branch_source = F if whens is None else whens
    whens = branch_source.when(F.col("domains").isin(cat_domains), cat)

# With no branch at all, every row is simply "Not_Sponsor".
whens = F.lit("Not_Sponsor") if whens is None else whens.otherwise("Not_Sponsor")

metadatas_domains = metadatas_domains.withColumn("sponsor_category", whens)

In [9]:
# Inspect the columns of the exploded DataFrame. Because of the explode, the
# video-level columns hold the same values on every row of a given video;
# only "domains" and the columns derived from it vary per row.
metadatas_domains.schema.names

['categories',
 'channel_id',
 'dislike_count',
 'display_id',
 'duration',
 'like_count',
 'tags',
 'title',
 'upload_date',
 'view_count',
 'domains',
 'domains_count',
 'has_domains',
 'is_sponsored',
 'sponsor_category']

In [10]:
# Reverse the earlier explode: collapse the per-domain rows back into one row
# per video, collecting the domains and their sponsor categories into lists.
#
# "is_sponsored" was computed per domain, so it must NOT be a grouping key:
# grouping on it splits a video that mentions both sponsor and non-sponsor
# domains into two rows. It is aggregated with max instead, so the video-level
# flag is 1 as soon as at least one of its domains is a sponsor.
per_domain_cols = {"domains", "is_sponsored", "sponsor_category"}
video_cols = [c for c in metadatas_domains.columns if c not in per_domain_cols]

# Aggregation order keeps the resulting column order unchanged:
# video-level columns, then is_sponsored, domains, sponsor_category.
metadatas_sponsors = metadatas_domains.groupby(video_cols).agg(
    F.max("is_sponsored").alias("is_sponsored"),
    F.collect_list("domains").alias("domains"),
    F.collect_list("sponsor_category").alias("sponsor_category"),
)


In [11]:
# Verify the column names of the regrouped DataFrame.
metadatas_sponsors.schema.names

['categories',
 'channel_id',
 'dislike_count',
 'display_id',
 'duration',
 'like_count',
 'tags',
 'title',
 'upload_date',
 'view_count',
 'domains_count',
 'has_domains',
 'is_sponsored',
 'domains',
 'sponsor_category']

In [None]:
# Export the regrouped DataFrame as parquet.
# NOTE(review): no save mode is set, so Spark's default applies — presumably
# this raises if the target path already exists; confirm before re-running.
metadatas_sponsors.write.parquet("data/metadata_sponsors.parquet")

