### Sponsors Regression analysis

Regression analysis to better understand the effect of sponsorship on the popularity of a video.

In [1]:
import math
import re

import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats
from statsmodels.stats import diagnostic

import pyspark as ps
import pyspark.sql.functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, explode
from pyspark.sql.types import FloatType

# Local Spark setup for the notebook.
# NOTE(review): the heartbeat interval is only one second below the network
# timeout; Spark's configuration docs recommend keeping it significantly
# smaller. The values are left unchanged here.
config = ps.SparkConf().setAll([
    ('spark.network.timeout', '3601s'),
    ('spark.executor.heartbeatInterval', '3600s'),
]).setMaster('local')

# getOrCreate() returns the already-running context when this cell is executed
# again; constructing a second SparkContext in the same kernel raises an error.
sc = ps.SparkContext.getOrCreate(conf=config)
spark = SparkSession(sc)

22/12/16 11:45:38 WARN Utils: Your hostname, LAPTOP-8QFB5E0N resolves to a loopback address: 127.0.1.1; using 172.28.70.21 instead (on interface eth0)
22/12/16 11:45:38 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/12/16 11:45:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Input parquet holding the per-video sponsorship flag (loaded as `sponsor_df`).
PATH_METADATA_SPONSORS_DOMAINS = 'data/yt_metadata_sponsor_en_domains.parquet'
# Input parquet holding the video metadata and extracted domains (loaded as `metadat_dt`).
PATH_METADATAS_DOMAINS_SRC = 'data/yt_metadata_en_domains.parquet'
# Parquet location where the regression columns (`df_reg`) are written and later read back.
PATH_ANALYSIS_DF = 'data/generated/sponsor_regression_df.parquet'

In [3]:
# Load the two parquet inputs as Spark DataFrames.
sponsor_df = spark.read.parquet(PATH_METADATA_SPONSORS_DOMAINS)

# The metadata frame stays reachable under both of its names.
metadatas_domains = spark.read.parquet(PATH_METADATAS_DOMAINS_SRC)
metadat_dt = metadatas_domains

                                                                                

In [11]:
# Show the column names and types of the video metadata frame.
metadat_dt.printSchema()

root
 |-- categories: string (nullable = true)
 |-- channel_id: string (nullable = true)
 |-- dislike_count: integer (nullable = true)
 |-- display_id: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- like_count: integer (nullable = true)
 |-- tags: string (nullable = true)
 |-- title: string (nullable = true)
 |-- upload_date: date (nullable = true)
 |-- view_count: long (nullable = true)
 |-- domains: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- domains_count: integer (nullable = true)
 |-- has_domains: string (nullable = true)



In [12]:
# Show the column names and types of the sponsorship frame.
sponsor_df.printSchema()

root
 |-- display_id: string (nullable = true)
 |-- domains: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- domain_categories: string (nullable = true)
 |-- is_sponsored: boolean (nullable = true)



In [4]:
# Inner join: keep only the videos found in both frames, matched on the video
# id together with its list of domains.
joined_df = sponsor_df.join(
    metadat_dt,
    on=["display_id", "domains"],
    how="inner",
)

In [5]:
# Check that the joined frame carries the columns of both inputs.
joined_df.printSchema()

root
 |-- display_id: string (nullable = true)
 |-- domains: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- domain_categories: string (nullable = true)
 |-- is_sponsored: boolean (nullable = true)
 |-- categories: string (nullable = true)
 |-- channel_id: string (nullable = true)
 |-- dislike_count: integer (nullable = true)
 |-- duration: integer (nullable = true)
 |-- like_count: integer (nullable = true)
 |-- tags: string (nullable = true)
 |-- title: string (nullable = true)
 |-- upload_date: date (nullable = true)
 |-- view_count: long (nullable = true)
 |-- domains_count: integer (nullable = true)
 |-- has_domains: string (nullable = true)



In [5]:
# Missing like/dislike counts are replaced by zero.
joined_df = joined_df.fillna(0, subset=['dislike_count', 'like_count'])

In [6]:
# The metric used to assess the "popularity" of a video: likes per view.
joined_df = joined_df.withColumn(
    'like_per_view',
    F.col('like_count') / F.col('view_count'),
)

In [7]:
# Dislikes per view; null ratios are replaced by 0.
joined_df = (
    joined_df
    .withColumn('dislike_per_view', F.col('dislike_count') / F.col('view_count'))
    .fillna(0, subset=['dislike_per_view'])
)

In [12]:
# Keep only the regressor and the response used by the model.
df_reg = joined_df.select('is_sponsored', 'like_per_view')

                                                                                

In [None]:
# Write the columns of `df_reg` to PATH_ANALYSIS_DF as parquet.
# mode('overwrite') makes the cell safe to re-run: the default save mode
# raises an error when the target path already exists.
df_reg.write.mode('overwrite').parquet(PATH_ANALYSIS_DF)

In [13]:
import pyarrow.parquet as pq
import statsmodels.formula.api as smf

class DataSet(dict):
    """Mapping that serves the columns of a parquet dataset one at a time.

    A lookup first checks the columns stored on the mapping itself
    (``ds[name] = series``) and falls back to reading that single column from
    the parquet dataset, so the whole table never has to be loaded at once.

    Parameters
    ----------
    path : str
        Location of the parquet dataset, passed to ``pq.ParquetDataset``.
    """

    def __init__(self, path):
        super().__init__()
        self.parquet = pq.ParquetDataset(path)

    def __getitem__(self, key):
        """Return column ``key``.

        Raises
        ------
        KeyError
            If ``key`` was not assigned on the mapping and cannot be read from
            the parquet dataset.
        """
        # Columns assigned by the user take precedence; without this check an
        # assignment such as ds[name] = ds[name].astype(int) is silently
        # ignored because every lookup goes back to the file.
        if dict.__contains__(self, key):
            return super().__getitem__(key)
        try:
            return self.parquet.read([key]).to_pandas()[key]
        except Exception as exc:
            # Report any read failure as a missing key, naming the key and
            # keeping the underlying error as the cause. Unlike a bare
            # `except:`, this lets KeyboardInterrupt/SystemExit propagate.
            raise KeyError(key) from exc

# Column-on-demand view of the parquet dataset stored at PATH_ANALYSIS_DF.
pd_df_reg = DataSet(PATH_ANALYSIS_DF)

In [14]:
# Store an integer copy of the sponsorship flag under the same key.
pd_df_reg["is_sponsored"] = pd_df_reg["is_sponsored"].astype(int)

In [15]:
# Ordinary least squares of likes per view on the sponsorship flag, with the
# flag entered as a categorical regressor.
np.random.seed(2)
mod = smf.ols(formula='like_per_view ~ C(is_sponsored)', data=pd_df_reg)
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:          like_per_view   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     8377.
Date:                Fri, 16 Dec 2022   Prob (F-statistic):               0.00
Time:                        12:06:53   Log-Likelihood:            -1.5027e+06
No. Observations:            35516145   AIC:                         3.005e+06
Df Residuals:                35516143   BIC:                         3.005e+06
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Intercept                 

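We observe that the coefficients are statistically significant (p-value < 0.05). Over the entire dataset, the fitted model is on average: $\text{LikePerView} = 0.0212 + 0.0096 \cdot \text{IsSponsored}$. Note, however, that $R^2 = 0.000$: sponsorship alone explains almost none of the variance in likes per view.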


Under this simple model, a sponsored video therefore has on average $\frac{0.0096}{0.0212} \times 100 \approx 45\%$ more likes per view than a non-sponsored one.


Although this analysis suggests a fairly positive effect of sponsorship on videos, we need to keep in mind that we are only using a simple model and that many unidentified confounders could affect the "popularity" metric we use. As a simple example, we could compare the "popularity" of creators who tell their audience to "like, comment and subscribe" with that of creators who don't.

Another key point is a chicken-and-egg question: do sponsored videos tend to do better than unsponsored ones because the creator was already popular to begin with, or because sponsors have a significant effect? We will now try to measure the effect at the level of individual channels.

In [11]:
# The rest of the analysis works at the channel level, so group the rows by
# channel (identified by its id).
gb_channel = joined_df.groupby('channel_id')

In [18]:
# One Row per distinct channel id, collected to the driver.
list_channel = (
    joined_df
    .select('channel_id')
    .distinct()
    .collect()
)
len(list_channel)

                                                                                

117766

22/12/16 12:46:41 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 3659698 ms exceeds timeout 3601000 ms
22/12/16 12:46:41 WARN SparkContext: Killing executors is not supported by current scheduler.


In [None]:
# Fit the same one-regressor OLS separately on the data of each channel and
# keep the coefficient of the sponsorship indicator.
# NOTE(review): every iteration runs its own Spark job over `joined_df`, so
# this cell is slow when there are many channels.
np.random.seed(2)  # seeded once up front; the OLS fit itself draws no random numbers
list_coeff = []
for channel in list_channel:
    # `list_channel` holds Row objects, so the id is unpacked before comparing.
    channel_id = channel.channel_id
    reg_channel = joined_df.where(joined_df.channel_id == channel_id) \
                           .select('is_sponsored', 'like_per_view')

    # Rows with a missing value cannot take part in the fit.
    pd_reg_channel = reg_channel.toPandas().dropna()
    pd_reg_channel["is_sponsored"] = pd_reg_channel["is_sponsored"].astype(int)

    # A sponsorship coefficient only exists when the channel has both
    # sponsored and non-sponsored videos; otherwise the model reduces to an
    # intercept and there is no second parameter to read. Such channels are
    # skipped.
    if pd_reg_channel["is_sponsored"].nunique() < 2:
        continue

    mod = smf.ols('like_per_view ~ C(is_sponsored)', data=pd_reg_channel)
    res = mod.fit()

    # Position 0 is the intercept, position 1 the sponsorship term.
    list_coeff.append(res.params.iloc[1])

In [None]:
# Histogram of the per-channel sponsorship coefficients.
# 'ggplot' is the style sheet bundled with matplotlib; 'ggplot2' is not a
# registered style name and makes plt.style.use() fail.
plt.style.use('ggplot')

fig, ax = plt.subplots()
ax.hist(list_coeff)
ax.set_title("Per-channel coefficient of 'is_sponsored'")
ax.set_xlabel("Value of the coefficient of 'is_sponsored'")
ax.set_ylabel("Counts")
plt.show()

Interpretation of the histogram: the distribution of the coefficients is heavy-tailed.