In [1]:
import findspark
findspark.init()

import re
from pyspark.sql.functions import col, udf, explode, when
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, LongType, StringType, DateType, ArrayType, BooleanType

from pyspark.sql import SparkSession
import pyspark as ps
# Spark settings for this notebook. The network timeout is kept one second
# above the executor heartbeat interval.
# NOTE(review): the very large values presumably exist to stop long-running
# local jobs from being timed out -- confirm they are still needed.
config = ps.SparkConf().setAll([
    ('spark.network.timeout', '3601s'),
    ('spark.executor.heartbeatInterval', '3600s'),
])

# Use the builder's getOrCreate() instead of constructing SparkContext directly:
# building a second SparkContext in a live kernel raises
# "Cannot run multiple SparkContexts at once", so the original cell could not be
# re-run. getOrCreate() returns the existing session when one is already active.
spark = (
    SparkSession.builder
    .master('local[*]')
    .config(conf=config)
    .getOrCreate()
)
# Keep the `sc` name available for any code that uses the SparkContext directly.
sc = spark.sparkContext

In [3]:
from urllib.parse import urlparse
import pyspark.sql.functions as f

In [4]:
# Location of the parquet file holding the video metadata with extracted URLs.
FILE = "../data/generated/yt_metadata_en_urls.parquet"

# Column layout described as a StructType; every column is declared nullable.
# The three count columns are declared as doubles because they are represented
# as floating point numbers.
schema = (
    StructType()
    .add("categories", StringType(), True)
    .add("channel_id", StringType(), True)
    .add("crawl_date", DateType(), True)
    .add("dislike_count", DoubleType(), True)  # floating point representation
    .add("display_id", StringType(), True)
    .add("duration", IntegerType(), True)
    .add("like_count", DoubleType(), True)  # floating point representation
    .add("tags", StringType(), True)
    .add("title", StringType(), True)
    .add("upload_date", DateType(), True)
    .add("view_count", DoubleType(), True)  # floating point representation
    .add("urls", ArrayType(StringType()), True)
    .add("urls_count", IntegerType(), True)
    .add("has_urls", BooleanType(), True)
)


In [5]:
# Parquet files are self-describing, and DataFrameReader.parquet() only looks at
# a fixed set of keyword options, so the former `schema=schema` keyword was
# silently ignored. It is dropped here to avoid suggesting the schema is enforced.
# NOTE(review): if enforcement is wanted, use spark.read.schema(schema).parquet(FILE)
# after confirming the declared types match the types stored in the file.
df = spark.read.parquet(FILE)

# Cast dislike_count, like_count and view_count to their respective integer types.
# The chain is parenthesised rather than using backslash continuations (the
# original ended with a dangling backslash before a whitespace-only line).
df = (
    df
    .withColumn("dislike_count", df.dislike_count.cast(IntegerType()))
    .withColumn("like_count", df.like_count.cast(IntegerType()))
    .withColumn("view_count", df.view_count.cast(LongType()))
)

# Keep only the rows flagged as having URLs.
df_urls = df.where(df["has_urls"] == "true")

In [6]:
# Group by the `urls` column. No aggregation is applied, so the cell only
# displays the resulting GroupedData handle.
df_urls.groupBy("urls")

<pyspark.sql.group.GroupedData at 0x1cd42a5a160>

In [8]:
def isolate_name_site(list_site):
    """Return the network location (host part) of every URL in ``list_site``.

    Parameters
    ----------
    list_site : iterable of str, or None
        URLs to process. ``None`` (a null array) is treated as an empty list,
        and ``None`` entries inside the list are skipped.

    Returns
    -------
    list of str
        One ``netloc`` per kept URL, in input order. URLs whose netloc is
        ``'spoti.fi'`` or ``'goo.gl'`` are dropped. A URL for which ``urlparse``
        finds no network location (for example one without a scheme) yields an
        empty string, which is kept.
    """
    # Hosts removed from the result.
    # NOTE(review): presumably excluded because they are link shorteners that
    # hide the real destination -- confirm.
    excluded_hosts = ('spoti.fi', 'goo.gl')

    # A null array reaches a Python UDF as None; iterating it would raise.
    if list_site is None:
        return []

    hosts = (urlparse(site).netloc for site in list_site if site is not None)
    return [host for host in hosts if host not in excluded_hosts]
# Wrap isolate_name_site as a Spark UDF that returns an array of strings.
my_udf = f.udf(isolate_name_site, returnType=ArrayType(StringType()))

In [9]:
# Add `urls_shorten`: the result of applying my_udf to each row's `urls` array.
df_urls = df_urls.withColumn("urls_shorten", my_udf(df_urls["urls"]))

In [10]:
# One output row per element of `urls_shorten`, exposed as the `list_site` column.
df_long = df_urls.withColumn("list_site", explode(col("urls_shorten")))


In [11]:
# Count rows per site, keep sites with a count of at least 10000, and order
# them from most to least frequent.
list_site_frequency = (
    df_long
    .groupBy("list_site")
    .count()
    .filter("`count` >= 10000")
    .orderBy(col("count").desc())
)
list_site_frequency.show()