In [1]:
import pyspark
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import SparkSession

spark = (SparkSession.builder.master("local") 
    .config('spark.sql.autoBroadcastJoinThreshold', -1)  # отключим авто бродкаст
    .config('spark.sql.adaptive.enabled', 'false')
    .getOrCreate())

In [5]:
spark

In [2]:
videos = spark.read.option('header', 'true').option("inferSchema", "true").csv('../datasets/USvideos.csv')
videos.show(5)

+-----------+--------------------+----------------+-----------+--------------------+-------+------+--------+-------------+--------------------+-----+
|   video_id|               title|   channel_title|category_id|                tags|  views| likes|dislikes|comment_total|      thumbnail_link| date|
+-----------+--------------------+----------------+-----------+--------------------+-------+------+--------+-------------+--------------------+-----+
|XpVt6Z1Gjjo|1 YEAR OF VLOGGIN...|Logan Paul Vlogs|         24|logan paul vlog|l...|4394029|320053|    5931|        46245|https://i.ytimg.c...|13.09|
|K4wEI5zhHB0|iPhone X — Introd...|           Apple|         28|Apple|iPhone 10|i...|7860119|185853|   26679|            0|https://i.ytimg.c...|13.09|
|cLdxuaxaQwc|         My Response|       PewDiePie|         22|              [none]|5845909|576597|   39774|       170708|https://i.ytimg.c...|13.09|
|WYYvHb03Eog|Apple iPhone X fi...|       The Verge|         28|apple iphone x ha...|2642103| 24975| 

In [3]:
comments_schema = StructType([ \
    StructField("video_id", StringType(), True), \
    StructField("comment_text", StringType(), True), \
    StructField("likes", IntegerType(), True), \
    StructField("replies", IntegerType(), True)])
comments = spark.read.option('header', 'true').option("mode", "DROPMALFORMED").schema(comments_schema).csv('../datasets/UScomments.csv')
comments.show(5)

+-----------+--------------------+-----+-------+
|   video_id|        comment_text|likes|replies|
+-----------+--------------------+-----+-------+
|XpVt6Z1Gjjo|Logan Paul it's y...|    4|      0|
|XpVt6Z1Gjjo|I've been followi...|    3|      0|
|XpVt6Z1Gjjo|Say hi to Kong an...|    3|      0|
|XpVt6Z1Gjjo| MY FAN . attendance|    3|      0|
|XpVt6Z1Gjjo|         trending 😉|    3|      0|
+-----------+--------------------+-----+-------+
only showing top 5 rows



## Task 1

In [4]:
prepared_comm = comments.groupBy(col('video_id')).agg(sum(col('likes')).alias('sum_comm_likes'), sum(col('replies')).alias('sum_comm_replies'))

In [12]:
prepared_comm.show()

+-----------+--------------+----------------+
|   video_id|sum_comm_likes|sum_comm_replies|
+-----------+--------------+----------------+
|xPS7bqBePSs|          1037|              28|
|dInwVhRtN4E|            63|              13|
|rn5Xgak1zzA|            14|               7|
|TzyraAp3jaY|          1126|              48|
|eHq6ZA6uKOg|           797|             138|
|_r5eTelhpmQ|           749|             123|
|JkqTeQHFoBY|           558|              98|
|Bo-qp-Zu0OY|            71|               8|
|K7pQsR8WFSo|           104|              56|
|g_ekn1gjBq0|            47|              22|
|4yCkkOvIkUI|            36|               4|
|7TN09IP5JuI|            34|               0|
|RE-far-FvRs|            40|               4|
|WQjO1mMCPg4|            61|               5|
|aRgTLb5EbiQ|           735|              29|
|xNddRhpx5tA|            85|               9|
|tUXLO8Dtvq4|            26|               2|
|a7Sf_H2cFdM|            35|              11|
|mGqR9sgMIyA|            79|      

In [14]:
# Бакетинг
prepared_comm.write.bucketBy(16, 'video_id').saveAsTable('comm_bucket', format='csv', mode='overwrite')
videos.write.bucketBy(16, 'video_id').saveAsTable('videos_bucket', format='csv', mode='overwrite')

In [30]:
%time
scored_videos_bucket = (
    spark.read.table('videos_bucket').alias('v')
    .join(spark.read.table('comm_bucket').alias('c'), ['video_id'])
    .withColumn('score', (col('views') + col('likes') + col('comment_total') + col('sum_comm_likes') + col('sum_comm_replies') - col('dislikes')) / 100000)
)

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 11.2 µs


In [27]:
# без бакетинга
%time
scored_videos = (
    videos.alias('v')
    .join(prepared_comm.alias('c'), ['video_id'])
    .withColumn('score', (col('views') + col('likes') + col('comment_total') + col('sum_comm_likes') + col('sum_comm_replies') - col('dislikes')) / 100000)
)

CPU times: user 6 µs, sys: 2 µs, total: 8 µs
Wall time: 17.2 µs


На таком маленьком датасете прирост не сильно заметен

In [11]:
scored_videos.limit(5).toPandas()

Unnamed: 0,video_id,title,channel_title,category_id,tags,views,likes,dislikes,comment_total,thumbnail_link,date,sum_comm_likes,sum_comm_replies,score
0,4yCkkOvIkUI,EXCLUSIVE: Zonnique - Patience [Teaser],YBF Chic,24,[none],2306,7,1,0,https://i.ytimg.com/vi/4yCkkOvIkUI/default.jpg,4.1,36,4,0.02352
1,4yCkkOvIkUI,EXCLUSIVE: Zonnique - Patience [Teaser],YBF Chic,24,[none],4937,19,21,12,https://i.ytimg.com/vi/4yCkkOvIkUI/default.jpg,5.1,36,4,0.04987
2,4yCkkOvIkUI,EXCLUSIVE: Zonnique - Patience [Teaser],YBF Chic,24,[none],5662,33,21,13,https://i.ytimg.com/vi/4yCkkOvIkUI/default.jpg,6.1,36,4,0.05727
3,7TN09IP5JuI,Terry Crews Hallucinates While Eating Spicy Wi...,First We Feast,26,First we feast|fwf|firstwefeast|food|food porn...,2513012,130944,1101,19133,https://i.ytimg.com/vi/7TN09IP5JuI/default.jpg,6.1,34,0,26.62022
4,7TN09IP5JuI,Terry Crews Hallucinates While Eating Spicy Wi...,First We Feast,26,First we feast|fwf|firstwefeast|food|food porn...,3866789,168343,1750,24438,https://i.ytimg.com/vi/7TN09IP5JuI/default.jpg,7.1,34,0,40.57854


## Task 2

In [18]:
import json
import pandas as pd
from pyspark.sql.types import StringType

In [27]:
with open("../datasets/US_category_id.json", "r") as write_file:
    cat = json.load(write_file)

items = cat.get('items')

category_dict = {}
for i in items:
    category_dict[i.get('id')] = i.get('snippet').get('title')
    
data_list = set((k, v) for k, v in category_dict.items())

In [30]:
cat

{'kind': 'youtube#videoCategoryListResponse',
 'etag': '"m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJvJAAShlR6hM"',
 'items': [{'kind': 'youtube#videoCategory',
   'etag': '"m2yskBQFythfE4irbTIeOgYYfBU/Xy1mB4_yLrHy_BmKmPBggty2mZQ"',
   'id': '1',
   'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ',
    'title': 'Film & Animation',
    'assignable': True}},
  {'kind': 'youtube#videoCategory',
   'etag': '"m2yskBQFythfE4irbTIeOgYYfBU/UZ1oLIIz2dxIhO45ZTFR3a3NyTA"',
   'id': '2',
   'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ',
    'title': 'Autos & Vehicles',
    'assignable': True}},
  {'kind': 'youtube#videoCategory',
   'etag': '"m2yskBQFythfE4irbTIeOgYYfBU/nqRIq97-xe5XRZTxbknKFVe5Lmg"',
   'id': '10',
   'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ',
    'title': 'Music',
    'assignable': True}},
  {'kind': 'youtube#videoCategory',
   'etag': '"m2yskBQFythfE4irbTIeOgYYfBU/HwXKamM1Q20q9BN-oBJavSGkfDI"',
   'id': '15',
   'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdnt

In [28]:
data_list

{('1', 'Film & Animation'),
 ('10', 'Music'),
 ('15', 'Pets & Animals'),
 ('17', 'Sports'),
 ('18', 'Short Movies'),
 ('19', 'Travel & Events'),
 ('2', 'Autos & Vehicles'),
 ('20', 'Gaming'),
 ('21', 'Videoblogging'),
 ('22', 'People & Blogs'),
 ('23', 'Comedy'),
 ('24', 'Entertainment'),
 ('25', 'News & Politics'),
 ('26', 'Howto & Style'),
 ('27', 'Education'),
 ('28', 'Science & Technology'),
 ('29', 'Nonprofits & Activism'),
 ('30', 'Movies'),
 ('31', 'Anime/Animation'),
 ('32', 'Action/Adventure'),
 ('33', 'Classics'),
 ('34', 'Comedy'),
 ('35', 'Documentary'),
 ('36', 'Drama'),
 ('37', 'Family'),
 ('38', 'Foreign'),
 ('39', 'Horror'),
 ('40', 'Sci-Fi/Fantasy'),
 ('41', 'Thriller'),
 ('42', 'Shorts'),
 ('43', 'Shows'),
 ('44', 'Trailers')}

In [8]:
category_df = spark.createDataFrame(data_list, ['category_id', 'title_name'])

In [9]:
category_df.limit(5).toPandas()

Unnamed: 0,category_id,title_name
0,1,Film & Animation
1,2,Autos & Vehicles
2,10,Music
3,15,Pets & Animals
4,17,Sports


In [16]:
from pyspark.sql.functions import pandas_udf, udf, PandasUDFType
import numpy as np

In [17]:
# датасет маленький, поэтому лучше использовать бродкаст джоин
categories_score = scored_videos.join(category_df.hint('broadcast'), ['category_id'])

In [18]:
@pandas_udf(FloatType(), PandasUDFType.GROUPED_AGG)
def median__pandas_udf(scores):
    median = np.median(scores)
    return median



In [19]:
%time
medians_df = (
    categories_score
    .groupBy(col('title_name'))
    .agg(median__pandas_udf(col('score')))
)

CPU times: user 4 µs, sys: 2 µs, total: 6 µs
Wall time: 47.4 µs


In [20]:
medians_df.limit(3).toPandas()

Unnamed: 0,title_name,median__pandas_udf(score)
0,Shows,0.086335
1,Education,2.526625
2,Gaming,2.779255


## Task 3

### Тест обычной udf

In [24]:
@udf('array<string>')
def split_tags_udf(tags):
    return tags.split('|')

In [25]:
%time
popular_tags = (
    scored_videos
    .withColumn('tags', explode(split_tags_udf('tags')))
    .groupBy('tags').agg(count('video_id').alias('cnt_tags'))
)

CPU times: user 8 µs, sys: 0 ns, total: 8 µs
Wall time: 43.9 µs


In [26]:
popular_tags.orderBy(desc('cnt_tags')).limit(3).toPandas()

Unnamed: 0,tags,cnt_tags
0,funny,722
1,comedy,572
2,[none],491


### Тест pandas_udf

In [44]:
@pandas_udf('array<string>', PandasUDFType.SCALAR)
def split_tags_pandas_udf(tags):
    return tags.str.split('|')



In [28]:
%time
popular_tags = (
    scored_videos
    .withColumn('tags', explode(split_tags_pandas_udf('tags')))
    .groupBy('tags').agg(count('video_id').alias('cnt_tags'))
)

CPU times: user 18 µs, sys: 4 µs, total: 22 µs
Wall time: 38.4 µs


### Тест scala udf

In [29]:
from pyspark.sql.column import Column
from pyspark.sql.column import _to_java_column 
from pyspark.sql.column import _to_seq

sc = spark.sparkContext

def udf_split_tags_scala_wrapper(tags):
    split_tags_scala = sc._jvm.CustomUDFs.splitTagsUDF()
    return Column(split_tags_scala.apply(_to_seq(sc, [tags], _to_java_column)))

In [30]:
%time
popular_tags = (
    scored_videos
    .withColumn('tags', explode(udf_split_tags_scala_wrapper('tags')))
    .groupBy('tags').agg(count('video_id').alias('cnt_tags'))
)

CPU times: user 7 µs, sys: 0 ns, total: 7 µs
Wall time: 13.6 µs


In [32]:
popular_tags.orderBy(desc('cnt_tags')).limit(3).toPandas()

Unnamed: 0,tags,cnt_tags
0,funny,722
1,comedy,572
2,[none],491


### Вывод
Очевидно самая быстрая реализация udf в Scala, а самая медленная у обычной питоновской udf

## Task 4

In [84]:
cats_df = videos.withColumn('tags', explode(split_tags_pandas_udf('tags'))).filter(col('tags') == 'cat')

Так как мы уменьшили размер выборки, можно попробовать использовать бродкаст джоин

In [110]:
%time
cats_comments = (
    cats_df
    .join(comments.hint('broadcast').alias('c'), ['video_id'])
    .select(col('title'), col('comment_text'), col('c.likes'), )
    .distinct()
    .orderBy(desc('likes'))
)

CPU times: user 6 µs, sys: 2 µs, total: 8 µs
Wall time: 15 µs


И проверим без бродкаста 

In [112]:
%time
cats_comments = (
    cats_df
    .join(comments.alias('c'), ['video_id'])
    .select(col('title'), col('comment_text'), col('c.likes'), )
    .distinct()
    .orderBy(desc('likes'))
)

CPU times: user 10 µs, sys: 0 ns, total: 10 µs
Wall time: 15.7 µs


Прирост на уровне погрешности

In [113]:
cats_comments.limit(5).toPandas()

Unnamed: 0,title,comment_text,likes
0,Cat vs Dog - Best Support Class,The second I read this title in my notificatio...,2355
1,Cat vs Dog - Best Support Class,talk about the ocean sunfish build,1070
2,Cat vs Dog - Best Support Class,talk about the ocean sunfish build,1021
3,Cat vs Dog - Best Support Class,talk about the ocean sunfish build,957
4,9 Things You Need To Know About Kittens - Simo...,I make interesting cartoons and I need your he...,839
