# What are the most popular shows by language?

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType
from pyspark.sql.functions import desc, asc, col, avg, round, sum, max, min, mean, count, filter, isnan, when, regexp_replace

# Start (or reuse) the Spark session that the rest of the notebook queries through.
spark = (
    SparkSession.builder
    .appName("local_spark")
    .getOrCreate()
)

Reads the TV shows JSON file into a Spark DataFrame

In [2]:
# Load the TV show records; no schema is supplied, so Spark infers it from the JSON.
tv_shows_path = 'tv_shows.json'
df = spark.read.json(tv_shows_path)

Prints Schema

In [3]:
# Print the column tree (names, types, nullability) of the loaded DataFrame.
df.printSchema()

root
 |-- _links: struct (nullable = true)
 |    |-- nextepisode: struct (nullable = true)
 |    |    |-- href: string (nullable = true)
 |    |-- previousepisode: struct (nullable = true)
 |    |    |-- href: string (nullable = true)
 |    |-- self: struct (nullable = true)
 |    |    |-- href: string (nullable = true)
 |-- averageRuntime: long (nullable = true)
 |-- dvdCountry: struct (nullable = true)
 |    |-- code: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- timezone: string (nullable = true)
 |-- ended: string (nullable = true)
 |-- externals: struct (nullable = true)
 |    |-- imdb: string (nullable = true)
 |    |-- thetvdb: long (nullable = true)
 |    |-- tvrage: long (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- id: long (nullable = true)
 |-- image: struct (nullable = true)
 |    |-- medium: string (nullable = true)
 |    |-- original: string (nullable = true)
 |-- language: string

Prints datatypes

In [4]:
# List (column name, type string) pairs; nested columns appear as struct<...> / array<...>.
df.dtypes

[('_links',
  'struct<nextepisode:struct<href:string>,previousepisode:struct<href:string>,self:struct<href:string>>'),
 ('averageRuntime', 'bigint'),
 ('dvdCountry', 'struct<code:string,name:string,timezone:string>'),
 ('ended', 'string'),
 ('externals', 'struct<imdb:string,thetvdb:bigint,tvrage:bigint>'),
 ('genres', 'array<string>'),
 ('id', 'bigint'),
 ('image', 'struct<medium:string,original:string>'),
 ('language', 'string'),
 ('name', 'string'),
 ('network',
  'struct<country:struct<code:string,name:string,timezone:string>,id:bigint,name:string>'),
 ('officialSite', 'string'),
 ('premiered', 'string'),
 ('rating', 'struct<average:double>'),
 ('runtime', 'bigint'),
 ('schedule', 'struct<days:array<string>,time:string>'),
 ('status', 'string'),
 ('summary', 'string'),
 ('type', 'string'),
 ('updated', 'bigint'),
 ('url', 'string'),
 ('webChannel',
  'struct<country:struct<code:string,name:string,timezone:string>,id:bigint,name:string>'),
 ('weight', 'bigint')]

Identifies number of nulls per column

In [5]:
# Build one aggregate per column: `when` produces a value only for null cells and
# `count` ignores nulls, so each aggregate is that column's number of nulls.
null_count_exprs = []
for column_name in df.columns:
    is_missing = col(column_name).isNull()
    null_count_exprs.append(count(when(is_missing, column_name)).alias(column_name))

df.select(null_count_exprs).show()

+------+--------------+----------+-----+---------+------+---+-----+--------+----+-------+------------+---------+------+-------+--------+------+-------+----+-------+---+----------+------+
|_links|averageRuntime|dvdCountry|ended|externals|genres| id|image|language|name|network|officialSite|premiered|rating|runtime|schedule|status|summary|type|updated|url|webChannel|weight|
+------+--------------+----------+-----+---------+------+---+-----+--------+----+-------+------------+---------+------+-------+--------+------+-------+----+-------+---+----------+------+
|     0|          3549|     55328|14216|        0|     0|  0| 6757|     360|   0|   8083|       21204|     2522|     0|   4774|       0|     0|   6952|   0|      0|  0|     46872|     0|
+------+--------------+----------+-----+---------+------+---+-----+--------+----+-------+------------+---------+------+-------+--------+------+-------+----+-------+---+----------+------+



In [6]:
# Split the top-level columns by type string so their null counts can be shown separately.
# Compare with `==`: `t in ('bigint')` is a substring test against the plain string
# 'bigint' (parentheses alone do not make a tuple), so a type such as 'int' would match too.
numeric_cols = [name for name, dtype in df.dtypes if dtype == 'bigint']
string_cols = [name for name, dtype in df.dtypes if dtype == 'string']

# Same null-count aggregate for each group: one table for numeric columns, one for strings.
for group in (numeric_cols, string_cols):
    df.select([count(when(col(c).isNull(), c)).alias(c) for c in group]).show()

+--------------+---+-------+-------+------+
|averageRuntime| id|runtime|updated|weight|
+--------------+---+-------+-------+------+
|          3549|  0|   4774|      0|     0|
+--------------+---+-------+-------+------+

+-----+--------+----+------------+---------+------+-------+----+---+
|ended|language|name|officialSite|premiered|status|summary|type|url|
+-----+--------+----+------------+---------+------+-------+----+---+
|14216|     360|   0|       21204|     2522|     0|   6952|   0|  0|
+-----+--------+----+------------+---------+------+-------+----+---+



Learning how to use regexp_replace

In [7]:
# Scratch cell, intentionally disabled: strips a few HTML tags (<p>, <b>, <i>) from
# `summary` and un-escapes `&amp;`, then previews the result. It is kept only as a
# regexp_replace example; `summary` is dropped further down, so nothing depends on it.
# df = df.withColumn("summary",(regexp_replace("summary","<p>", "")))\
#     .withColumn("summary",(regexp_replace("summary","</p>", "")))\
#     .withColumn("summary",(regexp_replace("summary","<b>", "")))\
#     .withColumn("summary",(regexp_replace("summary","</b>", "")))\
#     .withColumn("summary",(regexp_replace("summary","<i>", "")))\
#     .withColumn("summary",(regexp_replace("summary","</i>", "")))\
#     .withColumn("summary",(regexp_replace("summary","&amp;", "&")))
    
# df.select('summary').show(20, truncate=False)

Enables me to see a snippet of data for each column

In [8]:
# Preview the first five values of every column, untruncated, one small table per column.
# (The former bare `df.columns` line was removed: mid-cell expressions are never displayed.)
for column_name in df.columns:
    df.select(column_name).show(5, truncate=False)

+-------------------------------------------------------------------------------------+
|_links                                                                               |
+-------------------------------------------------------------------------------------+
|{null, {https://api.tvmaze.com/episodes/1051658}, {https://api.tvmaze.com/shows/250}}|
|{null, {https://api.tvmaze.com/episodes/623237}, {https://api.tvmaze.com/shows/251}} |
|{null, {https://api.tvmaze.com/episodes/1011244}, {https://api.tvmaze.com/shows/252}}|
|{null, {https://api.tvmaze.com/episodes/2118484}, {https://api.tvmaze.com/shows/253}}|
|{null, {https://api.tvmaze.com/episodes/1684225}, {https://api.tvmaze.com/shows/254}}|
+-------------------------------------------------------------------------------------+
only showing top 5 rows

+--------------+
|averageRuntime|
+--------------+
|30            |
|71            |
|30            |
|60            |
|65            |
+--------------+
only showing top 5 rows

+----

Drops the columns I don't need

In [9]:
# Columns that play no part in the language / rating analysis below.
unused_columns = [
    '_links', 'weight', 'url', 'officialSite', 'externals', 'image',
    'summary', 'updated', 'webChannel', 'id', 'dvdCountry',
]
df = df.drop(*unused_columns)

Drops the rows where the rating is null

In [10]:
# Keep only shows that actually have a rating (the value lives in the nested `average` field).
rating_field = 'rating.average'
df = df.na.drop(how='any', subset=[rating_field])

# Sanity check: the number of rows still missing a rating should now be 0.
remaining_null_ratings = count(when(col(rating_field).isNull(), True))
df.select([remaining_null_ratings]).show()

+-------------------------------------------------------+
|count(CASE WHEN (rating.average IS NULL) THEN true END)|
+-------------------------------------------------------+
|                                                      0|
+-------------------------------------------------------+



Replaces the nested `rating` struct with its `average` field, so `rating` becomes a plain double column

In [11]:
# Overwrite the `rating` struct with its single `average` field.
# Note: this cell only works while `rating` is still a struct, so it cannot be re-run as-is.
average_rating_field = col('rating').getField('average')
df = df.withColumn('rating', average_rating_field)

Number of rated shows per language, in ascending order (least common languages first)

In [12]:
# `count('language')` counts non-null values only, which is why the null group reports 0.
shows_per_language = df.groupBy('language').agg(count('language'))
shows_per_language.orderBy('count(language)', ascending=True).show()

+-------------+---------------+
|     language|count(language)|
+-------------+---------------+
|         null|              0|
|   Vietnamese|              1|
|    Hungarian|              1|
|        Irish|              1|
|Luxembourgish|              1|
|      Catalan|              1|
|     Romanian|              1|
|      Serbian|              1|
|        Latin|              1|
|       Arabic|              2|
|      Tagalog|              2|
|    Ukrainian|              2|
|    Icelandic|              3|
|        Welsh|              3|
|      Finnish|              4|
|       Polish|              8|
|       Hebrew|              8|
|        Hindi|              9|
|   Portuguese|             11|
|      Italian|             12|
+-------------+---------------+
only showing top 20 rows



Average rating per language.

In [13]:
# `round` and `avg` here are the pyspark.sql.functions versions imported at the top,
# so they build column expressions rather than computing Python values.
mean_rating = round(avg('rating'), 1).alias('average rating')
rating_by_language = df.groupBy('language').agg(mean_rating)
rating_by_language.orderBy('average rating', ascending=False).show()

+-------------+--------------+
|     language|average rating|
+-------------+--------------+
|      Serbian|           8.8|
|    Ukrainian|           8.6|
|        Irish|           8.4|
|    Hungarian|           8.4|
|      Catalan|           8.3|
|     Romanian|           8.3|
|   Vietnamese|           8.0|
|Luxembourgish|           8.0|
|         Thai|           7.8|
|       Korean|           7.8|
|       Hebrew|           7.7|
|    Norwegian|           7.6|
|        Welsh|           7.6|
|       French|           7.5|
|       Danish|           7.5|
|      Chinese|           7.4|
|      Swedish|           7.4|
|      Finnish|           7.4|
|      Turkish|           7.3|
|     Japanese|           7.3|
+-------------+--------------+
only showing top 20 rows



Average rating per language, excluding languages that have too few rated shows for the mean to be meaningful.\
For example, Serbian has only one rated show, so its average reflects that single show rather than the language as a whole.

In [14]:
# Minimum number of rated shows a language needs before its mean is reported.
# This replaces a hand-typed list of 14 language names; with the per-language counts shown
# earlier in this notebook, 5 excludes exactly the same languages (those with 4 or fewer
# shows) and keeps working if the data changes.
MIN_RATED_SHOWS = 5

# `count('language')` ignores nulls, so the null-language group counts 0 and is filtered
# out, just as the old `language != ...` comparisons dropped it.
df.groupBy('language')\
    .agg(round(avg('rating'), 1).alias('average rating'), count('language').alias('n_rated'))\
    .filter(col('n_rated') >= MIN_RATED_SHOWS)\
    .orderBy('average rating', ascending=False)\
    .select('language', 'average rating')\
    .show()

+----------+--------------+
|  language|average rating|
+----------+--------------+
|      Thai|           7.8|
|    Korean|           7.8|
|    Hebrew|           7.7|
| Norwegian|           7.6|
|    Danish|           7.5|
|    French|           7.5|
|   Chinese|           7.4|
|   Swedish|           7.4|
|   Russian|           7.3|
|    German|           7.3|
|   Turkish|           7.3|
|  Japanese|           7.3|
|     Dutch|           7.3|
|   Italian|           7.2|
|    Polish|           7.2|
|   English|           7.1|
|   Spanish|           6.7|
|     Hindi|           6.5|
|Portuguese|           6.1|
+----------+--------------+



Top show for each language based on rating

In [15]:
# Highest rating per (show name, language) pair, best-rated first.
best_rating_per_show = df.groupBy('name', 'language').agg(max('rating'))
a = best_rating_per_show.orderBy(desc('max(rating)'))

In [16]:
# Every distinct language present in the ranked frame `a` (may include None).
lang = [row['language'] for row in a.select('language').distinct().collect()]

for language in lang:
    # A null language never matched the old string filter either, so skip it explicitly.
    if language is None:
        continue

    # Compare through a Column instead of interpolating into a SQL string, so a quote in
    # a language name cannot break the filter. `a` is already sorted by rating descending,
    # so `.first()` fetches only the top row rather than collecting every show for the language.
    top_show = a.select('name', 'language').filter(col('language') == language).first()
    if top_show is not None:
        print(f"Show: {top_show['name']}\nLanguage: {top_show['language']}\n")

Show: Aşk 101
Language: Turkish

Show: Romulus
Language: Latin

Show: Sorjonen
Language: Finnish

Show: En of Love: Love Mechanics
Language: Thai

Show: Ófærð
Language: Icelandic

Show: A Love So Beautiful
Language: Chinese

Show: #@)₴?$0 з Майклом Щуром
Language: Ukrainian

Show: Red Rock
Language: Irish

Show: Sexify
Language: Polish

Show: Gaya Sa Pelikula
Language: Tagalog

Show: Paradise Kiss
Language: Japanese

Show: Critical Role
Language: English

Show: El Internado
Language: Spanish

Show: You Are Ma Boy
Language: Vietnamese

Show: Hostages
Language: Hebrew

Show: Senke nad Balkanom
Language: Serbian

Show: Bom dia, Verônica
Language: Portuguese

Show: Aranyélet
Language: Hungarian

Show: Il commissario Montalbano
Language: Italian

Show: Capitani
Language: Luxembourgish

Show: 1864
Language: Danish

Show: Merlí
Language: Catalan

Show: Umbre
Language: Romanian

Show: Шифр
Language: Russian

Show: Navillera
Language: Korean

Show: Leila
Language: Hindi

Show: Les Témoins
Langu