In [38]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import when
from pyspark.sql.types import StructType, StructField, ShortType, StringType
import datetime
import os

master = "spark://zy-ubuntu:7077"  
os.environ['PYSPARK_SUBMIT_ARGS'] = f'--master {master} --driver-memory 4g --total-executor-cores 8 --executor-memory 8g --packages org.postgresql:postgresql:42.1.1 pyspark-shell'

In [2]:
spark = SparkSession.builder \
    .appName("dim episodes") \
    .getOrCreate()

In [32]:
title_episodes_df = spark.read.csv("title.episode.tsv", sep=r'\t', header=True)
title_episodes_df.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- parentTconst: string (nullable = true)
 |-- seasonNumber: string (nullable = true)
 |-- episodeNumber: string (nullable = true)



In [33]:
title_episodes_df = title_episodes_df.withColumn('seasonNumber', F.col('seasonNumber').cast(ShortType()))
title_episodes_df = title_episodes_df.withColumn('episodeNumber', F.col('episodeNumber').cast(ShortType()))
title_episodes_df = title_episodes_df.drop('tconst')
title_episodes_df.sort('parentTconst', F.col('seasonNumber').desc(), F.col('episodeNumber').desc()).show()

+------------+------------+-------------+
|parentTconst|seasonNumber|episodeNumber|
+------------+------------+-------------+
|   tt0032557|           1|           15|
|   tt0032557|           1|           14|
|   tt0032557|           1|           13|
|   tt0032557|           1|           12|
|   tt0032557|           1|           11|
|   tt0032557|           1|           10|
|   tt0032557|           1|            9|
|   tt0032557|           1|            8|
|   tt0032557|           1|            7|
|   tt0032557|           1|            6|
|   tt0032557|           1|            5|
|   tt0032557|           1|            4|
|   tt0032557|           1|            3|
|   tt0032557|           1|            2|
|   tt0032557|           1|            1|
|   tt0038276|           1|            1|
|   tt0038276|        null|         null|
|   tt0038276|        null|         null|
|   tt0038276|        null|         null|
|   tt0038276|        null|         null|
+------------+------------+-------

In [34]:
# get the list of unique parent tconst in the df
parentTconst_list = [row.parentTconst for row in title_episodes_df.select('parentTconst').distinct().collect()]

In [84]:
tmp = title_episodes_df.filter(F.col('parentTconst') == 'tt0168358').sort('parentTconst', F.col('seasonNumber').desc(), F.col('episodeNumber').desc())

season = tmp.collect()[0][1]

output = []
output_columns = ["parent_tconst", "season", 'total_episodes']
while True:
    season = tmp.filter(F.col('seasonNumber') == season).collect()[0][1]
    ep_count = tmp.filter(F.col('seasonNumber') == season).collect()[0][2]
    output.append((parentTconst_list[0], season, ep_count))
    if season == 1 or season == None:
        break
    season -= 1

output_df = spark.createDataFrame(data=output, schema=output_columns)

IndexError: list index out of range

In [85]:
tmp1 = title_episodes_df.filter(F.col('parentTconst') == 'tt0168358').sort('parentTconst', F.col('seasonNumber').desc(), F.col('episodeNumber').desc())
tmp1.show()


+------------+------------+-------------+
|parentTconst|seasonNumber|episodeNumber|
+------------+------------+-------------+
|   tt0168358|          26|           26|
|   tt0168358|          26|            3|
|   tt0168358|          26|            2|
|   tt0168358|          19|            1|
|   tt0168358|          18|            1|
|   tt0168358|          17|            1|
|   tt0168358|          16|            1|
|   tt0168358|          15|            1|
|   tt0168358|          14|            1|
|   tt0168358|          13|           10|
|   tt0168358|          13|            1|
|   tt0168358|          12|            9|
|   tt0168358|          12|            1|
|   tt0168358|          11|           52|
|   tt0168358|          11|           51|
|   tt0168358|          11|           50|
|   tt0168358|          11|           49|
|   tt0168358|          11|           48|
|   tt0168358|          11|           47|
|   tt0168358|          11|           46|
+------------+------------+-------

In [89]:
def process_episodes(title_episodes_df, parent_tconst):
    tmp = title_episodes_df.filter(F.col('parentTconst') == parent_tconst).sort('parentTconst', F.col('seasonNumber').desc(), F.col('episodeNumber').desc())
    season = tmp.collect()[0][1]  # check if season is a number or null

    output = []
    schema = StructType([
        StructField("parent_tconst", StringType(), False), 
        StructField("season", ShortType()), 
        StructField("total_episodes", ShortType())
    ])

    if season is None:  # if season is none, then episode = number of occurences
        output.append((parent_tconst, season, tmp.count()))
    else:
        while True:
            season = tmp.filter(F.col('seasonNumber') == season).collect()[0][1]
            ep_count = tmp.filter(F.col('seasonNumber') == season).collect()[0][2]
            output.append((parent_tconst, season, ep_count))

            tmp = tmp.filter(F.col('seasonNumber') != season)  # drop the processed season number rows
            if tmp.count() < 1:  # there is no more rows to process
                break
            season = tmp.collect()[0][1]
    return spark.createDataFrame(data=output, schema=schema)

In [90]:
schema = StructType([
    StructField("parent_tconst", StringType(), False), StructField("season", ShortType()), StructField("total_episodes", ShortType())
])

df_final = spark.createDataFrame([], schema)

In [91]:
for tconst in parentTconst_list:
    each_df = process_episodes(title_episodes_df, tconst)
    df_final = df_final.union(each_df)

IndexError: list index out of range

In [None]:
df_final.show()