In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import when
from pyspark.sql.types import IntegerType
import datetime
import os

# Address of the Spark master the notebook submits to.
master = "spark://zy-ubuntu:7077"

# Assemble the spark-submit arguments piece by piece and publish them through
# PYSPARK_SUBMIT_ARGS (resources, the Postgres JDBC driver package, pyspark-shell).
submit_args = [
    '--master', master,
    '--driver-memory', '4g',
    '--total-executor-cores', '8',
    '--executor-memory', '8g',
    '--packages', 'org.postgresql:postgresql:42.1.1',
    'pyspark-shell',
]
os.environ['PYSPARK_SUBMIT_ARGS'] = ' '.join(submit_args)

In [2]:
# Create (or reuse) the Spark session used by the rest of the notebook.
spark = SparkSession.builder \
    .appName("crew") \
    .getOrCreate()

# Configure the S3A connector so Spark can read the IMDb dumps from MinIO.
#
# These values are written straight into the Hadoop configuration, so the keys must
# be the plain `fs.s3a.*` names. The `spark.hadoop.` prefix is only meaningful in
# SparkConf / spark-submit settings; used here it creates keys S3A never looks up,
# which left path-style access and the filesystem impl effectively unset.
# The former "-Dcom.amazonaws.services.s3.enableV4" entry was a JVM system-property
# flag and has no effect as a Hadoop configuration key, so it is not set here.
#
# Credentials can be supplied through MINIO_ACCESS_KEY / MINIO_SECRET_KEY; the
# previous literal values remain as defaults so existing setups keep working.
s3a_settings = {
    "fs.s3a.access.key": os.environ.get("MINIO_ACCESS_KEY", "admin"),
    "fs.s3a.secret.key": os.environ.get("MINIO_SECRET_KEY", "password"),
    "fs.s3a.endpoint": "http://192.168.0.188:9000",  # original note: must use IP address
    "fs.s3a.connection.ssl.enabled": "false",
    "fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    "fs.s3a.path.style.access": "true",
    "fs.s3a.multipart.size": "104857600",
}

hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
for s3a_key, s3a_value in s3a_settings.items():
    hadoop_conf.set(s3a_key, s3a_value)

In [15]:
# Load the title.principals dump (tab-separated, first row is the header)
# and show the inferred column layout.
principals_path = "s3a://imdb/2021-06/06/title.principals.tsv"
title_principals_df = (
    spark.read
    .option("sep", r'\t')
    .option("header", True)
    .csv(principals_path)
)
title_principals_df.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- ordering: string (nullable = true)
 |-- nconst: string (nullable = true)
 |-- category: string (nullable = true)
 |-- job: string (nullable = true)
 |-- characters: string (nullable = true)



In [16]:
# Keep only crew rows: everything whose category is not 'self', 'actor' or 'actress'.
# Then drop the columns that are not needed downstream and expose `category` as `role`.
non_crew_categories = ['self', 'actor', 'actress']

title_crew_df = (
    title_principals_df
    .filter(~F.col('category').isin(non_crew_categories))
    .drop('job', 'characters', 'ordering')
    .withColumnRenamed('category', 'role')
)

In [17]:
# Conditions on the raw year columns; the dump uses the literal string '\N' for "unknown".
birth_known = F.col("birthYear") != '\\N'
death_known = F.col("deathYear") != '\\N'
death_unknown = F.col("deathYear") == '\\N'

# age:
#   - both years known        -> deathYear - birthYear
#   - only the birth year known -> current year - birthYear
#     (people with no recorded death year can therefore come out hundreds of years old)
#   - otherwise               -> null
age_col = (
    when(
        birth_known & death_known,
        F.col('deathYear').cast(IntegerType()) - F.col('birthYear').cast(IntegerType()),
    )
    .when(
        birth_known & death_unknown,
        (F.lit(datetime.datetime.now().year) - F.col('birthYear')).cast(IntegerType()),
    )
    .otherwise(None)
)

# is_alive: False when a death year is recorded, True when only the birth year is
# known, null when the birth year itself is unknown.
is_alive_col = (
    F.when(birth_known & death_known, False)
    .when(birth_known & death_unknown, True)
    .otherwise(None)
)

# Read the name.basics dump, rename primaryName -> name, and attach the derived columns.
name_basics_df = (
    spark.read.csv("s3a://imdb/2021-06/06/name.basics.tsv", sep=r'\t', header=True)
    .withColumnRenamed('primaryName', 'name')
    .withColumn("age", age_col)
    .withColumn('is_alive', is_alive_col)
)

# Columns that are no longer needed once age / is_alive have been computed.
unused_columns = ['primaryProfession', 'knownForTitles', 'birthYear', 'deathYear']
name_basics_df_dropped = name_basics_df.drop(*unused_columns)

In [18]:
# Attach each crew member's name / age / is_alive to their title rows (inner join on nconst).
df_crew = title_crew_df.join(name_basics_df_dropped, on='nconst', how='inner')

In [19]:
# One row per person: discard the per-title columns, then keep a single row for each nconst.
df_upload = (
    df_crew
    .drop('tconst', 'role')
    .dropDuplicates(['nconst'])
)

In [20]:
# Append the per-person rows in `df_upload` to the Postgres `crew` table over JDBC.
#
# User and password are taken from IMDB_PG_USER / IMDB_PG_PASSWORD when those
# environment variables are set; the previous literal values remain as defaults so
# existing setups keep working.
#
# NOTE: mode('append') adds rows on every execution, so re-running this cell sends
# the same crew rows to the table again.
crew_write_options = {
    'url': 'jdbc:postgresql://localhost:5433/imdb',
    'driver': 'org.postgresql.Driver',
    'dbtable': 'crew',
    'user': os.environ.get('IMDB_PG_USER', 'admin'),
    'password': os.environ.get('IMDB_PG_PASSWORD', 'password'),
}
df_upload.write.format('jdbc').options(**crew_write_options).mode('append').save()

In [21]:
# Load the Postgres `titles` table over JDBC.
#
# User and password are taken from IMDB_PG_USER / IMDB_PG_PASSWORD when those
# environment variables are set; the previous literal values remain as defaults so
# existing setups keep working.
titles_df = (
    spark.read.format('jdbc')
    .options(
        url='jdbc:postgresql://localhost:5433/imdb',
        driver='org.postgresql.Driver',
        dbtable='titles',
        user=os.environ.get('IMDB_PG_USER', 'admin'),
        password=os.environ.get('IMDB_PG_PASSWORD', 'password'),
    )
    .load()
)

In [22]:
# Lookup table from IMDb tconst to the database id of the title (exposed as title_id).
titles_id_df = titles_df.select(
    F.col('id').alias('title_id'),
    'tconst',
)

In [23]:
# Read the `crew` table back from Postgres over JDBC and print its schema;
# the `id` column it carries is used in later cells as crew_id.
#
# User and password are taken from IMDB_PG_USER / IMDB_PG_PASSWORD when those
# environment variables are set; the previous literal values remain as defaults so
# existing setups keep working.
crew_from_pg_df = (
    spark.read.format('jdbc')
    .options(
        url='jdbc:postgresql://localhost:5433/imdb',
        driver='org.postgresql.Driver',
        dbtable='crew',
        user=os.environ.get('IMDB_PG_USER', 'admin'),
        password=os.environ.get('IMDB_PG_PASSWORD', 'password'),
    )
    .load()
)
crew_from_pg_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: short (nullable = true)
 |-- is_alive: boolean (nullable = true)
 |-- nconst: string (nullable = true)



In [24]:
# Show the schemas of the two frames about to be combined: crew rows, then the title id lookup.
for schema_frame in (df_crew, titles_id_df):
    schema_frame.printSchema()

root
 |-- nconst: string (nullable = true)
 |-- tconst: string (nullable = true)
 |-- role: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- is_alive: boolean (nullable = true)

root
 |-- title_id: integer (nullable = true)
 |-- tconst: string (nullable = true)



In [25]:
# Lookup from nconst to the database id of the crew member (exposed as crew_id).
crew_id_df = crew_from_pg_df.select(
    F.col('id').alias('crew_id'),
    'nconst',
)

# The (person, title, role) triples that become rows of the link table.
composite_columns = ['nconst', 'tconst', 'role']
df_crew_composite_table = df_crew.select(*composite_columns)

In [26]:
# Resolve each (nconst, tconst, role) triple to its crew_id via an inner join on nconst.
df_tmp = crew_id_df.join(df_crew_composite_table, on='nconst', how='inner')

In [27]:
# Resolve tconst to title_id (inner join), then discard the IMDb identifiers so only
# crew_id, role and title_id remain.
df_titles_crew = (
    df_tmp
    .join(titles_id_df, on='tconst', how='inner')
    .drop('nconst', 'tconst')
)

In [28]:
# Append the crew/title link rows in `df_titles_crew` to the Postgres `titles_crew`
# table over JDBC.
#
# User and password are taken from IMDB_PG_USER / IMDB_PG_PASSWORD when those
# environment variables are set; the previous literal values remain as defaults so
# existing setups keep working.
#
# NOTE: mode('append') adds rows on every execution, so re-running this cell sends
# the same link rows to the table again.
titles_crew_write_options = {
    'url': 'jdbc:postgresql://localhost:5433/imdb',
    'driver': 'org.postgresql.Driver',
    'dbtable': 'titles_crew',
    'user': os.environ.get('IMDB_PG_USER', 'admin'),
    'password': os.environ.get('IMDB_PG_PASSWORD', 'password'),
}
df_titles_crew.write.format('jdbc').options(**titles_crew_write_options).mode('append').save()