In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, IntegerType
import datetime
import os

master = "spark://zy-ubuntu:7077"

# Launch flags handed to PySpark through the PYSPARK_SUBMIT_ARGS environment
# variable: standalone cluster master, driver/executor sizing, and the
# PostgreSQL JDBC driver package used by the JDBC writes in this notebook.
#
# Alternative flags for a single-machine run:
#   --master local[*] --driver-memory 6g --executor-memory 6g --packages org.postgresql:postgresql:42.1.1 pyspark-shell
os.environ['PYSPARK_SUBMIT_ARGS'] = ' '.join([
    '--master', master,
    '--driver-memory', '4g',
    '--total-executor-cores', '6',
    '--executor-memory', '8g',
    '--packages', 'org.postgresql:postgresql:42.1.1',
    'pyspark-shell',
])

In [2]:
# Obtain the Spark session used by every later cell (an already-running
# session is reused instead of creating a second one).
spark = (
    SparkSession.builder
    .appName("ok")
    .getOrCreate()
)

In [3]:
# One-row frame holding the date this notebook is run, split into
# year / month / day integer columns.
now = datetime.datetime.now()

data = [(now.year, now.month, now.day)]

# all three columns are nullable integers
schema = StructType([
    StructField(column, IntegerType(), True)
    for column in ("year", "month", "day")
])

dl_date_df = spark.createDataFrame(data=data, schema=schema)
dl_date_df.show()

+----+-----+---+
|year|month|day|
+----+-----+---+
|2021|    6|  7|
+----+-----+---+



In [4]:
# Append the download-date row to the dim_download_date table over JDBC.
# mode('append') adds rows on every run of this cell.
# NOTE(review): connection credentials are written inline here; consider
# supplying them from outside the notebook.
(
    dl_date_df.write
    .format('jdbc')
    .option('url', 'jdbc:postgresql://localhost:5433/imdb')
    .option('driver', 'org.postgresql.Driver')
    .option('dbtable', 'dim_download_date')
    .option('user', 'admin')
    .option('password', 'password')
    .mode('append')
    .save()
)

In [4]:
# Load the title/crew TSV; the first line of the file supplies the column names.
title_crew_df = (
    spark.read
    .option('sep', r'\t')
    .option('header', True)
    .csv("title.crew.tsv")
)

In [5]:
# Turn the comma-separated `writers` and `directors` columns into one row per
# (tconst, person) credit, flagged with the role.
#
# Each column is exploded on its own, starting from the untouched
# title_crew_df. Exploding both columns on the same frame pairs every director
# of a title with every writer of that title, so each credit is repeated once
# per credit on the other side and the later join multiplies those copies
# again. title_crew_df itself is not reassigned, so this cell can be re-run
# safely.

# writers -> columns: tconst, nconst, is_writer
df_writers = (
    title_crew_df
    .drop('directors')
    .withColumn('nconst', F.explode(F.split('writers', ',')))
    .drop('writers')
    .filter(F.col('nconst') != '\\N')      # drop rows carrying the '\N' marker instead of an id
    .withColumn('is_writer', F.lit(True))  # every surviving row is a writer credit
    .distinct()                            # at most one row per (tconst, nconst)
)

# directors -> columns: tconst, nconst, is_director
df_directors = (
    title_crew_df
    .drop('writers')
    .withColumn('nconst', F.explode(F.split('directors', ',')))
    .drop('directors')
    .filter(F.col('nconst') != '\\N')        # drop rows carrying the '\N' marker instead of an id
    .withColumn('is_director', F.lit(True))  # every surviving row is a director credit
    .distinct()                              # at most one row per (tconst, nconst)
)

In [6]:
# Full outer join of writer and director credits on (nconst, tconst): a credit
# present on only one side is kept, with the other side's flag left null.
df_joined = df_writers.join(
    df_directors,
    on=['nconst', 'tconst'],
    how='full',
)

In [9]:
# Keep a single row for every (nconst, tconst) pair produced by the full outer join.
# TODO (carried over from the original note): this step may need revisiting.
df_joined_deduped = df_joined.drop_duplicates(['nconst', 'tconst'])

In [10]:
# Build the per-person attributes (name, age, is_alive) from name.basics.tsv.

# is_alive:
#   False - a death year is recorded (regardless of whether the birth year is
#           known; previously such rows came out as null when birthYear was '\N')
#   True  - birth year is recorded and the death year is the '\N' marker
#   null  - anything else
is_alive_col = (
    F.when(F.col("deathYear") != '\\N', False)
    .when((F.col("birthYear") != '\\N') & (F.col("deathYear") == '\\N'), True)
    .otherwise(None)
)

name_basics_df = (
    spark.read.csv("name.basics.tsv", sep=r'\t', header=True)
    # rename column
    .withColumnRenamed('primaryName', 'name')
    # age = deathYear - birthYear; NOTE(review): with Spark's default cast
    # behavior a '\N' year becomes null, so age is presumably null whenever
    # either year is missing (including people with no death year) - confirm
    # this is intended
    .withColumn(
        'age',
        F.col('deathYear').cast(IntegerType()) - F.col('birthYear').cast(IntegerType()),
    )
    .withColumn('is_alive', is_alive_col)
)

# drop unused columns
name_basics_df_dropped = name_basics_df.drop('primaryProfession', 'knownForTitles', 'birthYear', 'deathYear')

In [11]:
# Attach the person attributes to every credit (join on nconst with the
# default join type), then replace null role flags with False.
df_final = (
    df_joined_deduped
    .join(name_basics_df_dropped, ['nconst'])
    .fillna({'is_writer': False, 'is_director': False})
)

In [11]:
# Append the assembled crew rows to the dim_crew table over JDBC.
# mode('append') adds rows on every run of this cell.
# NOTE(review): connection credentials are written inline here; consider
# supplying them from outside the notebook.
(
    df_final.write
    .format('jdbc')
    .option('url', 'jdbc:postgresql://localhost:5433/imdb')
    .option('driver', 'org.postgresql.Driver')
    .option('dbtable', 'dim_crew')
    .option('user', 'admin')
    .option('password', 'password')
    .mode('append')
    .save()
)