In [4]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType
import os

# Spark standalone master that the PySpark shell is pointed at.
master = "spark://zy-ubuntu:7077"

# Arguments PySpark passes to spark-submit when it launches the JVM:
# cluster sizing (driver/executor memory, total cores) plus the PostgreSQL
# JDBC driver package used by the final write cell. Adjacent literals are
# concatenated into one string.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    f'--master {master} '
    '--driver-memory 4g '
    '--total-executor-cores 6 '
    '--executor-memory 8g '
    '--packages org.postgresql:postgresql:42.1.1 '
    'pyspark-shell'
)

In [5]:
# Create the session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("title crew")
    .getOrCreate()
)

# Debugging aid: uncomment to list the effective Spark configuration.
# spark.sparkContext.getConf().getAll()

In [6]:
# Load the principals TSV: tab-delimited with a header row. No schema is
# supplied or inferred, so every column comes back as a string.
title_principals_df = (
    spark.read
    .option("sep", r'\t')
    .option("header", True)
    .csv("title.principals.tsv")
)
title_principals_df.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- ordering: string (nullable = true)
 |-- nconst: string (nullable = true)
 |-- category: string (nullable = true)
 |-- job: string (nullable = true)
 |-- characters: string (nullable = true)



In [7]:
# Preview the first 20 rows, truncating long cell values (the show() defaults).
title_principals_df.show(n=20, truncate=True)

+---------+--------+---------+---------------+--------------------+--------------+
|   tconst|ordering|   nconst|       category|                 job|    characters|
+---------+--------+---------+---------------+--------------------+--------------+
|tt0000001|       1|nm1588970|           self|                  \N|      ["Self"]|
|tt0000001|       2|nm0005690|       director|                  \N|            \N|
|tt0000001|       3|nm0374658|cinematographer|director of photo...|            \N|
|tt0000002|       1|nm0721526|       director|                  \N|            \N|
|tt0000002|       2|nm1335271|       composer|                  \N|            \N|
|tt0000003|       1|nm0721526|       director|                  \N|            \N|
|tt0000003|       2|nm1770680|       producer|            producer|            \N|
|tt0000003|       3|nm1335271|       composer|                  \N|            \N|
|tt0000003|       4|nm5442200|         editor|                  \N|            \N|
|tt0

In [23]:
# Build the crew table from the principals:
#   * discard rows whose category is 'self', 'actor' or 'actress'
#     (rows with a NULL category are discarded as well, since the negated
#     membership test evaluates to NULL for them),
#   * keep only the title id, person id and category,
#   * expose the category under the name `role`.
title_crew_df = (
    title_principals_df
    .where(~F.col('category').isin('self', 'actor', 'actress'))
    .drop('job', 'characters', 'ordering')
    .withColumnRenamed('category', 'role')
)

# Sanity check: which roles survived the filter?
title_crew_df.select('role').distinct().show()

# Preview of the resulting crew rows.
title_crew_df.show()

+-------------------+
|               role|
+-------------------+
|           producer|
|             writer|
|           composer|
|           director|
|             editor|
|    cinematographer|
|      archive_sound|
|production_designer|
|    archive_footage|
+-------------------+

+---------+---------+---------------+
|   tconst|   nconst|           role|
+---------+---------+---------------+
|tt0000001|nm0005690|       director|
|tt0000001|nm0374658|cinematographer|
|tt0000002|nm0721526|       director|
|tt0000002|nm1335271|       composer|
|tt0000003|nm0721526|       director|
|tt0000003|nm1770680|       producer|
|tt0000003|nm1335271|       composer|
|tt0000003|nm5442200|         editor|
|tt0000004|nm0721526|       director|
|tt0000004|nm1335271|       composer|
|tt0000005|nm0005690|       director|
|tt0000005|nm0249379|       producer|
|tt0000006|nm0005690|       director|
|tt0000007|nm0005690|       director|
|tt0000007|nm0374658|       director|
|tt0000007|nm0249379|       p

In [21]:
# Build the person attributes (name, age, is_alive) from name.basics.tsv.
#
# The file is tab-separated with a header row and is read without a schema,
# so every column is a string; an unknown year is the literal string \N.
birth_known = F.col("birthYear") != '\\N'
death_known = F.col("deathYear") != '\\N'
death_unknown = F.col("deathYear") == '\\N'

# is_alive:
#   False - a death year is recorded. This holds even when the birth year is
#           unknown: previously such rows fell through to NULL although the
#           person is known to be dead.
#   True  - the birth year is known and no death year is recorded.
#   NULL  - anything else (e.g. both years unknown).
is_alive_col = (
    F.when(death_known, False)
    .when(birth_known & death_unknown, True)
    .otherwise(None)
)

name_basics_df = (
    spark.read.csv("name.basics.tsv", sep=r'\t', header=True)
    # rename column
    .withColumnRenamed('primaryName', 'name')
    # Age is deathYear - birthYear, i.e. the age at death. When the integer
    # cast turns \N into NULL (non-ANSI cast behaviour) the result is NULL,
    # so age is NULL for everyone still alive.
    # NOTE(review): confirm that a NULL age for living people is intended.
    .withColumn(
        'age',
        F.col('deathYear').cast(IntegerType()) - F.col('birthYear').cast(IntegerType()),
    )
    .withColumn('is_alive', is_alive_col)
)

# drop unused columns
name_basics_df_dropped = name_basics_df.drop(
    'primaryProfession', 'knownForTitles', 'birthYear', 'deathYear'
)

In [24]:
# Attach name, age and is_alive to every crew row. This is an inner join on
# the shared `nconst` key, so rows without a match on either side are dropped
# and `nconst` appears only once in the result.
df_crew = title_crew_df.join(name_basics_df_dropped, on='nconst', how='inner')
df_crew.show()

+---------+----------+---------------+--------------+---+--------+
|   nconst|    tconst|           role|          name|age|is_alive|
+---------+----------+---------------+--------------+---+--------+
|nm0000086| tt0741798|archive_footage|Louis de Funès| 69|   false|
|nm0000086| tt0804908|archive_footage|Louis de Funès| 69|   false|
|nm0000086| tt1032923|archive_footage|Louis de Funès| 69|   false|
|nm0000086|tt10424852|archive_footage|Louis de Funès| 69|   false|
|nm0000086|tt10883038|archive_footage|Louis de Funès| 69|   false|
|nm0000086|tt11739456|archive_footage|Louis de Funès| 69|   false|
|nm0000086|tt11935190|archive_footage|Louis de Funès| 69|   false|
|nm0000086|tt12069126|archive_footage|Louis de Funès| 69|   false|
|nm0000086|tt12141344|archive_footage|Louis de Funès| 69|   false|
|nm0000086|tt12206554|archive_footage|Louis de Funès| 69|   false|
|nm0000086|tt12600988|archive_footage|Louis de Funès| 69|   false|
|nm0000086|tt12808176|archive_footage|Louis de Funès| 69|   fa

In [26]:
# Append the joined crew rows to the `dim_crew` table in PostgreSQL over JDBC.
#
# The password is read from the environment so it is never stored in the
# notebook or its version history. The URL and user can be overridden the
# same way and otherwise keep their previous values.
# NOTE(review): mode('append') inserts the rows again every time this cell
# runs; confirm the target table is emptied between runs, or switch to an
# overwrite/truncate strategy.
jdbc_password = os.environ.get('IMDB_DB_PASSWORD')
if not jdbc_password:
    raise RuntimeError(
        "IMDB_DB_PASSWORD is not set; define it in the kernel's environment "
        "before running this cell"
    )

df_crew.write.format('jdbc').options(
    url=os.environ.get('IMDB_DB_URL', 'jdbc:postgresql://localhost:5433/imdb'),
    driver='org.postgresql.Driver',
    dbtable='dim_crew',
    user=os.environ.get('IMDB_DB_USER', 'admin'),
    password=jdbc_password,
).mode('append').save()