In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, IntegerType
import datetime
import os

master = "spark://zy-ubuntu:7077"  
os.environ['PYSPARK_SUBMIT_ARGS'] = f'--master {master} --driver-memory 4g --total-executor-cores 6 --executor-memory 8g --packages org.postgresql:postgresql:42.1.1 pyspark-shell'

In [2]:
spark = SparkSession.builder \
    .appName("title principals") \
    .getOrCreate()

In [8]:
title_principals_df = spark.read.csv("title.principals.tsv", sep=r'\t', header=True)
title_principals_df.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- ordering: string (nullable = true)
 |-- nconst: string (nullable = true)
 |-- category: string (nullable = true)
 |-- job: string (nullable = true)
 |-- characters: string (nullable = true)



In [9]:
title_principals_df = title_principals_df.drop('ordering').drop('job').drop('category')

title_principals_df = title_principals_df.withColumn("characters", F.regexp_replace(F.col("characters"), '[\[\]\"]', "").alias("replaced"))
title_principals_df = title_principals_df.withColumn('characters', F.explode(F.split('characters', ',')))
title_principals_df = title_principals_df.withColumnRenamed('characters', 'character')
title_casts_df = title_principals_df.filter(
        (F.col('category') == 'self') | 
        (F.col('category') == 'actor') | 
        (F.col('category') == 'actress'))
title_casts_df.show(truncate=False)   

+---------+---------+--------------------------------------------------------------+
|tconst   |nconst   |character                                                     |
+---------+---------+--------------------------------------------------------------+
|tt0000001|nm1588970|Self                                                          |
|tt0000005|nm0443482|Blacksmith                                                    |
|tt0000005|nm0653042|Assistant                                                     |
|tt0000007|nm0179163|\N                                                            |
|tt0000007|nm0183947|\N                                                            |
|tt0000008|nm0653028|Sneezing Man                                                  |
|tt0000009|nm0063086|Miss Geraldine Holbrook (Miss Jerry)                          |
|tt0000009|nm0183823|Mr. Hamilton                                                  |
|tt0000009|nm1309758|Chauncey Depew - the Director of the New Yor

In [10]:
# read tsv file into df
name_basics_df = spark.read.csv("name.basics.tsv", sep=r'\t', header=True)

# rename column
name_basics_df = name_basics_df.withColumnRenamed('primaryName', 'name')

# calculate age from birth year and death year
name_basics_df = name_basics_df.withColumn('age', ( F.col('deathYear').cast(IntegerType()) - F.col('birthYear').cast(IntegerType()) ))

# create is_alive column based on conditions
is_alive_col = F.when(
    (F.col("birthYear") != '\\N') & (F.col("deathYear") != '\\N'), False
).when((F.col("birthYear") != '\\N') & (F.col("deathYear") == '\\N'), True).otherwise(None)

name_basics_df = name_basics_df.withColumn('is_alive', is_alive_col)

# drop unused columns
name_basics_df_dropped = name_basics_df.drop('primaryProfession', 'knownForTitles', 'birthYear', 'deathYear')

In [18]:
df_casts = title_casts_df.join(name_basics_df_dropped, ['nconst'])
df_casts = df_casts.withColumn('character', F.when(F.col('character') == '\\N', F.lit(None)).otherwise(F.col('character')))
df_casts.show()

+---------+---------+--------------------+--------------+---+--------+
|   nconst|   tconst|           character|          name|age|is_alive|
+---------+---------+--------------------+--------------+---+--------+
|nm0000086|tt0043398|                null|Louis de Funès| 69|   false|
|nm0000086|tt0043691|    Piotr Petrovitch|Louis de Funès| 69|   false|
|nm0000086|tt0045293|               Emile|Louis de Funès| 69|   false|
|nm0000086|tt0045293| le valet de chambre|Louis de Funès| 69|   false|
|nm0000086|tt0046243|       Le professeur|Louis de Funès| 69|   false|
|nm0000086|tt0046453|        Eddy Gorlier|Louis de Funès| 69|   false|
|nm0000086|tt0047462|          Boulingrin|Louis de Funès| 69|   false|
|nm0000086|tt0048994|Célestin Ratier -...|Louis de Funès| 69|   false|
|nm0000086|tt0049877|             Jambier|Louis de Funès| 69|   false|
|nm0000086|tt0049877|           l'épicier|Louis de Funès| 69|   false|
|nm0000086|tt0050260|       Pierre Cousin|Louis de Funès| 69|   false|
|nm000

In [20]:
# insert df into dim_casts table
df_casts.write.format('jdbc').options(
      url='jdbc:postgresql://localhost:5433/imdb',
      driver='org.postgresql.Driver',
      dbtable='dim_casts',
      user='admin',
      password='password'
      ).mode('append').save()