In [1]:
import copy
import datetime
import os

import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, ShortType, DecimalType

# Spark master URL the notebook submits to.
master = "spark://zy-ubuntu:7077"

# Launch arguments for the PySpark shell: cluster resources plus the PostgreSQL
# JDBC driver package. Exported through the environment before any session exists.
submit_args = f'--master {master} --driver-memory 4g --total-executor-cores 6 --executor-memory 8g --packages org.postgresql:postgresql:42.1.1 pyspark-shell'
os.environ.update(PYSPARK_SUBMIT_ARGS=submit_args)

In [2]:
# Create (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("title desc")
    .getOrCreate()
)

In [3]:
# Load the raw title.basics dump; every column is read as a string.
# Quoting is disabled (quote='') because Spark's CSV reader treats '"' as a
# quote character by default, which would strip or mis-parse titles that
# contain double quotes.
# NOTE(review): assumes the IMDb TSV files are not quote-escaped - confirm.
title_basics_df = spark.read.csv(
    "title.basics.tsv",
    sep=r'\t',
    header=True,
    quote='',
)
title_basics_df.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- titleType: string (nullable = true)
 |-- primaryTitle: string (nullable = true)
 |-- originalTitle: string (nullable = true)
 |-- isAdult: string (nullable = true)
 |-- startYear: string (nullable = true)
 |-- endYear: string (nullable = true)
 |-- runtimeMinutes: string (nullable = true)
 |-- genres: string (nullable = true)



In [4]:
# IMDb writes the two-character token \N wherever a value is missing.
MISSING_TOKEN = '\\N'


def null_if_missing(column_name):
    """Return the named column with the IMDb missing-value token mapped to NULL."""
    value = F.col(column_name)
    return F.when(value == MISSING_TOKEN, F.lit(None)).otherwise(value)


# Null the sentinel before splitting so a title without genres gets NULL in
# genre_1 instead of the literal token. genre_1..genre_3 are taken positionally
# from the comma-separated list.
genres_split = F.split(null_if_missing('genres'), ',')

title_basics_df = (
    title_basics_df
    .withColumn('genre_1', genres_split.getItem(0))
    .withColumn('genre_2', genres_split.getItem(1))
    .withColumn('genre_3', genres_split.getItem(2))
    # Rename the camelCase source columns to snake_case.
    .withColumnRenamed('titleType', 'type')
    .withColumnRenamed('primaryTitle', 'primary_title')
    .withColumnRenamed('originalTitle', 'original_title')
    .withColumnRenamed('isAdult', 'is_adult')
    .withColumnRenamed('startYear', 'start_year')
    .withColumnRenamed('endYear', 'end_year')
    .withColumnRenamed('runtimeMinutes', 'runtime_minutes')
    .drop('genres')
)

# '0' means non-adult; the when/otherwise form keeps the column non-nullable.
# NOTE(review): any value other than '0' (including NULL) becomes True.
title_basics_df = title_basics_df.withColumn(
    'is_adult',
    F.when(F.col('is_adult') == '0', F.lit(False)).otherwise(F.lit(True)),
)

# Map the missing-value token to NULL explicitly for every numeric column
# (start_year included) and only then cast, so the result does not depend on
# the cast silently turning a non-numeric string into NULL.
for numeric_column in ('start_year', 'end_year', 'runtime_minutes'):
    title_basics_df = title_basics_df.withColumn(
        numeric_column,
        null_if_missing(numeric_column).cast(ShortType()),
    )

In [68]:
# Print the schema of title_basics_df to check the renamed columns and their types.
title_basics_df.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- type: string (nullable = true)
 |-- primary_title: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- is_adult: boolean (nullable = false)
 |-- start_year: short (nullable = true)
 |-- end_year: short (nullable = true)
 |-- runtime_minutes: short (nullable = true)
 |-- genre_1: string (nullable = true)
 |-- genre_2: string (nullable = true)
 |-- genre_3: string (nullable = true)



In [5]:
# Load the raw title.ratings dump (tab-separated, first line is the header).
ratings_reader = spark.read.options(sep=r'\t', header=True)
title_ratings_df = ratings_reader.csv("title.ratings.tsv")
title_ratings_df.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- averageRating: string (nullable = true)
 |-- numVotes: string (nullable = true)



In [6]:
# Rename the rating columns to snake_case and give them numeric types.
title_ratings_df = (
    title_ratings_df
    .withColumnRenamed("averageRating", 'av_rating')
    .withColumnRenamed('numVotes', 'num_votes')
    .withColumn('av_rating', F.col('av_rating').cast(DecimalType(10, 1)))
    .withColumn('num_votes', F.col('num_votes').cast(IntegerType()))
)
title_ratings_df.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- av_rating: decimal(10,1) (nullable = true)
 |-- num_votes: integer (nullable = true)



In [7]:
# Inner join on tconst: only titles that have a ratings row are kept.
df_final = title_basics_df.join(title_ratings_df, on=['tconst'], how='inner')

In [72]:
# Print the schema of the joined frame (title columns followed by the rating columns).
df_final.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- type: string (nullable = true)
 |-- primary_title: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- is_adult: boolean (nullable = false)
 |-- start_year: short (nullable = true)
 |-- end_year: short (nullable = true)
 |-- runtime_minutes: short (nullable = true)
 |-- genre_1: string (nullable = true)
 |-- genre_2: string (nullable = true)
 |-- genre_3: string (nullable = true)
 |-- av_rating: decimal(10,1) (nullable = true)
 |-- num_votes: integer (nullable = true)



In [8]:
def set_df_columns_nullable(spark, df, column_list, nullable=True):
    """Return a new DataFrame whose listed columns carry the given nullability.

    The input DataFrame and its schema object are left untouched: the
    nullability flags are changed on a deep copy of ``df.schema``, and the
    result is rebuilt from ``df.rdd`` with that copy.

    Args:
        spark: Session used to build the new DataFrame (``createDataFrame``).
        df: Source DataFrame.
        column_list: Names of the columns whose ``nullable`` flag is set.
            Names that are not in the schema are ignored.
        nullable: Flag value to apply to the listed columns.

    Returns:
        The DataFrame produced by ``spark.createDataFrame(df.rdd, new_schema)``.
    """
    target_columns = set(column_list)
    # Work on a copy: mutating the fields of df.schema in place would also
    # change the schema object held by the caller's DataFrame.
    new_schema = copy.deepcopy(df.schema)
    for struct_field in new_schema:
        if struct_field.name in target_columns:
            struct_field.nullable = nullable
    return spark.createDataFrame(df.rdd, new_schema)

# Columns that are marked NOT NULL in the schema before writing.
required_columns = ['tconst','primary_title', 'original_title']
df_final = set_df_columns_nullable(
    spark,
    df_final,
    column_list=required_columns,
    nullable=False,
)
df_final.printSchema()

root
 |-- tconst: string (nullable = false)
 |-- type: string (nullable = true)
 |-- primary_title: string (nullable = false)
 |-- original_title: string (nullable = false)
 |-- is_adult: boolean (nullable = false)
 |-- start_year: short (nullable = true)
 |-- end_year: short (nullable = true)
 |-- runtime_minutes: short (nullable = true)
 |-- genre_1: string (nullable = true)
 |-- genre_2: string (nullable = true)
 |-- genre_3: string (nullable = true)
 |-- av_rating: decimal(10,1) (nullable = true)
 |-- num_votes: integer (nullable = true)



In [9]:
# Append df_final to the dim_title_desc table over JDBC.
# Connection settings are read from the environment so real credentials do not
# have to be stored in the notebook; the fallbacks are the original
# local-development values.
# NOTE(review): mode('append') adds the rows again on every run of this cell.
jdbc_options = {
    'url': os.environ.get('IMDB_JDBC_URL', 'jdbc:postgresql://localhost:5433/imdb'),
    'driver': 'org.postgresql.Driver',
    'dbtable': 'dim_title_desc',
    'user': os.environ.get('IMDB_DB_USER', 'admin'),
    'password': os.environ.get('IMDB_DB_PASSWORD', 'password'),
}
df_final.write.format('jdbc').options(**jdbc_options).mode('append').save()