In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType, DecimalType
import os

# Address of the standalone Spark master this notebook connects to.
master = "spark://zy-ubuntu:7077"

# PYSPARK_SUBMIT_ARGS must be set before the SparkSession is created.
# The arguments are listed one per line and joined with single spaces.
submit_args = [
    "--master", master,
    "--driver-memory", "4g",
    "--total-executor-cores", "6",
    "--executor-memory", "8g",
    # PostgreSQL JDBC driver, needed for the JDBC write at the end.
    "--packages", "org.postgresql:postgresql:42.1.1",
    "pyspark-shell",
]
os.environ["PYSPARK_SUBMIT_ARGS"] = " ".join(submit_args)

In [2]:
# Create (or reuse) the Spark session for this notebook; cluster settings
# come from the PYSPARK_SUBMIT_ARGS environment variable set above.
spark = (
    SparkSession.builder
    .appName("title ratings")
    .getOrCreate()
)

In [3]:
# Load the tab-separated ratings file; the first line holds the column names.
# No schema is supplied, so every column is read as a string.
title_ratings_df = (
    spark.read
    .option("sep", r"\t")
    .option("header", True)
    .csv("title.ratings.tsv")
)
title_ratings_df.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- averageRating: string (nullable = true)
 |-- numVotes: string (nullable = true)



In [4]:
# Preview the first 20 raw rows (all columns are still strings here).
title_ratings_df.show(n=20, truncate=True)

+---------+-------------+--------+
|   tconst|averageRating|numVotes|
+---------+-------------+--------+
|tt0000001|          5.7|    1707|
|tt0000002|          6.1|     210|
|tt0000003|          6.5|    1466|
|tt0000004|          6.2|     123|
|tt0000005|          6.2|    2267|
|tt0000006|          5.1|     127|
|tt0000007|          5.5|     691|
|tt0000008|          5.4|    1878|
|tt0000009|          5.9|     155|
|tt0000010|          6.9|    6346|
|tt0000011|          5.2|     282|
|tt0000012|          7.4|   10906|
|tt0000013|          5.8|    1635|
|tt0000014|          7.1|    4861|
|tt0000015|          6.2|     884|
|tt0000016|          5.9|    1262|
|tt0000017|          4.6|     257|
|tt0000018|          5.4|     508|
|tt0000019|          5.3|      19|
|tt0000020|          4.9|     282|
+---------+-------------+--------+
only showing top 20 rows



In [5]:
# Switch to snake_case column names and give the numeric columns real types:
# the rating becomes a decimal with one fractional digit, the vote count an integer.
rating_type = DecimalType(10, 1)
votes_type = IntegerType()

title_ratings_df = (
    title_ratings_df
    .withColumnRenamed("averageRating", 'av_rating')
    .withColumnRenamed('numVotes', 'num_votes')
    .withColumn('av_rating', F.col('av_rating').cast(rating_type))
    .withColumn('num_votes', F.col('num_votes').cast(votes_type))
)
title_ratings_df.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- av_rating: decimal(10,1) (nullable = true)
 |-- num_votes: integer (nullable = true)



In [6]:
# Preview the first 20 rows after the rename and type casts.
title_ratings_df.show(n=20, truncate=True)

+---------+---------+---------+
|   tconst|av_rating|num_votes|
+---------+---------+---------+
|tt0000001|      5.7|     1707|
|tt0000002|      6.1|      210|
|tt0000003|      6.5|     1466|
|tt0000004|      6.2|      123|
|tt0000005|      6.2|     2267|
|tt0000006|      5.1|      127|
|tt0000007|      5.5|      691|
|tt0000008|      5.4|     1878|
|tt0000009|      5.9|      155|
|tt0000010|      6.9|     6346|
|tt0000011|      5.2|      282|
|tt0000012|      7.4|    10906|
|tt0000013|      5.8|     1635|
|tt0000014|      7.1|     4861|
|tt0000015|      6.2|      884|
|tt0000016|      5.9|     1262|
|tt0000017|      4.6|      257|
|tt0000018|      5.4|      508|
|tt0000019|      5.3|       19|
|tt0000020|      4.9|      282|
+---------+---------+---------+
only showing top 20 rows



In [15]:
# Write the ratings frame to the Postgres table `dim_ratings` over JDBC.
#
# Connection settings are read from environment variables so real credentials
# do not have to be stored in the notebook. The fallbacks are the original
# local-development values, so the cell still runs when nothing is set.
#
# NOTE: mode('append') adds rows to the existing table on every execution,
# so re-running this cell inserts the same data again.
jdbc_options = {
    'url': os.environ.get('IMDB_JDBC_URL', 'jdbc:postgresql://localhost:5433/imdb'),
    'driver': 'org.postgresql.Driver',
    'dbtable': 'dim_ratings',
    'user': os.environ.get('IMDB_DB_USER', 'admin'),
    'password': os.environ.get('IMDB_DB_PASSWORD', 'password'),
}
title_ratings_df.write.format('jdbc').options(**jdbc_options).mode('append').save()