In [None]:
from pyspark.sql import SparkSession
from delta import *
import pyspark.sql.functions as F

# Assemble the session builder for an application named "Delta":
#   * point Hive at the metastore service on thrift://hive-metastore:9083,
#   * register the Delta Lake SQL extension and the Delta catalog,
#   * use the Hive catalog implementation,
#   * turn off the thrift server's single-session mode.
builder = (
    SparkSession.builder
    .appName("Delta")
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.sql.catalogImplementation", "hive")
    .config("spark.sql.hive.thriftServer.singleSession", "false")
)

# Pass the builder through the Delta pip helper, enable Hive support, and
# start the session (or reuse one that is already running).
spark = (
    configure_spark_with_delta_pip(builder)
    .enableHiveSupport()
    .getOrCreate()
)


In [None]:
# Load the movies CSV from the working directory, treating the first line as
# the header row.
raw_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("./movies.csv")
)

In [None]:
# Print the column names and types of the frame as loaded from the CSV.
raw_df.printSchema()

In [None]:
# Preview the loaded rows (show() with no arguments prints the first 20).
raw_df.show()

In [None]:
# Cast the numeric columns to integers, one column at a time and in place
# (each column keeps its original name and position).
# NOTE(review): an int cast drops any fractional part; if "Profitability"
# holds decimal ratios this loses precision — confirm that is intended.
for numeric_col in ("Year", "Rotten Tomatoes %", "Profitability", "Audience score %"):
    raw_df = raw_df.withColumn(numeric_col, F.col(numeric_col).cast("int"))

In [None]:
# Map each CSV header to the snake_case column name used by the
# delta_db.movie table definition.
column_renames = {
    "Film": "film",
    "Genre": "genre",
    "Lead Studio": "lead_studio",
    "Audience score %": "audience_score",
    "Rotten Tomatoes %": "rotten_tomatoes",
    "Worldwide Gross": "worldwide_gross",
    "Year": "year",
    "Profitability": "profitability",
}

# Apply the renames one by one; a header missing from the frame is a no-op.
for old_name, new_name in column_renames.items():
    raw_df = raw_df.withColumnRenamed(old_name, new_name)

In [None]:
# Print the schema again to check the integer casts and snake_case renames.
raw_df.printSchema()

In [None]:
# Create the delta_db schema at its HDFS warehouse location.
# IF NOT EXISTS keeps this cell idempotent: the schema persists in the
# metastore, so without it a second "Restart & Run All" fails here because
# the schema already exists.
spark.sql("CREATE SCHEMA IF NOT EXISTS delta_db LOCATION 'hdfs://namenode:8020/user/hive/warehouse/deltalake';")

In [None]:
# Create the Delta table that will hold the movie data, partitioned by year
# and stored under the delta_db warehouse location.
# IF NOT EXISTS keeps this cell idempotent: the table persists in the
# metastore, so without it re-running the notebook fails here because the
# table already exists.
# NOTE(review): if the table already exists with a different column list,
# this statement leaves it untouched — drop it manually to change the schema.
spark.sql("""
CREATE TABLE IF NOT EXISTS delta_db.movie(
        film string,
        genre string,
        lead_studio string,
        audience_score int,
        profitability int,
        rotten_tomatoes int,
        worldwide_gross string,
        year int
        )
USING delta
PARTITIONED BY (year)
LOCATION 'hdfs://namenode:8020/user/hive/warehouse/deltalake/movie';
""")

In [None]:
# Make delta_db the current schema, then list the tables it contains.
spark.sql("USE delta_db")
tables_df = spark.sql("SHOW TABLES")
tables_df.show()

In [None]:
# Append the cleaned movie rows to the Delta table.
# The original cell wrote `high_df`, which is never defined in this notebook
# and raises NameError on a fresh kernel; `raw_df` is the frame whose columns
# (after the casts and renames) match the delta_db.movie definition.
# NOTE(review): mode("append") adds the rows again on every run — re-running
# this cell duplicates the data in the table.
raw_df.write.format("delta").mode("append").saveAsTable("delta_db.movie")