In [0]:
%run ./includes/configuration

In [0]:
import json
# Combine the eight source files movie_0.json through movie_7.json (read from
# moviePipelinePath) into one JSON document and write it to the raw path.
movies = {"movie": []}
for i in range(8):
    # JSON text is UTF-8 by specification; pass the encoding explicitly so the
    # decode does not depend on the driver's locale default.
    with open(moviePipelinePath + f"movie_{i}.json", encoding="utf-8") as f:
        data = json.load(f)
        # NOTE(review): append() stores each file's "movie" value as a single
        # element. If that value is itself a list, the result is a list of
        # lists; confirm whether downstream expects this before switching to
        # extend().
        movies["movie"].append(data["movie"])

# The third positional argument (True) asks dbutils.fs.put to overwrite any
# existing file at rawPath.
# NOTE(review): indent=2 pretty-prints the document over many lines, and the
# later text-format read turns every physical line into its own row.
dbutils.fs.put(rawPath, json.dumps(movies, indent=2), True)

In [0]:
# List rawPath to confirm the combined JSON file written above is present.
display(dbutils.fs.ls(rawPath))

path,name,size,modificationTime
dbfs:/dbfs/FileStore/movie/raw,raw,12795544,1661275260000


In [0]:
# Recursively delete anything already stored under bronzePath. The bronze write
# later in this notebook uses mode("append"), so this presumably keeps a re-run
# from accumulating duplicate rows.
dbutils.fs.rm(bronzePath, recurse=True)

In [0]:

# One-column schema: every line of the raw file lands as a string in `value`.
kafka_schema = "value STRING"

# Configure a text reader with that schema, then load the raw file with it.
raw_text_reader = spark.read.schema(kafka_schema).format("text")
movie_data_df = raw_text_reader.load(rawPath)

In [0]:
# Preview the raw lines that were loaded into the single `value` column.
display(movie_data_df)

value
{
"""movie"": ["
[
{
"""Id"": 1,"
"""Title"": ""Inception"","
"""Overview"": ""Cobb, a skilled thief who commits corporate espionage by infiltrating the subconscious of his targets is offered a chance to regain his old life as payment for a task considered to be impossible: \""inception\"", the implantation of another person's idea into a target's subconscious."","
"""Tagline"": ""Your mind is the scene of the crime."","
"""Budget"": 160000000.0,"
"""Revenue"": 825532764.0,"


In [0]:
from pyspark.sql.functions import current_timestamp, lit

# Attach ingestion metadata next to the untouched raw `value` line.
# NOTE(review): the datasource value is a hard-coded literal rather than being
# derived from rawPath; confirm the two are meant to stay in sync.
ingest_ts = current_timestamp()
bronze_metadata_columns = [
    "value",
    lit("dbfs:/dbfs/FileStore/movie/raw").alias("datasource"),
    ingest_ts.alias("ingesttime"),
    lit("new").alias("status"),
    ingest_ts.cast("date").alias("ingestdate"),
]
movie_data_df = movie_data_df.select(*bronze_metadata_columns)

In [0]:
from pyspark.sql.functions import col

# Put the columns in bronze order and rename the ingest date to the partition
# column name used on disk.
bronze_df = movie_data_df.select(
    "datasource",
    "ingesttime",
    "value",
    "status",
    col("ingestdate").alias("p_ingestdate"),
)

# Append to the Delta table at bronzePath, partitioned by ingest date.
bronze_writer = (
    bronze_df.write
    .format("delta")
    .mode("append")
    .partitionBy("p_ingestdate")
)
bronze_writer.save(bronzePath)

In [0]:
# List bronzePath to confirm the Delta write produced output there.
display(dbutils.fs.ls(bronzePath))

path,name,size,modificationTime
dbfs:/dbfs/FileStore/movie/bronze/_delta_log/,_delta_log/,0,1661276013000
dbfs:/dbfs/FileStore/movie/bronze/p_ingestdate=2022-08-23/,p_ingestdate=2022-08-23/,0,1661276002000


In [0]:
# Re-register the bronze Delta files as the table `movie_bronze`.
# Drop any existing registration first so the CREATE below cannot fail with
# "table already exists". No trailing ';' in the statement text: spark.sql()
# takes exactly one statement, and older Spark parsers reject the terminator.
spark.sql("DROP TABLE IF EXISTS movie_bronze")

# Create the table over the files already written at bronzePath.
spark.sql(f"""
CREATE TABLE movie_bronze
USING DELTA
LOCATION "{bronzePath}"
""")

In [0]:
%sql
-- Inspect the bronze table, naming its columns explicitly in stored order.
SELECT
  datasource,
  ingesttime,
  value,
  status,
  p_ingestdate
FROM movie_bronze

datasource,ingesttime,value,status,p_ingestdate
dbfs:/dbfs/FileStore/movie/raw,2022-08-23T17:33:19.501+0000,{,new,2022-08-23
dbfs:/dbfs/FileStore/movie/raw,2022-08-23T17:33:19.501+0000,"""movie"": [",new,2022-08-23
dbfs:/dbfs/FileStore/movie/raw,2022-08-23T17:33:19.501+0000,[,new,2022-08-23
dbfs:/dbfs/FileStore/movie/raw,2022-08-23T17:33:19.501+0000,{,new,2022-08-23
dbfs:/dbfs/FileStore/movie/raw,2022-08-23T17:33:19.501+0000,"""Id"": 1,",new,2022-08-23
dbfs:/dbfs/FileStore/movie/raw,2022-08-23T17:33:19.501+0000,"""Title"": ""Inception"",",new,2022-08-23
dbfs:/dbfs/FileStore/movie/raw,2022-08-23T17:33:19.501+0000,"""Overview"": ""Cobb, a skilled thief who commits corporate espionage by infiltrating the subconscious of his targets is offered a chance to regain his old life as payment for a task considered to be impossible: \""inception\"", the implantation of another person's idea into a target's subconscious."",",new,2022-08-23
dbfs:/dbfs/FileStore/movie/raw,2022-08-23T17:33:19.501+0000,"""Tagline"": ""Your mind is the scene of the crime."",",new,2022-08-23
dbfs:/dbfs/FileStore/movie/raw,2022-08-23T17:33:19.501+0000,"""Budget"": 160000000.0,",new,2022-08-23
dbfs:/dbfs/FileStore/movie/raw,2022-08-23T17:33:19.501+0000,"""Revenue"": 825532764.0,",new,2022-08-23


In [0]:
%sql
-- Show only the bronze rows whose raw line matches the pattern 'Inception'.
SELECT
  datasource,
  ingesttime,
  value,
  status,
  p_ingestdate
FROM movie_bronze
WHERE value RLIKE 'Inception'

datasource,ingesttime,value,status,p_ingestdate
dbfs:/dbfs/FileStore/movie/raw,2022-08-23T17:33:19.501+0000,"""Title"": ""Inception: The Cobol Job"",",new,2022-08-23
dbfs:/dbfs/FileStore/movie/raw,2022-08-23T17:33:19.501+0000,"""Overview"": ""The Cobol Job is a fourteen-minute animated prequel to Christopher Nolan\u2019s award-winning movie: Inception, detailing the heist on Mr. Kaneda's mind by Nash, Cobb, Arthur, and several Cobol Engineering thugs."",",new,2022-08-23
dbfs:/dbfs/FileStore/movie/raw,2022-08-23T17:33:19.501+0000,"""Title"": ""Inception"",",new,2022-08-23
