### INGESTION DEL ARCHIVO JSON movie_genre

In [0]:
# Declare the text widget "p_enviroment" (empty default) and read its current
# value into v_enviroment; the value is later written to the "enviroment" column.
# NOTE(review): "enviroment" is misspelled, but it is a runtime widget/column
# name, so it is left untouched here.
dbutils.widgets.text("p_enviroment","")
v_enviroment = dbutils.widgets.get("p_enviroment")

In [0]:
# Declare the text widget "p_file_date" (default "2024-12-16") and read its
# current value into v_file_date. The value is used further down both as the
# sub-folder of the bronze path to read from and as the "file_date" column.
dbutils.widgets.text("p_file_date","2024-12-16")
v_file_date = dbutils.widgets.get("p_file_date")

In [0]:
%run "../includes/common_functions"

In [0]:
%run "../includes/configuration"

#### Paso 1 - Leer el archivo JSON usando "DataFrameReader" de Spark

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [0]:
# Explicit schema for movie_genre.json: two integer columns, both declared
# as non-nullable, in the order movieId, genreId.
# NOTE(review): confirm whether the JSON reader actually enforces
# nullable=False for a user-supplied schema.
movie_genre_schema = StructType(
    [
        StructField(column_name, IntegerType(), nullable=False)
        for column_name in ("movieId", "genreId")
    ]
)

In [0]:
# Read the movie_genre JSON file for the requested file date from the bronze
# layer, applying the explicit schema instead of letting Spark infer one.
movie_genre_df = (
    spark.read
    .schema(movie_genre_schema)
    .json(f"{bronze_folder_path}/{v_file_date}/movie_genre.json")
)

#### Paso 2 - Cambiar el nombre de las columnas y agregar las columnas "ingestion_date", "enviroment" y "file_date"

In [0]:
from pyspark.sql.functions import col, lit, current_timestamp

In [0]:
# Build the frame that will be written out:
#   1. add_ingestion_date(...) is applied first (helper defined outside this
#      cell; presumably in ../includes/common_functions -- verify),
#   2. the camelCase id columns are renamed to snake_case,
#   3. the widget values are attached as literal columns.
# The call order is kept as-is because it determines the final column order.
final_movie_genre_df = (
    add_ingestion_date(movie_genre_df)
    .withColumnRenamed("movieId", "movie_id")
    .withColumnRenamed("genreId", "genre_id")
    .withColumn("enviroment", lit(v_enviroment))
    .withColumn("file_date", lit(v_file_date))
)

#### Paso 3 - Escribir el DataFrame en el Data Lake mediante un merge en formato Delta

In [0]:
%sql
-- Show the current contents of movie_silver.movie_genre.
-- NOTE(review): this cell is placed before the merge cell, so it reads the
-- table as it was before this run; presumably it fails if the table does not
-- exist yet (e.g. on a first load) -- confirm whether it should stay.
SELECT * FROM movie_silver.movie_genre;

In [0]:
# Write the result into movie_silver.movie_genre through merge_delta_lake
# (helper defined outside this cell). An earlier version of this cell called
# overwrite_partition(final_movie_genre_df, "movie_silver", "movie_genre",
# "file_date") instead.
#
# Rows are matched between target (tgt) and source (src) on the pair of ids
# plus the file date.
merge_condition = (
    "tgt.movie_id = src.movie_id"
    " AND tgt.genre_id = src.genre_id"
    " AND tgt.file_date = src.file_date"
)

# NOTE(review): "silver_foler_path" looks like a misspelling of
# "silver_folder_path" (compare bronze_folder_path above). The name is kept
# because its definition is not visible from this notebook -- confirm against
# ../includes/configuration before renaming.
merge_delta_lake(
    final_movie_genre_df,
    "movie_silver",
    "movie_genre",
    silver_foler_path,
    merge_condition,
    "file_date",
)

In [0]:
%sql
-- Row count per file_date in movie_silver.movie_genre; this cell sits after
-- the merge cell, so it reflects the table once the merge has run.
SELECT COUNT(1), file_date 
FROM movie_silver.movie_genre
GROUP BY file_date

In [0]:
# End the notebook run and hand the string "Succes!" back to the caller.
# NOTE(review): the literal is misspelled, but it is left as-is because a
# calling notebook may compare against this exact value -- confirm before
# changing it.
dbutils.notebook.exit("Succes!")