### Ingesta del archivo "genre.csv"

In [0]:
# Notebook parameter "p_enviroment": declared as a text widget with an empty
# default; whatever value the widget currently holds is read into v_enviroment.
# v_enviroment is later written as a literal column on the output DataFrame.
dbutils.widgets.text("p_enviroment","")
v_enviroment = dbutils.widgets.get("p_enviroment")

In [0]:
# Notebook parameter "p_file_date": text widget defaulting to "2024-12-16".
# The value is read into v_file_date, which is used both as a folder name in
# the input CSV path and as a literal column on the output DataFrame.
dbutils.widgets.text("p_file_date","2024-12-16")
v_file_date = dbutils.widgets.get("p_file_date")

In [0]:
%run "../includes/common_functions"

In [0]:
%run "../includes/configuration"

#### Paso 1 - Leer el archivo CSV usando "DataFrameReader" de Spark

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

In [0]:
# Explicit schema for genre.csv: an integer id declared non-nullable and a
# nullable string name.
genre_schema = (
    StructType()
    .add("genreId", IntegerType(), False)
    .add("genreName", StringType(), True)
)

In [0]:
# Location of the input file: <bronze folder>/<file date>/genre.csv.
# bronze_folder_path is not defined in this notebook — presumably it comes
# from the "../includes/configuration" notebook run above; confirm there.
genre_csv_path = f"{bronze_folder_path}/{v_file_date}/genre.csv"

# Read the CSV with a header row, applying the explicit schema instead of
# letting Spark infer column types.
genre_df = (
    spark.read
    .schema(genre_schema)
    .option("header", True)
    .csv(genre_csv_path)
)


#### Paso 2 - Seleccionar solo las columnas "requeridas"

In [0]:
from pyspark.sql.functions import col

In [0]:
# Keep only the two columns needed downstream, referenced by name.
genre_selected_df = genre_df.select("genreId", "genreName")

#### Paso 3 - Renombrar las columnas

In [0]:
# Rename the camelCase source columns to snake_case, one column per call.
genre_renamed_df = (
    genre_selected_df
    .withColumnRenamed("genreId", "genre_id")
    .withColumnRenamed("genreName", "genre_name")
)

#### Paso 4 - Agregar las columnas "ingestion_date", "enviroment" y "file_date" al DataFrame

In [0]:
from pyspark.sql.functions import current_timestamp, lit

In [0]:
# add_ingestion_date is not defined in this notebook — presumably it comes
# from "../includes/common_functions" and adds the ingestion timestamp
# column; confirm against that notebook.
genre_with_ingestion_df = add_ingestion_date(genre_renamed_df)

# Stamp every row with the run parameters as constant (literal) columns.
genre_final_df = (
    genre_with_ingestion_df
    .withColumn("enviroment", lit(v_enviroment))
    .withColumn("file_date", lit(v_file_date))
)

#### Paso 5 - Escribir en el DataLake en formato Delta

In [0]:
# Persist the result as the Delta table "movie_silver.genre", replacing any
# existing contents of that table on every run.
genre_writer = genre_final_df.write.format("delta").mode("overwrite")
genre_writer.saveAsTable("movie_silver.genre")

In [0]:
%fs
ls /mnt/moviehistory9/silver/genre

In [0]:
# End the notebook run and hand the string below back to the caller as the
# notebook's exit value.
# NOTE(review): the literal is spelled "Succes!" (single trailing "s"). It is
# left unchanged because a calling notebook or job may compare against this
# exact value — confirm with callers before correcting the spelling.
dbutils.notebook.exit("Succes!")