# Configuración Inicial
Métodos un utils en común en los notebooks

In [17]:
# Imports 
from pyspark.sql.types import DateType, IntegerType
from pyspark.sql import functions as f, SparkSession
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql.functions import col


In [3]:
class MySQLConnector:
    def __init__(self, spark: SparkSession, connection_properties: dict, url: str):
        self.spark = spark
        self.properties = connection_properties
        self.url = url

    def get_dataframe(self, sql_query: str):        
        df = self.spark.read.jdbc(
            url=self.url,
            table=sql_query,
            properties=self.properties
        )
        return df
    
    def save_db(self, df, tabla):
        df.write.jdbc(
            url=self.url,
            table=tabla,
            mode='append',
            properties=self.properties
        )
        
def create_spark_session(path_jar_driver):    
    conf = SparkConf().set('spark.driver.extraClassPath', path_jar_driver)
    spark_context = SparkContext(conf=conf)
    sql_context = SQLContext(spark_context)
    return sql_context.sparkSession    

def get_dataframe_from_csv(_PATH, _sep):
    return spark.read.load(_PATH, format="csv", sep=_sep, inferSchema="true", header='true')

In [5]:
db_user = 'Estudiante_65_202415'
db_psswd = 'Estudiante_202010409'

connection_properties = {
    "user": db_user,
    "password": db_psswd,
    "driver": "com.mysql.cj.jdbc.Driver"
}

source_db_string_connection = 'jdbc:mysql://157.253.236.120:8080/RaSaTransaccional_ETL'
destination_db_string_connection = f'jdbc:mysql://157.253.236.120:8080/{db_user}'

# Driver de conexion
# LINUX
path_jar_driver = '/opt/mysql/lib/mysql-connector-java-8.0.28.jar'
# WINDOWS como esta en la VM 
#path_jar_driver = 'C:\Program Files (x86)\MySQL\Connector J 8.0\mysql-connector-java-8.0.28.jar'

In [6]:
spark = create_spark_session(path_jar_driver)

24/11/20 22:00:59 WARN Utils: Your hostname, willp resolves to a loopback address: 127.0.1.1; using 192.168.0.6 instead (on interface enp8s0)
24/11/20 22:00:59 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/20 22:01:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/11/20 22:01:01 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [7]:
conn_orig = MySQLConnector(spark=spark, connection_properties=connection_properties, url=source_db_string_connection)
conn_dest = MySQLConnector(spark=spark, connection_properties=connection_properties, url=destination_db_string_connection)

# Dimensión Fecha

Fecha es importante para llevar la historia de las tablas hechos.

![Modelo Movimientos](./images/Fecha.png)

### Extraction

Se hace un formateo de las fechas usando SQL para estandarizar la fecha.

In [8]:
#EXTRACCION
sql_move_date = '''
(
SELECT DISTINCT
    CASE
        WHEN Fecha REGEXP '^[0-9]{4}-[0-9]{2}-[0-9]{2}$'
            THEN DATE_FORMAT(STR_TO_DATE(TRIM(Fecha), '%Y-%m-%d %H:%i:%s.%f'), '%Y-%m-%d')
        WHEN Fecha REGEXP '^[A-Za-z]{3} [0-9]{1,2},[0-9]{4}$'
            THEN DATE_FORMAT(STR_TO_DATE(TRIM(Fecha), '%b %d,%Y'), '%Y-%m-%d')
        ELSE concat('Invalid Format: ',Fecha)
        END AS Fecha
FROM FuentePlanesBeneficio_ETL
) AS Fecha
'''
df_move_date = conn_orig.get_dataframe(sql_move_date)
df_supplier_move_date = df_move_date.withColumn('Fecha', col('Fecha').cast(DateType()))
df_supplier_move_date.show(5)

[Stage 0:>                                                          (0 + 1) / 1]

+----------+
|     Fecha|
+----------+
|2017-12-31|
|2019-12-31|
|2020-12-31|
|2021-12-31|
|2018-12-31|
+----------+



                                                                                

## Transformation


Adicionar las columnas de la dimension

In [9]:
df_supplier_move_date = df_supplier_move_date.withColumn(
    "IdFecha", f.date_format("Fecha", "yyyyMMdd").cast("int")
).withColumn(
    "Dia", f.dayofmonth("Fecha").cast("int")
).withColumn(
    "Mes", f.month("Fecha").cast("int")
).withColumn(
    "Annio", f.year("Fecha").cast("int"))

In [11]:
df_supplier_move_date.printSchema()

root
 |-- Fecha: date (nullable = true)
 |-- IdFecha: integer (nullable = true)
 |-- Dia: integer (nullable = true)
 |-- Mes: integer (nullable = true)
 |-- Annio: integer (nullable = true)



Adicionar comodín


In [22]:
dates_data = [( "9999-12-31", 99991231, 31, 12, 9999)]
dates_columns = ["Fecha", "IdFecha", "Dia", "Mes", "Annio"]
dummy_dates = spark.createDataFrame(dates_data, dates_columns)

dummy_dates = dummy_dates.withColumn('IdFecha', col('IdFecha').cast(IntegerType())) \
    .withColumn('Fecha', col('Fecha').cast(DateType())) \
    .withColumn('Dia', col('Dia').cast(IntegerType())) \
    .withColumn('Mes', col('Mes').cast(IntegerType())) \
    .withColumn('Annio', col('Annio').cast(IntegerType()))
dummy_dates.show()
dummy_dates.printSchema()


+----------+--------+---+---+-----+
|     Fecha| IdFecha|Dia|Mes|Annio|
+----------+--------+---+---+-----+
|9999-12-31|99991231| 31| 12| 9999|
+----------+--------+---+---+-----+

root
 |-- Fecha: date (nullable = true)
 |-- IdFecha: integer (nullable = true)
 |-- Dia: integer (nullable = true)
 |-- Mes: integer (nullable = true)
 |-- Annio: integer (nullable = true)



In [23]:
df_supplier_move_date = dummy_dates.union(df_supplier_move_date)
df_supplier_move_date.show(40)



+----------+--------+---+---+-----+
|     Fecha| IdFecha|Dia|Mes|Annio|
+----------+--------+---+---+-----+
|9999-12-31|99991231| 31| 12| 9999|
|2017-12-31|20171231| 31| 12| 2017|
|2019-12-31|20191231| 31| 12| 2019|
|2020-12-31|20201231| 31| 12| 2020|
|2021-12-31|20211231| 31| 12| 2021|
|2018-12-31|20181231| 31| 12| 2018|
+----------+--------+---+---+-----+



                                                                                

In [24]:
df_supplier_move_date.orderBy(col('IdFecha'))

DataFrame[Fecha: date, IdFecha: int, Dia: int, Mes: int, Annio: int]

## Load


Persistencia de la dimension.

In [25]:
conn_dest.save_db(df_supplier_move_date, "Rs_Fecha")

                                                                                