# Configuración Inicial
Métodos y utilidades en común en los notebooks.

In [2]:
# Imports 
import os
from getpass import getpass

from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql import functions as f, SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import DateType


In [3]:
class MySQLConnector:
    """Small wrapper around Spark's JDBC reader and writer for one database URL.

    Every call reuses the session, URL and connection properties given at
    construction time.
    """

    def __init__(self, spark: "SparkSession", connection_properties: dict, url: str):
        """Remember how to reach the database.

        Args:
            spark: Session whose ``read.jdbc`` is used to load data.
            connection_properties: Passed unchanged as ``properties`` to the
                JDBC reader and writer.
            url: JDBC URL of the database.
        """
        self.spark = spark
        self.properties = connection_properties
        self.url = url

    def get_dataframe(self, sql_query: str):
        """Load a DataFrame through JDBC.

        Args:
            sql_query: Forwarded as the ``table`` argument of
                ``spark.read.jdbc``; callers in this notebook pass a
                parenthesised, aliased subquery.

        Returns:
            Whatever ``spark.read.jdbc`` returns for that table expression.
        """
        return self.spark.read.jdbc(
            url=self.url,
            table=sql_query,
            properties=self.properties,
        )

    def save_db(self, df, tabla, mode='append'):
        """Write ``df`` to table ``tabla`` through JDBC.

        Args:
            df: DataFrame to persist.
            tabla: Destination table name.
            mode: Save mode handed to ``df.write.jdbc``. Defaults to
                ``'append'`` (the previous hard-coded behaviour), which adds
                the rows again every time the call is repeated.
        """
        df.write.jdbc(
            url=self.url,
            table=tabla,
            mode=mode,
            properties=self.properties,
        )
        
def create_spark_session(path_jar_driver):
    """Return a SparkSession configured with the JDBC driver jar.

    Uses ``getOrCreate`` so that re-running the cell reuses the session that
    is already running instead of raising because a SparkContext exists.

    Args:
        path_jar_driver: Path to the JDBC driver jar, set as
            ``spark.driver.extraClassPath``.

    Returns:
        The active (or newly created) SparkSession.
    """
    conf = SparkConf().set('spark.driver.extraClassPath', path_jar_driver)
    # NOTE(review): if a session is already running, a different jar path is
    # presumably not picked up by the existing JVM; restart the kernel to change it.
    return SparkSession.builder.config(conf=conf).getOrCreate()

def get_dataframe_from_csv(_PATH, _sep, spark_session=None):
    """Read a CSV file with a header row into a DataFrame, inferring the schema.

    Args:
        _PATH: Location of the CSV file.
        _sep: Field separator.
        spark_session: Session to read with. When omitted, the current
            session is obtained with ``SparkSession.builder.getOrCreate()``,
            so the function no longer depends on a global ``spark`` that is
            only defined in a later cell.

    Returns:
        The DataFrame produced by ``read.load``.
    """
    if spark_session is None:
        spark_session = SparkSession.builder.getOrCreate()
    return spark_session.read.load(
        _PATH, format="csv", sep=_sep, inferSchema="true", header='true'
    )

In [4]:
# Credentials are taken from the environment so that no secret is stored in
# the notebook. The user name keeps its previous value as the default; the
# password has no default and is requested interactively when
# RASA_DB_PASSWORD is not set.
db_user = os.environ.get('RASA_DB_USER', 'Estudiante_65_202415')
db_psswd = os.environ.get('RASA_DB_PASSWORD') or getpass(f'MySQL password for {db_user}: ')

connection_properties = {
    "user": db_user,
    "password": db_psswd,
    "driver": "com.mysql.cj.jdbc.Driver"
}

# The destination schema is named after the database user.
source_db_string_connection = 'jdbc:mysql://157.253.236.120:8080/RaSaTransaccional_ETL'
destination_db_string_connection = f'jdbc:mysql://157.253.236.120:8080/{db_user}'

# JDBC driver jar
# Linux
path_jar_driver = '/opt/mysql/lib/mysql-connector-java-8.0.28.jar'
# Windows, as installed on the VM (raw string so the backslashes are not read as escapes)
#path_jar_driver = r'C:\Program Files (x86)\MySQL\Connector J 8.0\mysql-connector-java-8.0.28.jar'

In [5]:
# Build the Spark session used by the rest of the notebook, with the JDBC
# driver jar on the driver classpath.
spark = create_spark_session(path_jar_driver=path_jar_driver)

24/11/18 15:43:39 WARN Utils: Your hostname, willp resolves to a loopback address: 127.0.1.1; using 192.168.0.6 instead (on interface enp8s0)
24/11/18 15:43:39 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/18 15:43:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/11/18 15:43:40 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [6]:
# Both connectors share the session and credentials; only the JDBC URL differs.
# conn_orig reads from the source database, conn_dest writes to the destination one.
conn_orig = MySQLConnector(
    url=source_db_string_connection,
    spark=spark,
    connection_properties=connection_properties,
)
conn_dest = MySQLConnector(
    url=destination_db_string_connection,
    spark=spark,
    connection_properties=connection_properties,
)

# Dimensión Fecha

La dimensión Fecha es importante para llevar la historia de las tablas de hechos.

![Modelo Fecha](./images/Fecha.png)

## Extraction

Se formatean las fechas con SQL para estandarizarlas al formato `YYYY-MM-DD`.

In [7]:
# EXTRACTION
# Normalise every distinct Fecha value to 'YYYY-MM-DD' on the MySQL side.
# Two input layouts are recognised: 'YYYY-MM-DD' and 'Mon D,YYYY'.
# The patterns are tested against TRIM(Fecha), the same value that is parsed,
# and the first STR_TO_DATE format matches the date-only pattern guarding it.
sql_move_date = '''
(
SELECT DISTINCT
    CASE
        WHEN TRIM(Fecha) REGEXP '^[0-9]{4}-[0-9]{2}-[0-9]{2}$'
            THEN DATE_FORMAT(STR_TO_DATE(TRIM(Fecha), '%Y-%m-%d'), '%Y-%m-%d')
        WHEN TRIM(Fecha) REGEXP '^[A-Za-z]{3} [0-9]{1,2},[0-9]{4}$'
            THEN DATE_FORMAT(STR_TO_DATE(TRIM(Fecha), '%b %d,%Y'), '%Y-%m-%d')
        ELSE concat('Invalid Format: ',Fecha)
        END AS Fecha
FROM FuentePlanesBeneficio_ETL
) AS Fecha
'''
df_move_date = conn_orig.get_dataframe(sql_move_date)
# Values tagged 'Invalid Format: ...' (and NULLs) cannot be cast and become
# NULL dates; drop them so the dimension never receives rows without a date.
df_supplier_move_date = (
    df_move_date
    .withColumn('Fecha', col('Fecha').cast(DateType()))
    .dropna(subset=['Fecha'])
)
df_supplier_move_date.show(5)

+----------+
|     Fecha|
+----------+
|2017-12-31|
|2019-12-31|
|2020-12-31|
|2021-12-31|
|2018-12-31|
+----------+



## Transformation


Se adicionan las columnas de la dimensión: `IdFecha`, `Dia`, `Mes` y `Annio`.

In [8]:
# Derive the dimension attributes from Fecha:
#   IdFecha - integer key in yyyyMMdd form
#   Dia / Mes / Annio - day of month, month and year as integers
df_supplier_move_date = (
    df_supplier_move_date
    .withColumn("IdFecha", f.date_format("Fecha", "yyyyMMdd").cast("int"))
    .withColumn("Dia", f.dayofmonth("Fecha").cast("int"))
    .withColumn("Mes", f.month("Fecha").cast("int"))
    .withColumn("Annio", f.year("Fecha").cast("int"))
)

## Load


Persistencia de la dimensión en la tabla `Rs_Fecha`.

In [9]:
# Persist the date dimension. save_db appends, so running this cell again
# inserts the same rows a second time.
conn_dest.save_db(df=df_supplier_move_date, tabla="Rs_Fecha")