# Configuración Inicial
Métodos un utils en común en los notebooks

In [1]:
# Imports 
from pyspark.sql.types import IntegerType, StringType
from pyspark.sql import functions as f, SparkSession
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql.functions import when, col


In [2]:
class MySQLConnector:
    def __init__(self, spark: SparkSession, connection_properties: dict, url: str):
        self.spark = spark
        self.properties = connection_properties
        self.url = url

    def get_dataframe(self, sql_query: str):        
        df = self.spark.read.jdbc(
            url=self.url,
            table=sql_query,
            properties=self.properties
        )
        return df
    
    def save_db(self, df, tabla):
        df.write.jdbc(
            url=self.url,
            table=tabla,
            mode='append',
            properties=self.properties
        )
        
def create_spark_session(path_jar_driver):    
    conf = SparkConf().set('spark.driver.extraClassPath', path_jar_driver)
    spark_context = SparkContext(conf=conf)
    sql_context = SQLContext(spark_context)
    return sql_context.sparkSession    

def get_dataframe_from_csv(_PATH, _sep):
    return spark.read.load(_PATH, format="csv", sep=_sep, inferSchema="true", header='true')

In [15]:
db_user = 'Estudiante_65_202415'
db_psswd = 'Estudiante_202010409'

connection_properties = {
    "user": db_user,
    "password": db_psswd,
    "driver": "com.mysql.cj.jdbc.Driver"
}

source_db_string_connection = 'jdbc:mysql://157.253.236.120:8080/RaSaTransaccional_ETL'
destination_db_string_connection = f'jdbc:mysql://157.253.236.120:8080/{db_user}'

# jdbc:mysql://157.253.236.120:8080/WWImporters_DWH_tablero

# Driver de conexion
# LINUX
path_jar_driver = '/opt/mysql/lib/mysql-connector-java-8.0.28.jar'
# WINDOWS
#path_jar_driver = 'C:\Users\Rodolfo\OneDrive\Maestria MIAD\Semestre 1\Ciclo 2\Modelado de Datos y ETL\Semana 2\mysql-connector-j-9.0.0.jar'
# WINDOWS como esta en la VM 
#path_jar_driver = 'C:\Program Files (x86)\MySQL\Connector J 8.0\mysql-connector-java-8.0.28.jar'

In [4]:
spark = create_spark_session(path_jar_driver)

24/11/18 18:03:22 WARN Utils: Your hostname, willp resolves to a loopback address: 127.0.1.1; using 192.168.0.6 instead (on interface enp8s0)
24/11/18 18:03:22 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/18 18:03:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [16]:
conn_orig = MySQLConnector(spark=spark, connection_properties=connection_properties, url=source_db_string_connection)
conn_dest = MySQLConnector(spark=spark, connection_properties=connection_properties, url=destination_db_string_connection)

# Dimensión Condiciones de Pago

![Modelo Movimientos](./images/CondPago.png)

### Extraction

In [17]:
sql_Rs_CondicionDePago = '''
(
SELECT DISTINCT 
    IdCondicionesDePago_T AS IdCondicionDePago_T, 
    Descripcion, 
    Tipo 
FROM FuenteCondicionesDePago_ETL
ORDER BY IdCondicionDePago_T
) AS CondicionDePago
'''

df_Rs_CondicionDePago = conn_orig.get_dataframe(sql_Rs_CondicionDePago)
df_Rs_CondicionDePago = df_Rs_CondicionDePago \
    .withColumn('IdCondicionDePago_T', col('IdCondicionDePago_T').cast(IntegerType())) \
    .withColumn('Descripcion', col('Descripcion').cast(StringType())) \
    .withColumn('Tipo', col('Tipo').cast(StringType())) \
    .orderBy("IdCondicionDePago_T")


In [18]:
df_Rs_CondicionDePago.show(40)

+-------------------+--------------------+-----------+
|IdCondicionDePago_T|         Descripcion|       Tipo|
+-------------------+--------------------+-----------+
|                  9|           No Charge|Coseguridad|
|                 17|Copay per Day aft...|   Copagado|
|                 18|No Charge after d...|   Coseguro|
|                 27|Coinsurance after...|   Coseguro|
|                 34|      Not Applicable|   Copagado|
|                 36|      Not Applicable|   Coseguro|
|                 45|         Coinsurance|   Coseguro|
|                 51|           No Charge|     Copago|
|                 68|Copay per Stay af...|     Copago|
|                 85|Copay per Day bef...|     Copago|
|                102|Copay per Stay be...|     Copago|
|                119|Copay per Day wit...|     Copago|
|                136|Copay per Stay wi...|   Copagado|
|                153|Copay after deduc...|     Copago|
|                170|Copay before dedu...|     Copago|
|         

## Transformation

Realizar las transformaciones en la variable "Tipo" para que únicamente que valores Copago y Coseguro 

In [19]:
# Realizar las transformaciones en la columna "Tipo"
df_Rs_CondicionDePago = df_Rs_CondicionDePago.withColumn(
    "Tipo",
    when(col("Tipo") == "Copagado", "Copago")
    .when(col("Tipo") == "Coseguridad", "Coseguro")
    .otherwise(col("Tipo"))
)

# Mostrar los resultados para verificar las transformaciones
df_Rs_CondicionDePago.show(40)

+-------------------+--------------------+--------+
|IdCondicionDePago_T|         Descripcion|    Tipo|
+-------------------+--------------------+--------+
|                  9|           No Charge|Coseguro|
|                 17|Copay per Day aft...|  Copago|
|                 18|No Charge after d...|Coseguro|
|                 27|Coinsurance after...|Coseguro|
|                 34|      Not Applicable|  Copago|
|                 36|      Not Applicable|Coseguro|
|                 45|         Coinsurance|Coseguro|
|                 51|           No Charge|  Copago|
|                 68|Copay per Stay af...|  Copago|
|                 85|Copay per Day bef...|  Copago|
|                102|Copay per Stay be...|  Copago|
|                119|Copay per Day wit...|  Copago|
|                136|Copay per Stay wi...|  Copago|
|                153|Copay after deduc...|  Copago|
|                170|Copay before dedu...|  Copago|
|                187|Copay with deduct...|  Copago|
|           

Incluir la variable de la base de datos IdCondicionDePago_DWH y organizar la estructura de la tabla

Adicionar comodin cuando la referencia no exista

In [20]:
df_Rs_CondicionDePago = df_Rs_CondicionDePago.coalesce(1).withColumn('IdCondicionDePago_DWH', f.monotonically_increasing_id() + 1)
df_Rs_CondicionDePago.show(40)

+-------------------+--------------------+--------+---------------------+
|IdCondicionDePago_T|         Descripcion|    Tipo|IdCondicionDePago_DWH|
+-------------------+--------------------+--------+---------------------+
|                  9|           No Charge|Coseguro|                    1|
|                 17|Copay per Day aft...|  Copago|                    2|
|                 18|No Charge after d...|Coseguro|                    3|
|                 27|Coinsurance after...|Coseguro|                    4|
|                 34|      Not Applicable|  Copago|                    5|
|                 36|      Not Applicable|Coseguro|                    6|
|                 45|         Coinsurance|Coseguro|                    7|
|                 51|           No Charge|  Copago|                    8|
|                 68|Copay per Stay af...|  Copago|                    9|
|                 85|Copay per Day bef...|  Copago|                   10|
|                102|Copay per Stay be

In [21]:
cond_data = [(0, "Missing", "Missing", 0)]
cond_columns = ["IdCondicionDePago_T", "Descripcion", "Tipo", "IdCondicionDePago_DWH"]
dummy_cond_pago = spark.createDataFrame(cond_data, cond_columns)

df_Rs_CondicionDePago = dummy_cond_pago.union(df_Rs_CondicionDePago)
df_Rs_CondicionDePago.show(40)


                                                                                

+-------------------+--------------------+--------+---------------------+
|IdCondicionDePago_T|         Descripcion|    Tipo|IdCondicionDePago_DWH|
+-------------------+--------------------+--------+---------------------+
|                  0|             Missing| Missing|                    0|
|                  9|           No Charge|Coseguro|                    1|
|                 17|Copay per Day aft...|  Copago|                    2|
|                 18|No Charge after d...|Coseguro|                    3|
|                 27|Coinsurance after...|Coseguro|                    4|
|                 34|      Not Applicable|  Copago|                    5|
|                 36|      Not Applicable|Coseguro|                    6|
|                 45|         Coinsurance|Coseguro|                    7|
|                 51|           No Charge|  Copago|                    8|
|                 68|Copay per Stay af...|  Copago|                    9|
|                 85|Copay per Day bef

## Load

In [10]:
df_Rs_CondicionDePago.printSchema()

root
 |-- IdCondicionDePago_T: integer (nullable = true)
 |-- Descripcion: string (nullable = true)
 |-- Tipo: string (nullable = true)
 |-- IdCondicionDePago_DWH: long (nullable = false)



In [22]:
df_Rs_CondicionDePago = df_Rs_CondicionDePago \
    .withColumn('IdCondicionDePago_DWH', col('IdCondicionDePago_DWH').cast(IntegerType())) \
    .withColumn('IdCondicionDePago_T', col('IdCondicionDePago_T').cast(IntegerType())) \
    .withColumn('Descripcion', col('Descripcion').cast(StringType())) \
    .withColumn('Tipo', col('Tipo').cast(StringType())) 
df_Rs_CondicionDePago.printSchema()


root
 |-- IdCondicionDePago_T: integer (nullable = true)
 |-- Descripcion: string (nullable = true)
 |-- Tipo: string (nullable = true)
 |-- IdCondicionDePago_DWH: integer (nullable = true)



In [24]:
df_Rs_CondicionDePago.count()

21

In [23]:
# Cargue de la dimension
conn_dest.save_db(df_Rs_CondicionDePago, "Rs_CondicionDePago")

# Dimensión Proveedor

### Extraction
Aplicando el distinct se garantiza la unicidad

In [53]:
sql_Rs_Proveedor = "(SELECT DISTINCT IdProveedor_T FROM FuentePlanesBeneficio_ETL) AS Rs_Proveedor"

df_Rs_Proveedor = conn_orig.get_dataframe(sql_Rs_Proveedor)
df_Rs_Proveedor.withColumn('IdProveedor_T', col('IdProveedor_T').cast(IntegerType()))

DataFrame[IdProveedor_T: int]

In [54]:
df_Rs_Proveedor.show()

+-------------+
|IdProveedor_T|
+-------------+
|        16842|
|        14002|
|        19722|
|        81413|
|        52697|
|        28162|
|        20129|
|        40572|
|        38166|
|        70893|
|        25268|
|        36096|
|        95185|
|        93078|
|        66252|
|        27248|
|        47840|
|        30751|
|        49046|
|        48121|
+-------------+
only showing top 20 rows



In [48]:
df_Rs_Proveedor.printSchema()

root
 |-- IdProveedor_T: integer (nullable = true)



### Seleccionar los valores únicos de IdProveedor_T

In [55]:
df_Rs_Proveedor = df_Rs_Proveedor.select("IdProveedor_T").distinct().orderBy("IdProveedor_T")

df_Rs_Proveedor.show()

+-------------+
|IdProveedor_T|
+-------------+
|        10207|
|        11269|
|        11469|
|        11512|
|        12303|
|        12379|
|        12858|
|        14002|
|        14609|
|        15560|
|        15833|
|        16322|
|        16842|
|        16985|
|        18239|
|        18350|
|        19636|
|        19722|
|        20069|
|        20129|
+-------------+
only showing top 20 rows



In [56]:
print((df_Rs_Proveedor.count(), len(df_Rs_Proveedor.columns)))

(171, 1)


In [57]:
df_Rs_Proveedor = df_Rs_Proveedor.coalesce(1).withColumn('IdProveedor_DWH', f.monotonically_increasing_id() + 1)
df_Rs_Proveedor = df_Rs_Proveedor.select("IdProveedor_DWH", "IdProveedor_T")
df_Rs_Proveedor.show()

+---------------+-------------+
|IdProveedor_DWH|IdProveedor_T|
+---------------+-------------+
|              1|        10207|
|              2|        11269|
|              3|        11469|
|              4|        11512|
|              5|        12303|
|              6|        12379|
|              7|        12858|
|              8|        14002|
|              9|        14609|
|             10|        15560|
|             11|        15833|
|             12|        16322|
|             13|        16842|
|             14|        16985|
|             15|        18239|
|             16|        18350|
|             17|        19636|
|             18|        19722|
|             19|        20069|
|             20|        20129|
+---------------+-------------+
only showing top 20 rows



In [60]:
proveedor_data = [(0, 0)]
proveedor_columns = ["IdProveedor_T", "IdProveedor_DWH"]
dummy_proveedor = spark.createDataFrame(proveedor_data, proveedor_columns)                          

df_Rs_Proveedor = dummy_proveedor.union(df_Rs_Proveedor)
df_Rs_Proveedor.show(40)

+-------------+---------------+
|IdProveedor_T|IdProveedor_DWH|
+-------------+---------------+
|            0|              0|
|            1|          10207|
|            2|          11269|
|            3|          11469|
|            4|          11512|
|            5|          12303|
|            6|          12379|
|            7|          12858|
|            8|          14002|
|            9|          14609|
|           10|          15560|
|           11|          15833|
|           12|          16322|
|           13|          16842|
|           14|          16985|
|           15|          18239|
|           16|          18350|
|           17|          19636|
|           18|          19722|
|           19|          20069|
|           20|          20129|
|           21|          20305|
|           22|          20507|
|           23|          21663|
|           24|          21989|
|           25|          22444|
|           26|          23426|
|           27|          23552|
|       

## Load

In [61]:
# CARGUE
conn_dest.save_db(df_Rs_Proveedor, "Rs_Proveedor")

## Dimensión NivelesDeServicio

### Extraction

In [64]:
sql_Rs_NivelesDeServicio = 'RaSaTransaccional_ETL.NivelesDeServicio'

df_Rs_NivelesDeServicio = conn_orig.get_dataframe(sql_Rs_NivelesDeServicio)
df_Rs_NivelesDeServicio = df_Rs_NivelesDeServicio \
    .withColumn('IdNivelDeServicio_DWH', col('IdNivelDeServicio_DWH').cast(IntegerType())) \
    .withColumn('IdNivelDeServicio_T', col('IdNivelDeServicio_T').cast(IntegerType())) \
    .withColumn('Descripcion', col('Descripcion').cast(StringType()))

In [65]:
df_Rs_NivelesDeServicio.show()

+---------------------+-------------------+---------------+
|IdNivelDeServicio_DWH|IdNivelDeServicio_T|    Descripcion|
+---------------------+-------------------+---------------+
|                    1|                  1|        Nivel 1|
|                    2|                  2|        Nivel 2|
|                    3|                  3|Fuera de la red|
+---------------------+-------------------+---------------+



## Transformation

In [66]:
nivel_data = [(0, 0, "Missing")]
nivel_columns = ["IdNivelDeServicio_DWH", "IdNivelDeServicio_T", "Descripcion"]
dummy_nivel = spark.createDataFrame(nivel_data, nivel_columns)                          

df_Rs_NivelesDeServicio = dummy_nivel.union(df_Rs_NivelesDeServicio)
df_Rs_NivelesDeServicio.show(40)

+---------------------+-------------------+---------------+
|IdNivelDeServicio_DWH|IdNivelDeServicio_T|    Descripcion|
+---------------------+-------------------+---------------+
|                    0|                  0|        Missing|
|                    1|                  1|        Nivel 1|
|                    2|                  2|        Nivel 2|
|                    3|                  3|Fuera de la red|
+---------------------+-------------------+---------------+



No existen Transformaciones

## Load

In [67]:
# CARGUE
conn_dest.save_db(df_Rs_NivelesDeServicio, "Rs_NivelesDeServicio")  