# ETL: Carga Areas de Servicio

In [170]:
# Imports 
from pyspark.sql.types import IntegerType, StringType, DateType, LongType
from pyspark.sql import functions as f, SparkSession, types as t
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql.functions import col, length

In [171]:
class MySQLConnector:
    def __init__(self, spark: SparkSession, connection_properties: dict, url: str):
        self.spark = spark
        self.properties = connection_properties
        self.url = url

    def get_dataframe(self, sql_query: str):        
        df = self.spark.read.jdbc(
            url=self.url,
            table=sql_query,
            properties=self.properties
        )
        return df
    
    def save_db(self, df, tabla):
        df.write.jdbc(
            url=self.url,
            table=tabla,
            mode='append',
            properties=self.properties
        )
        
def create_spark_session(path_jar_driver):    
    conf = SparkConf().set('spark.driver.extraClassPath', path_jar_driver)
    spark_context = SparkContext(conf=conf)
    sql_context = SQLContext(spark_context)
    return sql_context.sparkSession    

def get_dataframe_from_csv(_PATH, _sep):
    return spark.read.load(_PATH, format="csv", sep=_sep, inferSchema="true", header='true')

In [167]:
# LLENAR CON EL USUARIO DE CADA UNO
db_user = 'Estudiante_8_202415'
db_psswd = 'Estudiante_200723002'


connection_properties = {
    "user": db_user,
    "password": db_psswd,
    "driver": "com.mysql.cj.jdbc.Driver"
}

source_db_string_connection = 'jdbc:mysql://157.253.236.120:8080/RaSaTransaccional_ETL'
destination_db_string_connection = f'jdbc:mysql://157.253.236.120:8080/{db_user}'

# Driver de conexion
# LINUX
#path_jar_driver = '/opt/mysql/lib/mysql-connector-java-8.0.28.jar'
# WINDOWS
path_jar_driver = 'C:\Program Files (x86)\MySQL\Connector J 8.0\mysql-connector-java-8.0.28.jar'

In [172]:
spark = create_spark_session(path_jar_driver)

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=pyspark-shell, master=local[*]) created by __init__ at C:\Users\estudiante\AppData\Local\Temp\ipykernel_12468\3043745556.py:25 

In [173]:
conn_orig = MySQLConnector(spark=spark, connection_properties=connection_properties, url=source_db_string_connection)
conn_dest = MySQLConnector(spark=spark, connection_properties=connection_properties, url=destination_db_string_connection)

## Proceso de ETL para una dimensión.

![Modelo Movimientos](./images/AreasDeServicio.png)

## Cargar FuenteAreasDeServicio_ETL

In [174]:
# Extracción de todos los campos de FuenteAreasDeServicio_ETL
sql_areas_servicio_all_fields = '''
(SELECT *
 FROM RaSaTransaccional_ETL.FuenteAreasDeServicio_ETL) AS AreasDeServicio
'''

try:
    df_areas_servicio = conn_orig.get_dataframe(sql_areas_servicio_all_fields)
    print("Primeros 5 registros de FuenteAreasDeServicio_ETL:")
    df_areas_servicio.show(5, truncate=False)
except Exception as e:
    print("Error al extraer datos de FuenteAreasDeServicio_ETL:", e)


Primeros 5 registros de FuenteAreasDeServicio_ETL:
+------------------+----------------------------------------------+-------------+-----------------+----------+------------+-----+--------+-----+
|IdAreaDeServicio_T|NombreAreaDeServicio                          |IdGeografia_T|Condado          |Estado    |PoblacionAct|Area |Densidad|Fecha|
+------------------+----------------------------------------------+-------------+-----------------+----------+------------+-----+--------+-----+
|100622017         |New Jersey - Medical91661NJ2340003-0520174859 |34005        |Burlington County|New Jersey|464269      |805.0|577.0   |2017 |
|100722019         |New Jersey  - Medical91661NJ2340003-0520194597|34023        |Middlesex County |New Jersey|860807      |311.0|2768.0  |2019 |
|100922020         |New Jersey - Medical91661NJ2340003-0520204858 |34019        |Hunterdon County |New Jersey|129924      |430.0|302.0   |2020 |
|101012018         |New Jersey - Medical91661NJ2340003-0520184611 |34031       

# Areas de Servicio

## Extracción de datos de AreasDeServicio

In [175]:
# Extracción de datos de AreasDeServicio
sql_areas_servicio = '''
(SELECT DISTINCT 
    IdAreaDeServicio_T AS IdAreaDeServicio_DWH, 
    IdAreaDeServicio_T, 
    NombreAreaDeServicio AS Nombre, 
    Fecha AS AnnoCreacion 
 FROM RaSaTransaccional_ETL.FuenteAreasDeServicio_ETL) AS AreasDeServicio
'''

try:
    df_areas_servicio = conn_orig.get_dataframe(sql_areas_servicio)
    print("Primeros 5 registros de AreasDeServicio:")
    df_areas_servicio.show(5, truncate=False)
except Exception as e:
    print("Error durante la extracción de datos de AreasDeServicio:", e)


Primeros 5 registros de AreasDeServicio:
+--------------------+------------------+----------------------------------------------+------------+
|IdAreaDeServicio_DWH|IdAreaDeServicio_T|Nombre                                        |AnnoCreacion|
+--------------------+------------------+----------------------------------------------+------------+
|100622017           |100622017         |New Jersey - Medical91661NJ2340003-0520174859 |2017        |
|100722019           |100722019         |New Jersey  - Medical91661NJ2340003-0520194597|2019        |
|100922020           |100922020         |New Jersey - Medical91661NJ2340003-0520204858 |2020        |
|101012018           |101012018         |New Jersey - Medical91661NJ2340003-0520184611 |2018        |
|101062020           |101062020         |New Jersey - Medical91661NJ2340003-0520205037 |2020        |
+--------------------+------------------+----------------------------------------------+------------+
only showing top 5 rows



## Transformación 

### 1. Crear un identificador único y consecutivo 

In [176]:
from pyspark.sql.functions import monotonically_increasing_id

df_areas_servicio = df_areas_servicio.withColumn("IdAreaDeServicio_DWH", monotonically_increasing_id() + 1)

df_areas_servicio.show(5)


+--------------------+------------------+--------------------+------------+
|IdAreaDeServicio_DWH|IdAreaDeServicio_T|              Nombre|AnnoCreacion|
+--------------------+------------------+--------------------+------------+
|                   1|         100622017|New Jersey - Medi...|        2017|
|                   2|         100722019|New Jersey  - Med...|        2019|
|                   3|         100922020|New Jersey - Medi...|        2020|
|                   4|         101012018|New Jersey - Medi...|        2018|
|                   5|         101062020|New Jersey - Medi...|        2020|
+--------------------+------------------+--------------------+------------+
only showing top 5 rows



### 2. Validación de duplicados para IdAreaDeServicio_T

In [177]:
duplicate_count = df_areas_servicio.groupBy("IdAreaDeServicio_T").count().filter(col("count") > 1).count()
if duplicate_count > 0:
    print(f"Se encontraron {duplicate_count} duplicados en IdAreaDeServicio_T")
else:
    print("No se encontraron duplicados en IdAreaDeServicio_T")


No se encontraron duplicados en IdAreaDeServicio_T


### 3. Modificar Nombre para quitar espacios etc

In [178]:
from pyspark.sql.functions import trim, regexp_replace

df_areas_servicio = df_areas_servicio.withColumn(
    "Nombre",
    trim(regexp_replace(col("Nombre"), " {2,}", " "))
)

df_areas_servicio.show(5)

+--------------------+------------------+--------------------+------------+
|IdAreaDeServicio_DWH|IdAreaDeServicio_T|              Nombre|AnnoCreacion|
+--------------------+------------------+--------------------+------------+
|                   1|         100622017|New Jersey - Medi...|        2017|
|                   2|         100722019|New Jersey - Medi...|        2019|
|                   3|         100922020|New Jersey - Medi...|        2020|
|                   4|         101012018|New Jersey - Medi...|        2018|
|                   5|         101062020|New Jersey - Medi...|        2020|
+--------------------+------------------+--------------------+------------+
only showing top 5 rows



### 4. Revisar el tipo de datos 

In [179]:
df_areas_servicio.printSchema()

root
 |-- IdAreaDeServicio_DWH: long (nullable = false)
 |-- IdAreaDeServicio_T: integer (nullable = true)
 |-- Nombre: string (nullable = true)
 |-- AnnoCreacion: integer (nullable = true)



# Cargar los datos transformados

In [180]:
# Carga de los datos transformados
conn_dest.save_db(df_areas_servicio, 'Estudiante_8_202415.Rs_AreasDeServicio')

print("Datos cargados correctamente en la tabla 'Rs_AreasDeServicio'")


Py4JJavaError: An error occurred while calling o611.jdbc.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 10.0 failed 1 times, most recent failure: Lost task 0.0 in stage 10.0 (TID 7) (MISW--09682.sis.virtual.uniandes.edu.co executor driver): java.sql.BatchUpdateException: Duplicate entry '1000' for key 'Rs_AreasDeServicio.PRIMARY'
	at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
	at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
	at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
	at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
	at com.mysql.cj.util.Util.handleNewInstance(Util.java:192)
	at com.mysql.cj.util.Util.getInstance(Util.java:167)
	at com.mysql.cj.util.Util.getInstance(Util.java:174)
	at com.mysql.cj.jdbc.exceptions.SQLError.createBatchUpdateException(SQLError.java:224)
	at com.mysql.cj.jdbc.ClientPreparedStatement.executeBatchSerially(ClientPreparedStatement.java:853)
	at com.mysql.cj.jdbc.ClientPreparedStatement.executeBatchInternal(ClientPreparedStatement.java:435)
	at com.mysql.cj.jdbc.StatementImpl.executeBatch(StatementImpl.java:795)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.savePartition(JdbcUtils.scala:748)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.$anonfun$saveTable$1(JdbcUtils.scala:904)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.$anonfun$saveTable$1$adapted(JdbcUtils.scala:903)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2(RDD.scala:1039)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2$adapted(RDD.scala:1039)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.sql.SQLIntegrityConstraintViolationException: Duplicate entry '1000' for key 'Rs_AreasDeServicio.PRIMARY'
	at com.mysql.cj.jdbc.exceptions.SQLError.createSQLException(SQLError.java:117)
	at com.mysql.cj.jdbc.exceptions.SQLExceptionsMapping.translateException(SQLExceptionsMapping.java:122)
	at com.mysql.cj.jdbc.ClientPreparedStatement.executeInternal(ClientPreparedStatement.java:953)
	at com.mysql.cj.jdbc.ClientPreparedStatement.executeUpdateInternal(ClientPreparedStatement.java:1098)
	at com.mysql.cj.jdbc.ClientPreparedStatement.executeBatchSerially(ClientPreparedStatement.java:832)
	... 19 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2414)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2433)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2458)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$1(RDD.scala:1039)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.foreachPartition(RDD.scala:1037)
	at org.apache.spark.sql.Dataset.$anonfun$foreachPartition$1(Dataset.scala:3516)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.sql.Dataset.$anonfun$withNewRDDExecutionId$1(Dataset.scala:4310)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.Dataset.withNewRDDExecutionId(Dataset.scala:4308)
	at org.apache.spark.sql.Dataset.foreachPartition(Dataset.scala:3516)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.saveTable(JdbcUtils.scala:903)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:70)
	at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:48)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:75)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:73)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.executeCollect(commands.scala:84)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:437)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:85)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:83)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:142)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:869)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:391)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:364)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:251)
	at org.apache.spark.sql.DataFrameWriter.jdbc(DataFrameWriter.scala:766)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.sql.BatchUpdateException: Duplicate entry '1000' for key 'Rs_AreasDeServicio.PRIMARY'
	at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
	at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
	at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
	at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
	at com.mysql.cj.util.Util.handleNewInstance(Util.java:192)
	at com.mysql.cj.util.Util.getInstance(Util.java:167)
	at com.mysql.cj.util.Util.getInstance(Util.java:174)
	at com.mysql.cj.jdbc.exceptions.SQLError.createBatchUpdateException(SQLError.java:224)
	at com.mysql.cj.jdbc.ClientPreparedStatement.executeBatchSerially(ClientPreparedStatement.java:853)
	at com.mysql.cj.jdbc.ClientPreparedStatement.executeBatchInternal(ClientPreparedStatement.java:435)
	at com.mysql.cj.jdbc.StatementImpl.executeBatch(StatementImpl.java:795)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.savePartition(JdbcUtils.scala:748)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.$anonfun$saveTable$1(JdbcUtils.scala:904)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.$anonfun$saveTable$1$adapted(JdbcUtils.scala:903)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2(RDD.scala:1039)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2$adapted(RDD.scala:1039)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
Caused by: java.sql.SQLIntegrityConstraintViolationException: Duplicate entry '1000' for key 'Rs_AreasDeServicio.PRIMARY'
	at com.mysql.cj.jdbc.exceptions.SQLError.createSQLException(SQLError.java:117)
	at com.mysql.cj.jdbc.exceptions.SQLExceptionsMapping.translateException(SQLExceptionsMapping.java:122)
	at com.mysql.cj.jdbc.ClientPreparedStatement.executeInternal(ClientPreparedStatement.java:953)
	at com.mysql.cj.jdbc.ClientPreparedStatement.executeUpdateInternal(ClientPreparedStatement.java:1098)
	at com.mysql.cj.jdbc.ClientPreparedStatement.executeBatchSerially(ClientPreparedStatement.java:832)
	... 19 more


# Geografía 

### Extracción de datos de Geografia

In [182]:
# Extracción de datos de Geografia
sql_geografia = '''
(SELECT DISTINCT 
    IdGeografia_T AS IdGeografia_DWH, 
    IdGeografia_T, 
    Estado, 
    Condado, 
    Area AS AreaAct, 
    Densidad AS DensidadAct, 
    PoblacionAct 
 FROM RaSaTransaccional_ETL.FuenteAreasDeServicio_ETL) AS Geografia
'''

try:
    df_geografia = conn_orig.get_dataframe(sql_geografia)
    print("Primeros 5 registros de Geografia:")
    df_geografia.show(5, truncate=False)
except Exception as e:
    print("Error durante la extracción de datos de Geografia:", e)


Primeros 5 registros de Geografia:
+---------------+-------------+----------+-----------------+-------+-----------+------------+
|IdGeografia_DWH|IdGeografia_T|Estado    |Condado          |AreaAct|DensidadAct|PoblacionAct|
+---------------+-------------+----------+-----------------+-------+-----------+------------+
|34005          |34005        |New Jersey|Burlington County|805.0  |577.0      |464269      |
|34023          |34023        |New Jersey|Middlesex County |311.0  |2768.0     |860807      |
|34019          |34019        |New Jersey|Hunterdon County |430.0  |302.0      |129924      |
|34031          |34031        |New Jersey|Passaic County   |185.0  |2801.0     |518117      |
|34037          |34037        |New Jersey|Sussex County    |521.0  |279.0      |145543      |
+---------------+-------------+----------+-----------------+-------+-----------+------------+
only showing top 5 rows



## Transformation

### 1. Crear un identificador único y consecutivo 

In [183]:
from pyspark.sql.functions import monotonically_increasing_id

# Crear identificador único y consecutivo
df_geografia = df_geografia.withColumn("IdGeografia_DWH", monotonically_increasing_id() + 1)

# Validar los cambios
df_geografia.show(5, truncate=False)


+---------------+-------------+----------+-----------------+-------+-----------+------------+
|IdGeografia_DWH|IdGeografia_T|Estado    |Condado          |AreaAct|DensidadAct|PoblacionAct|
+---------------+-------------+----------+-----------------+-------+-----------+------------+
|1              |34005        |New Jersey|Burlington County|805.0  |577.0      |464269      |
|2              |34023        |New Jersey|Middlesex County |311.0  |2768.0     |860807      |
|3              |34019        |New Jersey|Hunterdon County |430.0  |302.0      |129924      |
|4              |34031        |New Jersey|Passaic County   |185.0  |2801.0     |518117      |
|5              |34037        |New Jersey|Sussex County    |521.0  |279.0      |145543      |
+---------------+-------------+----------+-----------------+-------+-----------+------------+
only showing top 5 rows



### 2. Eliminar áreas negativas -1

In [190]:
# Corregir los valores negativos en la columna AreaAct
from pyspark.sql.functions import when, col

df_geografia = df_geografia.withColumn(
    "AreaAct",
    when(col("AreaAct") < 0, col("AreaAct") * -1).otherwise(col("AreaAct"))
)

# Mostrar los primeros 5 registros transformados
df_geografia.show(5, truncate=False)


+---------------+-------------+----------+-----------------+-------+-----------+------------+
|IdGeografia_DWH|IdGeografia_T|Estado    |Condado          |AreaAct|DensidadAct|PoblacionAct|
+---------------+-------------+----------+-----------------+-------+-----------+------------+
|1              |34005        |New Jersey|Burlington County|805.0  |577.0      |464269      |
|2              |34023        |New Jersey|Middlesex County |311.0  |2768.0     |860807      |
|3              |34019        |New Jersey|Hunterdon County |430.0  |302.0      |129924      |
|4              |34031        |New Jersey|Passaic County   |185.0  |2801.0     |518117      |
|5              |34037        |New Jersey|Sussex County    |521.0  |279.0      |145543      |
+---------------+-------------+----------+-----------------+-------+-----------+------------+
only showing top 5 rows



In [197]:
from pyspark.sql.functions import col

# Filtrar las filas con áreas negativas
areas_negativas = df_geografia.filter(col("AreaAct") < 0)

# Contar el número de filas con áreas negativas
num_areas_negativas = areas_negativas.count()

# Imprimir resultados
if num_areas_negativas > 0:
    print(f"Se encontraron {num_areas_negativas} áreas negativas:")
    areas_negativas.show()
else:
    print("No se encontraron áreas negativas.")


No se encontraron áreas negativas.


### 3. Errores en la población 

In [194]:
# Filtrar registros donde PoblacionAct termine en "0001"
df_invalid_poblacion = df_geografia.filter(col("PoblacionAct").like("%0001"))

# Mostrar los registros con PoblacionAct inválido
df_invalid_poblacion.show(truncate=False)

# Contar los registros con valores inválidos
count_invalid = df_invalid_poblacion.count()
if count_invalid > 0:
    print(f"Se encontraron {count_invalid} registros con PoblacionAct terminando en '0001'.")
else:
    print("No se encontraron registros con PoblacionAct terminando en '0001'.")


+---------------+-------------+-----------+--------------------+-------+-----------+------------+
|IdGeografia_DWH|IdGeografia_T|Estado     |Condado             |AreaAct|DensidadAct|PoblacionAct|
+---------------+-------------+-----------+--------------------+-------+-----------+------------+
|2647           |51109        |Virginia   |Louisa County       |498.0  |78.0       |38848.0001  |
|2648           |30045        |Montana    |Judith Basin County |1870.0 |1.0        |2044.0001   |
|2649           |28111        |Mississippi|Perry County        |647.0  |18.0       |11571.0001  |
|2650           |17149        |Illinois   |Pike County         |831.0  |18.0       |14618.0001  |
|2651           |18099        |Indiana    |Marshall County     |444.0  |104.0      |46121.0001  |
|2652           |12095        |Florida    |Orange County       |908.0  |1567.0     |1422746.0001|
|2653           |48273        |Texas      |Kleberg County      |871.0  |35.0       |30635.0001  |
|2654           |230

In [195]:
from pyspark.sql.functions import col, when, floor

# Corregir los valores de PoblacionAct con ".0001" al eliminar los últimos cuatro dígitos
df_geografia = df_geografia.withColumn(
    "PoblacionAct",
    when(col("PoblacionAct").like("%.0001"), floor(col("PoblacionAct") / 10000))
    .otherwise(col("PoblacionAct"))
)

# Validar la transformación
df_geografia.show(20, truncate=False)


+---------------+-------------+----------+-----------------+-------+-----------+------------+
|IdGeografia_DWH|IdGeografia_T|Estado    |Condado          |AreaAct|DensidadAct|PoblacionAct|
+---------------+-------------+----------+-----------------+-------+-----------+------------+
|1              |34005        |New Jersey|Burlington County|805.0  |577.0      |464269      |
|2              |34023        |New Jersey|Middlesex County |311.0  |2768.0     |860807      |
|3              |34019        |New Jersey|Hunterdon County |430.0  |302.0      |129924      |
|4              |34031        |New Jersey|Passaic County   |185.0  |2801.0     |518117      |
|5              |34037        |New Jersey|Sussex County    |521.0  |279.0      |145543      |
|6              |34035        |New Jersey|Somerset County  |305.0  |1133.0     |345647      |
|7              |21041        |Kentucky  |Carroll County   |130.0  |84.0       |10863       |
|8              |12031        |Florida   |Duval County     |

### 4. Validar si existen duplicados para IdGeografia_T

In [200]:
# Verificar si existen valores nulos en las columnas relevantes
columns_to_check = ["IdGeografia_T", "Estado", "Condado", "AreaAct", "DensidadAct", "PoblacionAct"]

for col_name in columns_to_check:
    null_count = df_geografia.filter(col(col_name).isNull()).count()
    if null_count > 0:
        print(f"Se encontraron {null_count} valores nulos en la columna {col_name}.")
    else:
        print(f"No se encontraron valores nulos en la columna {col_name}.")


No se encontraron valores nulos en la columna IdGeografia_T.
No se encontraron valores nulos en la columna Estado.
No se encontraron valores nulos en la columna Condado.
Se encontraron 59 valores nulos en la columna AreaAct.
Se encontraron 59 valores nulos en la columna DensidadAct.
No se encontraron valores nulos en la columna PoblacionAct.


In [201]:
from pyspark.sql.functions import col

# Filtrar registros donde las columnas AreaAct o DensidadAct son nulas
nulos_area_densidad = df_geografia.filter(col("AreaAct").isNull() | col("DensidadAct").isNull())

# Contar y mostrar registros con valores nulos
nulos_area_densidad.show(truncate=False)
print(f"Total de registros con valores nulos en AreaAct o DensidadAct: {nulos_area_densidad.count()}")


+---------------+-------------+--------+---------------------+-------+-----------+------------+
|IdGeografia_DWH|IdGeografia_T|Estado  |Condado              |AreaAct|DensidadAct|PoblacionAct|
+---------------+-------------+--------+---------------------+-------+-----------+------------+
|71             |51510        |Virginia|Alexandria City      |NULL   |NULL       |158309      |
|216            |51650        |Virginia|Hampton City         |NULL   |NULL       |135169      |
|219            |51735        |Virginia|Poquoson City        |NULL   |NULL       |12121       |
|220            |51830        |Virginia|Williamsburg City    |NULL   |NULL       |15034       |
|221            |51710        |Virginia|Norfolk City         |NULL   |NULL       |244300      |
|222            |51700        |Virginia|Newport News City    |NULL   |NULL       |179582      |
|452            |51670        |Virginia|Hopewell City        |NULL   |NULL       |22500       |
|453            |51570        |Virginia|

In [204]:
from pyspark.sql.functions import mean, col, when

# Calcular los promedios por Estado para AreaAct y DensidadAct
promedios_por_estado = df_geografia.groupBy("Estado").agg(
    mean("AreaAct").alias("PromedioAreaEstado"),
    mean("DensidadAct").alias("PromedioDensidadEstado")
)

# Unir los promedios calculados al DataFrame original
df_geografia = df_geografia.join(promedios_por_estado, on="Estado", how="left")

# Rellenar los valores nulos en AreaAct y DensidadAct usando los promedios por Estado
df_geografia = df_geografia.withColumn(
    "AreaAct",
    when(col("AreaAct").isNull(), col("PromedioAreaEstado")).otherwise(col("AreaAct"))
)

df_geografia = df_geografia.withColumn(
    "DensidadAct",
    when(col("DensidadAct").isNull(), col("PromedioDensidadEstado")).otherwise(col("DensidadAct"))
)

# Eliminar las columnas de promedios temporales
df_geografia = df_geografia.drop("PromedioAreaEstado", "PromedioDensidadEstado")

# Validar los cambios
df_geografia.show(5, truncate=False)


+----------+---------------+-------------+-----------------+-------+-----------+------------+
|Estado    |IdGeografia_DWH|IdGeografia_T|Condado          |AreaAct|DensidadAct|PoblacionAct|
+----------+---------------+-------------+-----------------+-------+-----------+------------+
|New Jersey|1              |34005        |Burlington County|805.0  |577.0      |464269      |
|New Jersey|2              |34023        |Middlesex County |311.0  |2768.0     |860807      |
|New Jersey|3              |34019        |Hunterdon County |430.0  |302.0      |129924      |
|New Jersey|4              |34031        |Passaic County   |185.0  |2801.0     |518117      |
|New Jersey|5              |34037        |Sussex County    |521.0  |279.0      |145543      |
+----------+---------------+-------------+-----------------+-------+-----------+------------+
only showing top 5 rows



## Load

## Dimensión Areas de Servicio


AJUSTAR EL BLOQUE DEL DISENO PARA EL NUEVO MODELO DADO EN CLASE


### Extraction

## Transformation


## Load


#### Validar la Tabla Rs_AreasDeServicio

In [184]:
# Probar si la conexión con la tabla `Rs_AreasDeServicio` funciona correctamente
sql_areas_servicio_test = '''
(SELECT * 
 FROM Estudiante_8_202415.Rs_AreasDeServicio
 LIMIT 5) AS Rs_AreasDeServicio_Test
'''

try:
    areas_servicio_test = conn_orig.get_dataframe(sql_areas_servicio_test)
    print("Conexión exitosa con la tabla Rs_AreasDeServicio:")
    areas_servicio_test.show(truncate=False)
except Exception as e:
    print("Error al conectar con la tabla Rs_AreasDeServicio:")
    print(e)

sql_describe_table = '''
(SELECT COLUMN_NAME 
 FROM INFORMATION_SCHEMA.COLUMNS 
 WHERE TABLE_SCHEMA = 'Estudiante_8_202415'
   AND TABLE_NAME = 'Rs_AreasDeServicio') AS ColumnsInfo
'''

try:
    columns_info = conn_orig.get_dataframe(sql_describe_table)
    print("Columnas de la tabla Rs_AreasDeServicio:")
    columns_info.show(truncate=False)
except Exception as e:
    print("Error al listar columnas de la tabla Rs_AreasDeServicio:")
    print(e)


Conexión exitosa con la tabla Rs_AreasDeServicio:
+--------------------+------------------+---------------------------------------------+------------+
|IdAreaDeServicio_DWH|IdAreaDeServicio_T|Nombre                                       |AnnoCreacion|
+--------------------+------------------+---------------------------------------------+------------+
|1                   |100622017         |New Jersey - Medical91661NJ2340003-0520174859|2017        |
|2                   |100722019         |New Jersey - Medical91661NJ2340003-0520194597|2019        |
|3                   |100922020         |New Jersey - Medical91661NJ2340003-0520204858|2020        |
|4                   |101012018         |New Jersey - Medical91661NJ2340003-0520184611|2018        |
|5                   |101062020         |New Jersey - Medical91661NJ2340003-0520205037|2020        |
+--------------------+------------------+---------------------------------------------+------------+

Columnas de la tabla Rs_AreasDeServicio:

#### Validar la Tabla Rs_AsociacionAreaServicioGeografia

In [29]:
# Probar la conexión con la tabla Rs_AsociacionAreaServicioGeografia
sql_asociacion_test = '''
(SELECT * 
 FROM Estudiante_8_202415.Rs_AsociacionAreaServicioGeografia
 LIMIT 5) AS Rs_AsociacionAreaServicioGeografia_Test
'''

try:
    asociacion_test = conn_orig.get_dataframe(sql_asociacion_test)
    print("Conexión exitosa con la tabla Rs_AsociacionAreaServicioGeografia:")
    asociacion_test.show(truncate=False)
except Exception as e:
    print("Error al conectar con la tabla Rs_AsociacionAreaServicioGeografia:")
    print(e)

sql_describe_table = '''
(SELECT COLUMN_NAME 
 FROM INFORMATION_SCHEMA.COLUMNS 
 WHERE TABLE_SCHEMA = 'Estudiante_8_202415'
   AND TABLE_NAME = 'Rs_AsociacionAreaServicioGeografia') AS ColumnsInfo
'''

try:
    columns_info = conn_orig.get_dataframe(sql_describe_table)
    print("Columnas de la tabla Rs_AsociacionAreaServicioGeografia:")
    columns_info.show(truncate=False)
except Exception as e:
    print("Error al listar columnas de la tabla Rs_AsociacionAreaServicioGeografia:")
    print(e)


Conexión exitosa con la tabla Rs_AsociacionAreaServicioGeografia:
+--------------------+---------------+
|IdAreaDeServicio_DWH|IdGeografia_DWH|
+--------------------+---------------+
+--------------------+---------------+

Columnas de la tabla Rs_AsociacionAreaServicioGeografia:
+--------------------+
|COLUMN_NAME         |
+--------------------+
|IdAreaDeServicio_DWH|
|IdGeografia_DWH     |
+--------------------+



#### Validar la Tabla Rs_AsociacionAreaServicioGeografia

In [30]:
# Probar la conexión con la tabla Rs_Geografia
sql_geografia_test = '''
(SELECT * 
 FROM Estudiante_8_202415.Rs_Geografia
 LIMIT 5) AS Rs_Geografia_Test
'''

try:
    geografia_test = conn_orig.get_dataframe(sql_geografia_test)
    print("Conexión exitosa con la tabla Rs_Geografia:")
    geografia_test.show(truncate=False)
except Exception as e:
    print("Error al conectar con la tabla Rs_Geografia:")
    print(e)

# Alternativa: Obtener información sobre las columnas de Rs_Geografia
sql_describe_geografia = '''
(SELECT COLUMN_NAME 
 FROM INFORMATION_SCHEMA.COLUMNS 
 WHERE TABLE_SCHEMA = 'Estudiante_8_202415'
   AND TABLE_NAME = 'Rs_Geografia') AS ColumnsInfo
'''

try:
    columns_info_geografia = conn_orig.get_dataframe(sql_describe_geografia)
    print("Columnas de la tabla Rs_Geografia:")
    columns_info_geografia.show(truncate=False)
except Exception as e:
    print("Error al listar columnas de la tabla Rs_Geografia:")
    print(e)


Conexión exitosa con la tabla Rs_Geografia:
+---------------+-------------+------+-------+-------+-----------+------------+
|IdGeografia_DWH|IdGeografia_T|Estado|Condado|AreaAct|DensidadAct|PoblacionAct|
+---------------+-------------+------+-------+-------+-----------+------------+
+---------------+-------------+------+-------+-------+-----------+------------+

Columnas de la tabla Rs_Geografia:
+---------------+
|COLUMN_NAME    |
+---------------+
|AreaAct        |
|Condado        |
|DensidadAct    |
|Estado         |
|IdGeografia_DWH|
|IdGeografia_T  |
|PoblacionAct   |
+---------------+



# AsociacionAreaServicioGeografia

### Extracción de datos de AsociacionAreaServicioGeografia

In [181]:
# Extracción desde Rs_AreasDeServicio
sql_cargar_areas_servicio = '''
(SELECT 
    IdAreaDeServicio_DWH,
    IdAreaDeServicio_T
 FROM Estudiante_8_202415.Rs_AreasDeServicio) AS AreasDeServicioMapeo
'''
df_areas_servicio_mapeo = conn_dest.get_dataframe(sql_cargar_areas_servicio)

# Ordenar por IdAreaDeServicio_DWH de menor a mayor
df_areas_servicio_mapeo_sorted = df_areas_servicio_mapeo.orderBy("IdAreaDeServicio_DWH", ascending=True)

print("Primeros 5 registros de mapeo IdAreaDeServicio ordenados:")
df_areas_servicio_mapeo_sorted.show(5, truncate=False)


Primeros 5 registros de mapeo IdAreaDeServicio ordenados:
+--------------------+------------------+
|IdAreaDeServicio_DWH|IdAreaDeServicio_T|
+--------------------+------------------+
|1                   |100622017         |
|2                   |100722019         |
|3                   |100922020         |
|4                   |101012018         |
|5                   |101062020         |
+--------------------+------------------+
only showing top 5 rows



## Transformación 

### 1. Validar valores null

### 2. Revisar tipo de datos 

In [129]:
# Imprimir el esquema
df_asociacion_area_servicio_geografia.printSchema()


root
 |-- IdAreaDeServicio_DWH: integer (nullable = false)
 |-- IdGeografia_DWH: integer (nullable = false)

