# Generación de tablas Hive

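Esta notebook tiene como objetivo crear una tabla externa (external table) de Hive para cada uno de los conjuntos de datos Parquet de la zona clean. Al ser tablas externas, Hive solo registra los metadatos: los archivos Parquet permanecen en su ubicación original.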

In [1]:
from pyspark.sql import SparkSession

from pyspark.sql.functions import *
from pyspark.sql.types import *

# Obtain (or reuse) a Spark session named "crear_tablas_hive" with Hive support
# enabled, so the spark.sql calls below can create and query Hive tables.
spark = (
    SparkSession.builder
    .appName("crear_tablas_hive")
    .enableHiveSupport()
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


2023-11-19T15:07:23,827 WARN [Thread-4] org.apache.hadoop.util.NativeCodeLoader - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2023-11-19T15:07:25,257 WARN [Thread-4] org.apache.spark.util.Utils - Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


### Creación de base de datos

In [None]:
# Create the "datalake" database in Hive; "if not exists" makes this cell safe to re-run
spark.sql("CREATE DATABASE if not exists datalake")

# List the databases to verify that it is present
spark.sql("SHOW DATABASES").show()

In [None]:
# Make "datalake" the current database: the unqualified table names used in the
# CREATE TABLE / SELECT cells below resolve against it.
spark.sql("use datalake")

### Creación de tabla de usuarios

In [6]:
# Register the clean-zone "usuarios" parquet dataset as an external Hive table in
# the current database.
# IF NOT EXISTS keeps the cell re-runnable: without it, a second execution fails
# because the table is already registered in the metastore.
# If the table already exists with a different definition it is left untouched;
# drop it first when the column list changes.
# spark.sql runs a single statement, so no terminating ';' is needed.
# NOTE(review): this notebook never prints the parquet schema for usuarios, so
# the column types (in particular the DATE columns) are unverified — confirm.
spark.sql("""
CREATE EXTERNAL TABLE IF NOT EXISTS usuarios (
    id_usuario STRING,
    nombre STRING,
    apellido STRING,
    email STRING,
    sexo STRING,
    pais STRING,
    actividad STRING,
    subscripcion STRING,
    fecha_subs DATE,
    fecha_renov DATE,
    ultimo_pago DATE
)
STORED AS PARQUET
LOCATION '/datalake/clean/usuarios'
""")

2023-11-19T13:48:52,204 INFO [Thread-4] org.apache.hadoop.hive.ql.security.authorization.plugin.sqlstd.SQLStdHiveAccessController - Created SQLStdHiveAccessController for session context : HiveAuthzSessionContext [sessionString=4fcd67ca-c0ab-435e-ba3a-5c6f3b4c2192, clientType=HIVECLI]
2023-11-19T13:48:52,207 WARN [Thread-4] org.apache.hadoop.hive.ql.session.SessionState - METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
2023-11-19T13:48:52,207 INFO [Thread-4] hive.metastore - Mestastore configuration hive.metastore.filter.hook changed from org.apache.hadoop.hive.metastore.DefaultMetaStoreFilterHookImpl to org.apache.hadoop.hive.ql.security.authorization.plugin.AuthorizationMetaStoreFilterHook
2023-11-19T13:48:52,212 INFO [Thread-4] hive.metastore - Closed a connection to metastore, current connections: 0
2023-11-19T13:48:52,214 INFO [Thread-4] hive.metastore - Trying to connect to metastore with URI thrift://l

DataFrame[]

### Creación de tabla de actividades

In [3]:
# Load the clean-zone "actividades" parquet dataset so its schema can be inspected
# before the Hive table is declared.
df_actividades = spark.read.parquet("/datalake/clean/actividades")

                                                                                

In [4]:
# Print the column names and types read from the parquet files; the CREATE TABLE
# statement for "actividades" mirrors this schema.
df_actividades.printSchema()

root
 |-- id_actividad: string (nullable = true)
 |-- id_usuario: string (nullable = true)
 |-- id_dispositivo: string (nullable = true)
 |-- id_plan: string (nullable = true)
 |-- tipo_actividad: string (nullable = true)
 |-- pais: string (nullable = true)
 |-- longitud: float (nullable = true)
 |-- latitud: float (nullable = true)
 |-- duracion: integer (nullable = true)
 |-- fecha_hora: timestamp (nullable = true)



In [6]:
# Register the clean-zone "actividades" parquet dataset as an external Hive table
# in the current database. Columns and types follow the parquet schema printed by
# df_actividades.printSchema() (string/float/integer/timestamp).
# IF NOT EXISTS keeps the cell re-runnable: without it, a second execution fails
# because the table is already registered in the metastore.
# If the table already exists with a different definition it is left untouched;
# drop it first when the column list changes.
# spark.sql runs a single statement, so no terminating ';' is needed.
spark.sql("""
CREATE EXTERNAL TABLE IF NOT EXISTS actividades (
    id_actividad STRING,
    id_usuario STRING,
    id_dispositivo STRING,
    id_plan STRING,
    tipo_actividad STRING,
    pais STRING,
    longitud FLOAT,
    latitud FLOAT,
    duracion INT,
    fecha_hora TIMESTAMP
)
STORED AS PARQUET
LOCATION '/datalake/clean/actividades'
""")

2023-11-19T15:08:12,246 INFO [Thread-4] org.apache.hadoop.hive.ql.security.authorization.plugin.sqlstd.SQLStdHiveAccessController - Created SQLStdHiveAccessController for session context : HiveAuthzSessionContext [sessionString=2930b55b-4b2e-44da-8728-b7ada1ce8a93, clientType=HIVECLI]
2023-11-19T15:08:12,249 WARN [Thread-4] org.apache.hadoop.hive.ql.session.SessionState - METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
2023-11-19T15:08:12,249 INFO [Thread-4] hive.metastore - Mestastore configuration hive.metastore.filter.hook changed from org.apache.hadoop.hive.metastore.DefaultMetaStoreFilterHookImpl to org.apache.hadoop.hive.ql.security.authorization.plugin.AuthorizationMetaStoreFilterHook
2023-11-19T15:08:12,256 INFO [Thread-4] hive.metastore - Closed a connection to metastore, current connections: 0
2023-11-19T15:08:12,258 INFO [Thread-4] hive.metastore - Trying to connect to metastore with URI thrift://l

DataFrame[]

In [7]:
# Sanity check: query the new table through Hive and display the first rows
# (show() prints at most 20 rows by default).
spark.sql("select * from actividades").show()

                                                                                

+--------------------+--------------------+--------------------+--------------------+--------------+--------------+--------+--------+--------+-------------------+
|        id_actividad|          id_usuario|      id_dispositivo|             id_plan|tipo_actividad|          pais|longitud| latitud|duracion|         fecha_hora|
+--------------------+--------------------+--------------------+--------------------+--------------+--------------+--------+--------+--------+-------------------+
|d2b9f5a4-e457-406...|62ae8127-7584-4fe...|5f6742cf-0cc3-499...|1a257404-2741-4d5...|      gimnasia|        Brazil|-47.9247|-38.5258|       1|2023-07-08 06:10:00|
|742ceebe-7cbd-42a...|bf91106f-2604-41a...|9c9d7ede-4915-4fd...|296f1010-fd8b-4fe...|      triatlon|        Russia|100.6432| 60.0272|       2|2023-03-01 02:16:00|
|f079c279-1249-40e...|5c8d1f57-d9d6-4e0...|af1e4b22-9836-4ff...|d3c2f38b-249f-40c...|         rugby|     Indonesia|112.6225| -7.7611|       3|2023-09-17 07:09:00|
|b8db20aa-9915-426...|

### Creación de tabla actividades_eventos

In [16]:
# Load the clean-zone "actividades_eventos" parquet dataset so its schema can be
# inspected before the Hive table is declared.
df_actividades_eventos = spark.read.parquet("/datalake/clean/actividades_eventos")

In [17]:
# Print the column names and types read from the parquet files; the CREATE TABLE
# statement for "actividades_eventos" mirrors this schema.
df_actividades_eventos.printSchema()

root
 |-- id_sensor: string (nullable = true)
 |-- pulse_rate: integer (nullable = true)
 |-- id_actividad: string (nullable = true)
 |-- longitud: float (nullable = true)
 |-- latitud: float (nullable = true)
 |-- fecha_hora: timestamp (nullable = true)



In [19]:
# Register the clean-zone "actividades_eventos" parquet dataset as an external
# Hive table in the current database. Columns and types follow the parquet schema
# printed by df_actividades_eventos.printSchema().
# IF NOT EXISTS keeps the cell re-runnable: without it, a second execution fails
# because the table is already registered in the metastore.
# If the table already exists with a different definition it is left untouched;
# drop it first when the column list changes.
# pulse_rate is declared as INT (same type as "integer") to match the upper-case
# type names used by the other table definitions in this notebook.
# spark.sql runs a single statement, so no terminating ';' is needed.
spark.sql("""
CREATE EXTERNAL TABLE IF NOT EXISTS actividades_eventos (
    id_sensor STRING,
    pulse_rate INT,
    id_actividad STRING,
    longitud FLOAT,
    latitud FLOAT,
    fecha_hora TIMESTAMP
)
STORED AS PARQUET
LOCATION '/datalake/clean/actividades_eventos'
""")

DataFrame[]

In [20]:
# Sanity check: query the new table through Hive and display the first rows
# (show() prints at most 20 rows by default).
spark.sql("select * from actividades_eventos").show()

+--------------------+----------+--------------------+--------+--------+-------------------+
|           id_sensor|pulse_rate|        id_actividad|longitud| latitud|         fecha_hora|
+--------------------+----------+--------------------+--------+--------+-------------------+
|853dc9e4-6094-451...|       173|d2b9f5a4-e457-406...|-47.9247|-38.5258|2023-07-08 06:10:00|
|278b2f82-087a-469...|       128|742ceebe-7cbd-42a...|100.6432| 60.0272|2023-03-01 02:16:00|
|ea0b5fad-ff1d-491...|       148|f079c279-1249-40e...|112.6225| -7.7611|2023-09-17 07:09:00|
|f6f4ad28-30e2-4c8...|       110|b8db20aa-9915-426...|100.6432| 60.0272|2023-05-27 10:45:00|
|e63edcf3-eca8-412...|       170|59b61169-ecde-498...|-78.4163| 22.2258|2023-08-24 06:49:00|
|bab58192-fc85-49c...|        42|8ea971d8-bc3e-44c...|106.3456| 35.3371|2023-05-18 00:35:00|
|cafd29e6-3782-4fa...|       197|f0c4dc60-3cf3-4f9...|-47.9247|-38.5258|2023-05-14 09:20:00|
|ce9ed08c-ccdc-48d...|       198|32bf0c7b-f21a-44b...|-63.2369|-17.405

### Creación de tabla planes_de_entrenamiento

In [8]:
# Load the clean-zone "planes_de_entrenamiento" parquet dataset and print its
# schema; the CREATE TABLE statement for this table mirrors these columns/types.
df_planes = spark.read.parquet("/datalake/clean/planes_de_entrenamiento")
df_planes.printSchema()

root
 |-- id_plan: string (nullable = true)
 |-- id_usuario: string (nullable = true)
 |-- tipo: string (nullable = true)
 |-- plan_duracion: float (nullable = true)
 |-- instrucciones: string (nullable = true)
 |-- objetivo: string (nullable = true)



In [9]:
# Remove any previous definition of the table so it can be re-created below.
# IF EXISTS makes the cell safe on a fresh metastore: a plain DROP TABLE raises
# when the table has not been created yet, which breaks Restart & Run All.
# NOTE(review): the table is declared EXTERNAL, so dropping it is expected to
# remove only the metastore entry and leave the parquet files in place — confirm
# for this Hive/Spark setup.
spark.sql("DROP TABLE IF EXISTS planes_de_entrenamiento")

DataFrame[]

In [10]:
# Register the clean-zone "planes_de_entrenamiento" parquet dataset as an
# external Hive table in the current database. Columns and types follow the
# parquet schema printed by df_planes.printSchema().
# IF NOT EXISTS keeps the cell re-runnable: without it, a second execution fails
# because the table is already registered in the metastore.
# If the table already exists with a different definition it is left untouched;
# drop it first when the column list changes.
# spark.sql runs a single statement, so no terminating ';' is needed.
spark.sql("""
CREATE EXTERNAL TABLE IF NOT EXISTS planes_de_entrenamiento (
    id_plan STRING,
    id_usuario STRING,
    tipo STRING,
    plan_duracion FLOAT,
    instrucciones STRING,
    objetivo STRING
)
STORED AS PARQUET
LOCATION '/datalake/clean/planes_de_entrenamiento'
""")

DataFrame[]

In [11]:
# Sanity check: query the new table through Hive and display the first rows
# (show() prints at most 20 rows by default).
spark.sql("select * from planes_de_entrenamiento").show()

+--------------------+--------------------+----------------+-------------+--------------------+--------------------+
|             id_plan|          id_usuario|            tipo|plan_duracion|       instrucciones|            objetivo|
+--------------------+--------------------+----------------+-------------+--------------------+--------------------+
|1a257404-2741-4d5...|62ae8127-7584-4fe...|   media-maraton|         1.63|"Lorem ipsum dolo...|"Maecenas leo odi...|
|296f1010-fd8b-4fe...|bf91106f-2604-41a...|            hiit|         1.73|"Nulla ut erat id...|"Duis bibendum. M...|
|d3c2f38b-249f-40c...|5c8d1f57-d9d6-4e0...|condicionamiento|          1.3|"Proin leo odio p...|"Donec diam neque...|
|b206f2a3-5ddd-47a...|f8c1ed92-4786-4f9...|            otro|         1.25|"Cum sociis natoq...|"Quisque id justo...|
|147cd3b7-c8c6-4b6...|40613c17-e5e7-4d1...|            hiit|         1.67|"Duis consequat d...|"Maecenas leo odi...|
|0c384ba2-f2c6-4ca...|88d695a1-1c4a-48e...|        triatlon|    

### Creación de tabla dispositivos

In [24]:
# Load the clean-zone "dispositivos" parquet dataset and print its schema; the
# CREATE TABLE statement for "dispositivos" mirrors these columns/types.
# The DataFrame gets its own name instead of rebinding df_planes, which would
# mislabel the data and overwrite the training-plans DataFrame loaded earlier.
df_dispositivos = spark.read.parquet("/datalake/clean/dispositivos")
df_dispositivos.printSchema()

root
 |-- id_dispositivo: string (nullable = true)
 |-- model: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- operating_system: string (nullable = true)



In [25]:
# Register the clean-zone "dispositivos" parquet dataset as an external Hive
# table in the current database. Columns and types follow the parquet schema
# printed in the previous cell (all string columns).
# IF NOT EXISTS keeps the cell re-runnable: without it, a second execution fails
# because the table is already registered in the metastore.
# If the table already exists with a different definition it is left untouched;
# drop it first when the column list changes.
# spark.sql runs a single statement, so no terminating ';' is needed.
spark.sql("""
CREATE EXTERNAL TABLE IF NOT EXISTS dispositivos (
    id_dispositivo STRING,
    model STRING,
    brand STRING,
    operating_system STRING
)
STORED AS PARQUET
LOCATION '/datalake/clean/dispositivos'
""")

DataFrame[]

In [26]:
# Sanity check: query the new table through Hive and display the first rows
# (show() prints at most 20 rows by default).
spark.sql("select * from dispositivos").show()

+--------------------+----------------+-------+----------------+
|      id_dispositivo|           model|  brand|operating_system|
+--------------------+----------------+-------+----------------+
|5f6742cf-0cc3-499...|   iPhone 12 Pro|  Apple|             iOS|
|9c9d7ede-4915-4fd...|         Pixel 5| Google|         Android|
|af1e4b22-9836-4ff...|    Xperia 1 III|   Sony|         Android|
|5d5aa38b-9e5e-412...|     Find X3 Pro|   Oppo|         Android|
|39e8db1f-0258-459...|          Velvet|     LG|         Android|
|6188ae01-946b-40e...|         X60 Pro|   Vivo|         Android|
|0b8d0acd-7391-449...|           9 Pro|OnePlus|         Android|
|cac63327-f1b8-4ba...|Galaxy S21 Ultra|Samsung|         Android|
|a8ec5dc7-b977-432...|           Mi 11| Xiaomi|         Android|
+--------------------+----------------+-------+----------------+

