In [7]:
import findspark
findspark.init('/opt/spark')  # Spark installation directory

from pyspark.sql import SparkSession

# Build (or reuse) the SparkSession: local master, HDFS as the default filesystem.
spark = (
    SparkSession.builder
    .appName("MedallionArchitecture")
    .config("spark.master", "local")
    .config("spark.hadoop.fs.defaultFS", "hdfs://localhost:9000")
    .getOrCreate()
)

print("SparkSession iniciada.")

SparkSession iniciada.


In [2]:
# Location of the JSON file inside the container
json_path = 'file:///workspace/events.json'  # 'file://' = local filesystem

# Load the JSON file into a DataFrame
df_eventos_json = spark.read.format("json").load(json_path)

# Display the rows without truncating column values
df_eventos_json.show(truncate=False)

                                                                                

+-------------+-----------+-------------------+-----------+------------------+------------------+------------------------------------+-----------------+
|barrio_codigo|customer_id|date_event         |employee_id|latitude          |longitude         |order_id                            |quantity_products|
+-------------+-----------+-------------------+-----------+------------------+------------------+------------------------------------+-----------------+
|14           |3251       |20/06/2024 03:47:31|2232       |6.185923725950837 |-75.57327270670339|6a3081ca-bcda-48c0-8dad-300c9d84ec78|88               |
|14           |3346       |20/06/2024 03:48:01|5668       |6.188404683813707 |-75.5802148606929 |d29f3db8-5199-4f89-8a08-d2980ab61230|92               |
|60           |7515       |20/06/2024 03:48:31|6696       |6.281315718428238 |-75.66992697115182|63e60a1d-719f-4466-8bfb-fb67f950b3bc|81               |
|70           |9935       |20/06/2024 03:49:01|1473       |6.253404547201645 |-75.

In [3]:
# Print the DataFrame schema (column names, types and nullability).
df_eventos_json.printSchema()

root
 |-- barrio_codigo: string (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- date_event: string (nullable = true)
 |-- employee_id: long (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- quantity_products: long (nullable = true)



In [4]:
# Bronze layer: persist the DataFrame as Parquet in HDFS.
# Target HDFS directory for the Bronze-layer Parquet files
bronze_path_hdfs = 'hdfs://localhost:9000/user/spark/bronze_layer'

# Write as Parquet, replacing whatever is already stored at that path
df_eventos_json.write.parquet(bronze_path_hdfs, mode="overwrite")

print("Datos escritos en la capa Bronze en formato Parquet en HDFS.")

                                                                                

Datos escritos en la capa Bronze en formato Parquet en HDFS.


In [2]:
# Verify the Parquet files in HDFS
from subprocess import Popen, PIPE

# List the Bronze directory with the Hadoop CLI, capturing both output streams.
process = Popen(['hadoop', 'fs', '-ls', '/user/spark/bronze_layer'], stdout=PIPE, stderr=PIPE)
stdout, stderr = process.communicate()

# Show the listing produced by the command.
print(stdout.decode())

# A failing command reports its error on stderr, which was previously captured
# and discarded; surface it so a failed listing is not mistaken for an empty one.
if process.returncode != 0:
    print(f"Error al listar HDFS (código de salida {process.returncode}):")
    print(stderr.decode())

Found 2 items
-rw-r--r--   1 root supergroup          0 2024-06-20 05:44 /user/spark/bronze_layer/_SUCCESS
-rw-r--r--   1 root supergroup       2383 2024-06-20 05:44 /user/spark/bronze_layer/part-00000-93db9b4f-f058-46b1-9e60-608fce3b4d7c-c000.snappy.parquet



In [3]:
# Keep the bronze tables in Hive
from pyspark.sql import SparkSession

# getOrCreate() returns an already-running session as-is, and Hive support is a
# static setting that cannot be switched on for a live session. If a session
# without the Hive catalog is active, stop it so the builder below really
# creates a Hive-enabled one. DataFrames bound to the stopped session can no
# longer be used and must be read again.
_running_session = SparkSession.getActiveSession()
if (
    _running_session is not None
    and _running_session.conf.get("spark.sql.catalogImplementation", "in-memory") != "hive"
):
    _running_session.stop()

spark = (
    SparkSession.builder
    .appName("Spark SQL Hive Integration")
    .config("spark.sql.warehouse.dir", "/user/hive/warehouse")
    .enableHiveSupport()
    .getOrCreate()
)

In [6]:
# Read the Bronze Parquet data back from HDFS to verify it
df_eventos_bronze = spark.read.format("parquet").load('hdfs://localhost:9000/user/spark/bronze_layer')
df_eventos_bronze.show(truncate=False)

+-------------------+-----------+-----------+-----------------+------------------+------------------------------------+-----------------+
|Date               |customer_id|employee_id|latitude         |longitude         |order_id                            |quantity_products|
+-------------------+-----------+-----------+-----------------+------------------+------------------------------------+-----------------+
|2024-06-20 05:44:28|7494       |5668       |6.198870701056316|-75.67207346034631|bd55c434-0138-4c58-b02d-22e2dfc02dc0|26               |
+-------------------+-----------+-----------+-----------------+------------------+------------------------------------+-----------------+



In [11]:
# Create the bronze database (DB_bronze) unless it already exists.
spark.sql("CREATE DATABASE IF NOT EXISTS DB_bronze")

DataFrame[]

In [12]:
# Persist the Bronze data as table `eventos` in DB_bronze, replacing any existing table
df_eventos_bronze.write.saveAsTable('DB_bronze.eventos', mode='overwrite')

In [13]:
# Query the table back to confirm it was saved in DB_bronze
eventos_guardados = spark.sql("SELECT * FROM DB_bronze.eventos")
eventos_guardados.show(truncate=False)

+-------------+-----------+-------------------+-----------+------------------+------------------+------------------------------------+-----------------+
|barrio_codigo|customer_id|date_event         |employee_id|latitude          |longitude         |order_id                            |quantity_products|
+-------------+-----------+-------------------+-----------+------------------+------------------+------------------------------------+-----------------+
|14           |3251       |20/06/2024 03:47:31|2232       |6.185923725950837 |-75.57327270670339|6a3081ca-bcda-48c0-8dad-300c9d84ec78|88               |
|14           |3346       |20/06/2024 03:48:01|5668       |6.188404683813707 |-75.5802148606929 |d29f3db8-5199-4f89-8a08-d2980ab61230|92               |
|60           |7515       |20/06/2024 03:48:31|6696       |6.281315718428238 |-75.66992697115182|63e60a1d-719f-4466-8bfb-fb67f950b3bc|81               |
|70           |9935       |20/06/2024 03:49:01|1473       |6.253404547201645 |-75.

In [None]:
# Base HDFS path for the Silver layer.
silver_path_hdfs = 'hdfs://localhost:9000/user/spark/silver_layer/'

In [None]:
############ GOLD LAYER ########################

# Base HDFS path for the Gold layer.
gold_path_hdfs = 'hdfs://localhost:9000/user/spark/gold_layer/'

In [None]:
import socket
import json

# Module-level state updated by receive_data(): the number of payloads
# received so far and the most recently decoded payload.
data_count = 0
last_data = {}

def receive_data(data):
    """Decode one JSON payload and record it as the most recent datum.

    Updates the module-level ``data_count`` and ``last_data`` and prints the
    decoded payload.

    Args:
        data: JSON document, in any form accepted by ``json.loads``.

    Raises:
        ValueError: (as ``json.JSONDecodeError``) if ``data`` is not valid
            JSON. In that case ``data_count`` and ``last_data`` are left
            untouched.
    """
    global data_count, last_data
    # Parse before touching the shared state, so a malformed payload cannot
    # bump the counter while last_data still holds an older datum.
    parsed = json.loads(data)
    data_count += 1
    last_data = parsed
    print(f"Datos recibidos: {last_data}")

def _handle_client(client_socket, addr, buffer_size):
    """Read one message from a client, process it and send a JSON status back."""
    raw = client_socket.recv(buffer_size)
    if not raw:
        # Client connected and closed without sending anything.
        return
    try:
        receive_data(raw.decode('utf-8'))
        status = "success"
    except ValueError as exc:
        # UnicodeDecodeError and json.JSONDecodeError are both ValueErrors:
        # report the bad payload to the client instead of killing the server.
        print(f"Datos inválidos recibidos de {addr}: {exc}")
        status = "error"
    response = json.dumps({"status": status})
    client_socket.sendall(response.encode('utf-8'))


def create_server(host='0.0.0.0', port=5002, buffer_size=1024):
    """Run a blocking TCP server that handles one JSON message per connection.

    For every accepted connection, a single ``recv`` of up to ``buffer_size``
    bytes is decoded as UTF-8 and passed to ``receive_data``; a JSON status
    object is sent back and the connection is closed. The accept loop never
    ends on its own; the function only exits through an exception (for
    example ``KeyboardInterrupt``), and the listening socket is closed then.

    Args:
        host: Address to bind the listening socket to.
        port: TCP port to listen on.
        buffer_size: Maximum number of bytes read from each client. A message
            larger than this (or one delivered in several TCP segments) is
            only partially read and will be reported as invalid.
    """
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as server_socket:
        # Allow re-running right after a previous run without hitting
        # "Address already in use".
        server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        server_socket.bind((host, port))
        server_socket.listen(5)
        print(f"Servidor escuchando en {host}:{port}")

        while True:
            client_socket, addr = server_socket.accept()
            # The with-block closes the client socket even if handling fails.
            with client_socket:
                print(f"Conexión establecida con {addr}")
                try:
                    _handle_client(client_socket, addr, buffer_size)
                except OSError as exc:
                    # A failing client must not stop the server from
                    # accepting further connections.
                    print(f"Error de conexión con {addr}: {exc}")

# Start the server when this code runs as the main module.
# NOTE(review): create_server() loops forever on accept(), so this call does
# not return; in a notebook it presumably blocks the cell until the kernel is
# interrupted — confirm that is intended.
if __name__ == '__main__':
    create_server()