# An√°lisis Exploratorio de Datos

## Descripci√≥n de variables 

* homicidio_key: Inidentificador √∫nico de cada hecho
* cod_depto: Llave secundaria que conecta con la dimensi√≥n departamento
* cod_mpio: Llave secundaria que conecta con la dimensi√≥n municipio
* sexo_key: Llave secundaria que conecta con la dimensi√≥n sexo
* fecha_hecho: Fecha en que ocurri√≥ el hecho
* Cantidad: Cantidad de victimas resgistradas en un mismo hecho

* source_id: Identificador √∫nico de la fuente de datos
* loaded_at: Fecha en que se carg√≥ el hecho



In [2]:
# Importar librer√≠as
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Comprobaci√≥n de la conexion al contenedor
print(f'Python: {sys.version}')
print(f'Ejecutando en: {sys.prefix}\n')


# Opciones de visualiazacion de tablas 
pd.options.display.max_columns = None
pd.options.display.max_rows = 100


# Conexi√≥n a la base de datos
load_dotenv('/app/.env')

DW_USER = os.getenv('DW_USER')
DW_PASSWORD = os.getenv('DW_PASSWORD')
DW_HOST = 'datawarehouse'
DW_PORT = '5432'
DW_DB = os.getenv('DW_DB')

connection_string = f"postgresql://{DW_USER}:{DW_PASSWORD}@{DW_HOST}:{DW_PORT}/{DW_DB}"
engine = create_engine(connection_string)

# Probar conexi√≥n
with engine.connect() as conn:
    result = pd.read_sql("SELECT current_database(), current_user;", conn)
    print("‚úÖ Conexi√≥n exitosa!")
    print(f"üìä Base de datos: {result.iloc[0, 0]}")
    print(f"üë§ Usuario: {result.iloc[0, 1]}")

Python: 3.12.12 (main, Nov 18 2025, 05:56:04) [GCC 14.2.0]
Ejecutando en: /usr/local

‚úÖ Conexi√≥n exitosa!
üìä Base de datos: homicidios_dw
üë§ Usuario: dw_user


### Obtenci√≥n de los Metadatos

In [3]:
# Obtener metadatos de las tablas (sin cargar datos)
query_tables = """
SELECT 
    t.table_name,
    COUNT(c.column_name) as num_columnas,
    pg_size_pretty(pg_total_relation_size(quote_ident(t.table_name)::regclass)) as tama√±o
FROM information_schema.tables t
LEFT JOIN information_schema.columns c 
    ON t.table_name = c.table_name 
    AND t.table_schema = c.table_schema
WHERE t.table_schema = 'public'
    AND t.table_type = 'BASE TABLE'
GROUP BY t.table_name
ORDER BY t.table_name;
"""

df_tables = pd.read_sql(query_tables, engine)
print("üìã Tablas en el Data Warehouse:\n")
print(df_tables.to_string(index=False))
print(f"\nüìä Total de tablas: {len(df_tables)}")

üìã Tablas en el Data Warehouse:

      table_name  num_columnas  tama√±o
dim_departamento             4   40 kB
       dim_fecha            12 1280 kB
   dim_municipio             6  232 kB
        dim_sexo             2   56 kB
         etl_log            10   48 kB
 fact_homicidios             9   58 MB

üìä Total de tablas: 6


In [4]:
# Obtener todas las columnas de todas las tablas
query_columns = """
SELECT 
    t.table_name,
    c.column_name,
    c.data_type,
    c.is_nullable,
    CASE 
        WHEN pk.column_name IS NOT NULL THEN 'PK'
        WHEN fk.column_name IS NOT NULL THEN 'FK'
        ELSE ''
    END as key_type
FROM information_schema.tables t
JOIN information_schema.columns c 
    ON t.table_name = c.table_name
LEFT JOIN (
    SELECT ku.table_name, ku.column_name
    FROM information_schema.table_constraints tc
    JOIN information_schema.key_column_usage ku
        ON tc.constraint_name = ku.constraint_name
        AND tc.table_schema = ku.table_schema
    WHERE tc.constraint_type = 'PRIMARY KEY'
        AND tc.table_schema = 'public'
) pk ON c.table_name = pk.table_name AND c.column_name = pk.column_name
LEFT JOIN (
    SELECT ku.table_name, ku.column_name
    FROM information_schema.table_constraints tc
    JOIN information_schema.key_column_usage ku
        ON tc.constraint_name = ku.constraint_name
        AND tc.table_schema = ku.table_schema
    WHERE tc.constraint_type = 'FOREIGN KEY'
        AND tc.table_schema = 'public'
) fk ON c.table_name = fk.table_name AND c.column_name = fk.column_name
WHERE t.table_schema = 'public'
    AND t.table_type = 'BASE TABLE'
ORDER BY t.table_name, c.ordinal_position;
"""

df_columns = pd.read_sql(query_columns, engine)

# Mostrar columnas agrupadas por tabla
print("ESTRUCTURA COMPLETA DEL DATA WAREHOUSE")
print("="*80)

for table in df_columns['table_name'].unique():
    print(f"\nTabla: {table}")
    print("="*80)
    df_table = df_columns[df_columns['table_name'] == table][
        ['column_name', 'data_type', 'is_nullable', 'key_type']
    ]
    print(df_table.to_string(index=False))
    print(f"\nTotal de columnas: {len(df_table)}")

ESTRUCTURA COMPLETA DEL DATA WAREHOUSE

Tabla: dim_departamento
column_name         data_type is_nullable key_type
  cod_depto           integer          NO       PK
  nom_depto character varying          NO         
    latitud           numeric         YES         
   longitud           numeric         YES         

Total de columnas: 4

Tabla: dim_fecha
      column_name         data_type is_nullable key_type
        fecha_key           integer          NO       PK
            fecha              date          NO         
              a√±o          smallint          NO         
              mes          smallint          NO         
              dia          smallint          NO         
        trimestre          smallint          NO         
       semana_a√±o          smallint          NO         
       dia_semana          smallint          NO         
       nombre_mes character varying          NO         
nombre_dia_semana character varying          NO         
    es_fin_s

## Lectura de las tablas

In [5]:
# Dim Departamento
df_dim_fecha = pd.read_sql('''
SELECT 
    cod_depto,
    nom_depto,
    latitud
    longitud
FROM dim_departamento
''', engine)

df_dim_fecha.head()

Unnamed: 0,cod_depto,nom_depto,longitud
0,5,ANTIOQUIA,6.702032
1,8,ATL√ÅNTICO,10.67701
2,11,"BOGOT√Å, D.C.",4.316108
3,13,BOL√çVAR,8.079797
4,15,BOYAC√Å,5.891673


## Letura de datos 

In [None]:
# Dim Municipio 
df_dim_municipio = pd.read_sql("SELECT * FROM dim_municipio", engine)

# Dim Fecha 
df_dim_fecha = pd.read_sql("SELECT * FROM dim_fecha", engine)

# Dim Sexo
df_dim_sexo = pd.read_sql('''
SELECT
    sexo_key,
    sexo
FROM dim_sexo
''', engine)


# Fact Homicidios
df_fact_homicidios = pd.read_sql('''
SELECT
    homicidio_key,
    fecha_key
    cod_depto,
    cod_depto,
    cod_mpio,
    sexo_key,
    zona,
    cantidad,
    source_id,
    loaded_at
FROM fact_homicidios
''', engine)


In [13]:
# Visualizaci√≥n de los primeros registros
df_fact_homicidios.head()

Unnamed: 0,homicidio_key,cod_depto,cod_depto.1,cod_mpio,sexo_key,zona,cantidad,source_id,loaded_at
0,1,1,11,11001,5,URBANA,1,1,2025-11-25 01:00:11.898315
1,2,1,11,11001,5,URBANA,1,2,2025-11-25 01:00:11.898315
2,3,1,11,11001,5,URBANA,1,3,2025-11-25 01:00:11.898315
3,4,1,11,11001,5,URBANA,1,4,2025-11-25 01:00:11.898315
4,5,1,11,11001,5,URBANA,1,5,2025-11-25 01:00:11.898315


In [10]:
df_fact_homicidios.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 332131 entries, 0 to 332130
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   homicidio_key  332131 non-null  int64         
 1   cod_depto      332131 non-null  int64         
 2   cod_depto      332131 non-null  int64         
 3   cod_mpio       332131 non-null  int64         
 4   sexo_key       332131 non-null  int64         
 5   zona           332131 non-null  object        
 6   cantidad       332131 non-null  int64         
 7   source_id      332131 non-null  int64         
 8   loaded_at      332131 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(7), object(1)
memory usage: 22.8+ MB


In [15]:
df_fact_homicidios['cantidad'].describe()

count    332131.000000
mean          1.001927
std           0.058503
min           1.000000
25%           1.000000
50%           1.000000
75%           1.000000
max          14.000000
Name: cantidad, dtype: float64