In [8]:
import sys
from abc import ABC, abstractmethod
from pathlib import Path

sys.path.append('/home/armando/git/shorts/dimensional')

import numpy as np
import pandas as pd

from lib.data_profiler.data_profiler import TableProfiler


In [6]:
# SUBSYSTEM 01 - EXTRACTION - DATA PROFILING
# Build a tiny in-memory sample (one record per user) and print the profile
# that TableProfiler computes for it.
data = pd.DataFrame(
    [
        {'user': 'Usuario 1', 'age': 25.0, 'weight': 70.5, 'sex': 'M'},
        {'user': 'Usuario 2', 'age': 30.0, 'weight': 80.2, 'sex': 'F'},
        {'user': 'Usuario 3', 'age': 22.0, 'weight': 65.0, 'sex': 'F'},
        {'user': 'Usuario 4', 'age': 35.0, 'weight': 90.3, 'sex': 'M'},
    ],
    columns=['user', 'age', 'weight', 'sex'],
)

table_profiler = TableProfiler(data)
profile = table_profiler.profile()
print(profile)

{'user': {'unique_count': 4, 'unique_values': ['Usuario 1', 'Usuario 2', 'Usuario 3', 'Usuario 4']}, 'age': {'mean': np.float64(28.0), 'min': np.float64(22.0), 'max': np.float64(35.0)}, 'weight': {'mean': np.float64(76.5), 'min': np.float64(65.0), 'max': np.float64(90.3)}, 'sex': {'unique_count': 2, 'unique_values': ['M', 'F']}}


In [None]:
# SUBSYSTEM 02 - TODO: title and implementation pending

In [9]:
# SUBSYSTEM 09 - DELIVERY - SLOWLY CHANGING DIMENSION MANAGER
class AbstractReadTable(ABC):
    """Interface for objects that load a named table as a DataFrame."""

    @abstractmethod
    def read_table(self, table_name: str) -> pd.DataFrame:
        """Return the table identified by ``table_name``."""


class ReadTableDummy(AbstractReadTable):
    """Stand-in reader that fabricates the vendor tables from one CSV file.

    All three supported tables are derived from ``dim_vendor.csv`` located in
    ``data_dir``; the extra columns are filled with fixed demo values.

    Parameters
    ----------
    data_dir : str or path-like, optional
        Directory that contains ``dim_vendor.csv``. Defaults to
        ``DEFAULT_DATA_DIR`` so existing ``ReadTableDummy()`` calls keep
        reading the same file as before.
    """

    # NOTE(review): machine-specific absolute path kept only as the
    # backward-compatible default; pass ``data_dir`` to read from elsewhere.
    DEFAULT_DATA_DIR = '/home/armando/git/shorts/dimensional/data'

    def __init__(self, data_dir=None):
        self.data_dir = Path(self.DEFAULT_DATA_DIR if data_dir is None else data_dir)

    def _read_vendor_csv(self) -> pd.DataFrame:
        """Load a fresh copy of the vendor CSV that backs every dummy table."""
        return pd.read_csv(self.data_dir / 'dim_vendor.csv')

    def read_table(self, table_name: str) -> pd.DataFrame:
        """Return the dummy table called ``table_name``.

        Supported names:

        * ``'dim_vendor'`` - the CSV with the row labelled 1 renamed to
          ``'Libros Osom'`` plus ``created_at`` / ``updated_at`` /
          ``deleted_at`` / ``altered_at`` columns.
        * ``'dim_vendor_dwh'`` - the CSV plus ``effective_start_date``,
          ``effective_end_date`` and ``current_flag`` columns, every row
          flagged ``'Current'``.
        * ``'dim_vendor_dwh_most_recent_surrogate_key'`` - only the
          ``id_vendor`` and ``vendor_name`` columns of the CSV.

        Raises
        ------
        ValueError
            If ``table_name`` is not one of the names above.
        """
        if table_name == 'dim_vendor':
            dim_vendor = self._read_vendor_csv()
            # Rename one vendor so this table differs from the warehouse copy.
            dim_vendor.loc[1, 'vendor_name'] = 'Libros Osom'
            dim_vendor['created_at'] = pd.to_datetime('2025-10-01 01:00:00')
            dim_vendor['updated_at'] = None
            dim_vendor['deleted_at'] = None
            dim_vendor['altered_at'] = pd.to_datetime('2025-10-01 01:00:00')
            return dim_vendor

        if table_name == 'dim_vendor_dwh':
            dim_vendor_dwh = self._read_vendor_csv()
            dim_vendor_dwh['effective_start_date'] = pd.to_datetime('1900-01-01 00:00:00')
            dim_vendor_dwh['effective_end_date'] = pd.to_datetime('2100-01-01 00:00:00')
            dim_vendor_dwh['current_flag'] = 'Current'
            return dim_vendor_dwh

        if table_name == 'dim_vendor_dwh_most_recent_surrogate_key':
            return self._read_vendor_csv()[['id_vendor', 'vendor_name']]

        raise ValueError(f"Table '{table_name}' not found.")

In [10]:
# Change events replayed against dim_vendor, one row per event:
# (event id, operation, vendor name, vendor city, vendor status).
_event_rows = [
    ('E001', 'update', 'Librosom', 'Monterrey', 'Activo'),
    ('E002', 'insert', 'Libreria Central', 'Guadalajara', 'Activo'),
    ('E003', 'update', 'Editores Unidos', 'Tijuana', 'Activo'),
    ('E004', 'update', 'Casa Bajio', 'León', 'Activo'),
]

events = [
    {
        'event_id': event_id,
        'event': operation,
        'table': 'dim_vendor',
        'values': {
            'vendor_name': name,
            'vendor_city': city,
            'vendor_status': status,
        },
    }
    for event_id, operation, name, city, status in _event_rows
]

In [15]:
class AbstractSlowlyChangingDimensionHandler(ABC):
    """Contract for objects that apply change events to a dimension table."""

    @abstractmethod
    def handle_scd(self, events: list) -> pd.DataFrame:
        """Apply ``events`` and return the updated dimension table."""


class SlowlyChangingDimensionHandler(AbstractSlowlyChangingDimensionHandler):
    """Applies insert/update events to the vendor dimension held in memory.

    An update that changes one of ``type2_attributes`` keeps history: the
    current row is expired and a new row with a new surrogate key is appended.
    Any other update overwrites ``vendor_city`` and ``vendor_status`` in place.

    Parameters
    ----------
    dimension : str
        Name of the dimension; ``read_tables`` only supports ``'dim_vendor'``.
    type2_attributes : iterable of str, optional
        Attributes whose change creates a new row version.
        NOTE(review): the default ``('vendor_city',)`` is inferred from the
        sample events (city changes were versioned, a status-only change was
        overwritten) - confirm against the dimension's real SCD policy.
    table_reader : AbstractReadTable, optional
        Object used by ``read_tables``; a ``ReadTableDummy`` is created when
        omitted.
    """

    # Filled in by __init__ / read_tables().
    dimension = None
    source_table = None
    dwh_table = None
    surrogate_key_table = None

    # Filled in by handle_scd().
    new_dwh_table = None
    new_surrogate_key_table = None

    # Fixed demo timestamps, unchanged from the previous implementation.
    # NOTE(review): inserts start at 1990-01-01 while the seeded warehouse
    # rows start at 1900-01-01 - confirm whether 1990 is intended.
    _INSERT_START_DATE = pd.to_datetime('1990-01-01 00:00:00')
    _CHANGE_DATE = pd.to_datetime('2025-10-01 01:00:00')
    _OPEN_END_DATE = pd.to_datetime('2100-01-01 00:00:00')

    def __init__(self, dimension, type2_attributes=('vendor_city',), table_reader=None):
        self.dimension = dimension
        if isinstance(type2_attributes, str):
            # A bare string would otherwise be split into single characters.
            type2_attributes = (type2_attributes,)
        self.type2_attributes = tuple(type2_attributes)
        self.table_reader = table_reader

    def read_tables(self):
        """Load and display the source, warehouse and surrogate-key tables.

        Raises
        ------
        ValueError
            If ``self.dimension`` is not ``'dim_vendor'`` (previously this was
            a silent no-op that made ``handle_scd`` fail later).
        """
        if self.dimension != 'dim_vendor':
            raise ValueError(f"Dimension '{self.dimension}' is not supported.")

        table_reader = self.table_reader if self.table_reader is not None else ReadTableDummy()

        self.source_table = table_reader.read_table('dim_vendor')
        display(self.source_table.head(5))

        self.dwh_table = table_reader.read_table('dim_vendor_dwh')
        display(self.dwh_table.head(5))

        self.surrogate_key_table = table_reader.read_table('dim_vendor_dwh_most_recent_surrogate_key')
        display(self.surrogate_key_table.head(5))

    def _append_version(self, values, effective_start_date):
        """Append a new 'Current' row built from ``values``; return its surrogate key."""
        max_id = self.dwh_table['id_vendor'].max()
        # max() of an empty column is NaN; start numbering at 1 in that case.
        new_id = 1 if pd.isna(max_id) else max_id + 1
        new_record = pd.DataFrame({
            'id_vendor': [new_id],
            'vendor_name': [values['vendor_name']],
            'vendor_city': [values['vendor_city']],
            'vendor_status': [values['vendor_status']],
            'effective_start_date': [effective_start_date],
            'effective_end_date': [self._OPEN_END_DATE],
            'current_flag': ['Current'],
        })
        self.dwh_table = pd.concat([self.dwh_table, new_record], ignore_index=True)
        return new_id

    def _changes_type2_attribute(self, current_rows, values) -> bool:
        """Tell whether ``values`` differs from ``current_rows`` on a history-tracked attribute."""
        return any(
            attribute in values and (current_rows[attribute] != values[attribute]).any()
            for attribute in self.type2_attributes
        )

    def handle_scd(self, events: list) -> pd.DataFrame:
        """Apply ``events`` in order to ``self.dwh_table``.

        Each event is a dict with an ``'event'`` key (``'insert'`` or
        ``'update'``) and a ``'values'`` dict holding ``vendor_name``,
        ``vendor_city`` and ``vendor_status``. Events of any other kind are
        ignored, as before. Rows of ``self.dwh_table`` are modified in place.

        On completion ``self.new_dwh_table`` holds the updated table and
        ``self.new_surrogate_key_table`` the ``id_vendor`` / ``vendor_name``
        pairs of the rows flagged ``'Current'``.

        Returns
        -------
        pd.DataFrame
            The updated dimension table (same object as ``self.new_dwh_table``).

        Raises
        ------
        RuntimeError
            If the warehouse or surrogate-key table has not been loaded.
        ValueError
            If an update refers to a vendor name with no known surrogate key.
        """
        if self.dwh_table is None or self.surrogate_key_table is None:
            raise RuntimeError("Tables are not loaded; call read_tables() before handle_scd().")

        # vendor name -> surrogate key of its current row. Kept up to date
        # while the batch is processed so later events see rows created by
        # earlier ones. The first match wins, as with the previous lookup.
        current_keys = {}
        for name, key in zip(self.surrogate_key_table['vendor_name'],
                             self.surrogate_key_table['id_vendor']):
            current_keys.setdefault(name, key)

        for event in events:
            values = event['values']
            kind = event['event']

            if kind == 'insert':
                current_keys[values['vendor_name']] = self._append_version(
                    values, self._INSERT_START_DATE)

            elif kind == 'update':
                natural_key = values['vendor_name']
                if natural_key not in current_keys:
                    raise ValueError(f"Vendor '{natural_key}' not found in the surrogate key table.")

                is_target = self.dwh_table['id_vendor'] == current_keys[natural_key]

                if self._changes_type2_attribute(self.dwh_table.loc[is_target], values):
                    # Keep history: close the current version, then open a new one.
                    self.dwh_table.loc[is_target, 'effective_end_date'] = self._CHANGE_DATE
                    self.dwh_table.loc[is_target, 'current_flag'] = 'Expired'
                    current_keys[natural_key] = self._append_version(values, self._CHANGE_DATE)
                else:
                    # No tracked attribute changed: overwrite in place.
                    self.dwh_table.loc[is_target, 'vendor_city'] = values['vendor_city']
                    self.dwh_table.loc[is_target, 'vendor_status'] = values['vendor_status']

        self.new_dwh_table = self.dwh_table
        is_current = self.dwh_table['current_flag'] == 'Current'
        self.new_surrogate_key_table = self.dwh_table.loc[is_current, ['id_vendor', 'vendor_name']]
        return self.new_dwh_table

In [16]:
# Run the slowly-changing-dimension handler for the vendor dimension and show
# the first rows of the two tables it produces.
scd_handler = SlowlyChangingDimensionHandler('dim_vendor')
scd_handler.read_tables()
scd_handler.handle_scd(events)

display(
    scd_handler.new_dwh_table.head(10),
    scd_handler.new_surrogate_key_table.head(10),
)

Unnamed: 0,id_vendor,vendor_name,vendor_city,vendor_status,created_at,updated_at,deleted_at,altered_at
0,1,Casa Bajio,Ciudad de México,Activo,2025-10-01 01:00:00,,,2025-10-01 01:00:00
1,2,Libros Osom,Guadalajara,Activo,2025-10-01 01:00:00,,,2025-10-01 01:00:00
2,3,Librosom,Monterrey,Inactivo,2025-10-01 01:00:00,,,2025-10-01 01:00:00


Unnamed: 0,id_vendor,vendor_name,vendor_city,vendor_status,effective_start_date,effective_end_date,current_flag
0,1,Casa Bajio,Ciudad de México,Activo,1900-01-01,2100-01-01,Current
1,2,Editores Unidos,Guadalajara,Activo,1900-01-01,2100-01-01,Current
2,3,Librosom,Monterrey,Inactivo,1900-01-01,2100-01-01,Current


Unnamed: 0,id_vendor,vendor_name
0,1,Casa Bajio
1,2,Editores Unidos
2,3,Librosom


Unnamed: 0,id_vendor,vendor_name,vendor_city,vendor_status,effective_start_date,effective_end_date,current_flag
0,1,Casa Bajio,Ciudad de México,Activo,1900-01-01 00:00:00,2025-10-01 01:00:00,Expired
1,2,Editores Unidos,Guadalajara,Activo,1900-01-01 00:00:00,2025-10-01 01:00:00,Expired
2,3,Librosom,Monterrey,Activo,1900-01-01 00:00:00,2100-01-01 00:00:00,Current
3,4,Libreria Central,Guadalajara,Activo,1990-01-01 00:00:00,2100-01-01 00:00:00,Current
4,5,Editores Unidos,Tijuana,Activo,2025-10-01 01:00:00,2100-01-01 00:00:00,Current
5,6,Casa Bajio,León,Activo,2025-10-01 01:00:00,2100-01-01 00:00:00,Current


Unnamed: 0,id_vendor,vendor_name
2,3,Librosom
3,4,Libreria Central
4,5,Editores Unidos
5,6,Casa Bajio
