# Sectora Horizon
---
A Future-Focused Analytics System for Industries and Busisness.

In [1]:
from ingest.catalog import DatasetCatalog
from ingest.loader import RawDatasetLoader
from utils.uuid import generate_deterministic_id_name_based
from utils.clean import clean_text, is_numeric_string, normalize_code_to_length, normalize_text
from ingest.fetch.csv import CsvAdapter
from ingest.fetch.sct import SocrataAdapter
from ingest.fetch.excel import ExcelAdapter
import pandas as pd
import numpy as np
catalog = DatasetCatalog()
loader = RawDatasetLoader(
    csv_adapter=CsvAdapter(),
    sct_adapter=SocrataAdapter(),
    excel_adapter=ExcelAdapter(),
)

In [2]:
ds = catalog.get("master_csv")
records = list(loader.load(ds))
df = pd.DataFrame(records)

In [3]:
# El CSV debe tener columnas: 'year' y 'CPI' (índice anual)
cpi_ds = catalog.get("IPCanal")
records = list(loader.load(cpi_ds))
cpi = pd.DataFrame(records)
cpi

# normalizar nombres de columnas comunes
if 'Año de Corte'  in cpi.columns and 'year' not in cpi.columns:
    cpi = cpi.rename(columns={'Año de Corte':'year'})
if 'CPI' not in cpi.columns:
    # intentar detectar la columna de índice de precios
    possible = [c for c in cpi.columns if 'cpi' in c.lower() or 'ipc' in c.lower() or 'indice' in c.lower()]
    if possible:
        cpi = cpi.rename(columns={possible[0]:'CPI'})
cpi.columns = cpi.columns.map(lambda x: x.encode('utf-8').decode('utf-8-sig').strip())
print('CPI columns:', cpi.columns.tolist())
# merge
df = df.merge(cpi[['year','CPI']], on='year', how='left')
missing_cpi = df['CPI'].isnull().sum()
print(f'Missing CPI after merge: {missing_cpi} rows')


CPI columns: ['year', 'CPI']
Missing CPI after merge: 0 rows


In [None]:
# 2) Ajustar GANANCIA por inflación a precios del año base (último año disponible)
if 'CPI' in df.columns and df['CPI'].notna().any():
    # Limpiar CPI: convertir strings con comas a float (ej. '9,28' -> 9.28)
    def clean_cpi_value(val):
        if pd.isna(val):
            return pd.NA
        try:
            s = str(val).strip().replace(',', '.')
            return float(s)
        except Exception:
            return pd.NA
    df['CPI'] = df['CPI'].apply(clean_cpi_value)
    base_year = df['year'].max()
    base_cpi = df.loc[df['year'] == base_year, 'CPI'].dropna().unique()
    if len(base_cpi)==0:
        raise ValueError(f'No hay CPI para el año base {base_year}. Revisa el CSV de CPI.')
    base_cpi = float(clean_cpi_value(base_cpi[0]))
    # Convert temporalely 'ganancias' to numeric
    ganancias_num = df['ganancias'].astype(int)
    adjusted = ganancias_num * (base_cpi / df['CPI'])
    n_adjusted = int(adjusted.notna().sum())
    df['ganancias'] = adjusted
    print(f"Ajustadas {n_adjusted} filas de GANANCIA a precios del año {base_year}.")
else:
    print('No se puede ajustar por inflación: falta columna CPI.')

Ajustadas 40000 filas de GANANCIA a precios del año 2024.


In [None]:
# 3) Construir target t+1 y lags por NIT (ignorar gaps entre años)
df = df.sort_values(['nit','year']).reset_index(drop=True)

# target: GANANCIA_REAL del siguiente año disponible para cada NIT (ignora gaps)
df['ganancias_next'] = df.groupby('nit')['ganancias'].shift(-1)

# fix data type if required
if df['ingresos'].dtype in [str, object]:
    df['ingresos'] = df['ingresos'].astype(int)

# crear lags para ganacias e INGRESOS OPERACIONALES (todos los registros previos de la empresa)
for lag in range(1, 5):
    df[f'ganancia_lag_{lag}'] = df.groupby('nit')['ganancias'].shift(lag)
    df[f'ingresos_lag_{lag}'] = df.groupby('nit')['ingresos'].shift(lag)

# contar lags disponibles y crear indicador
df['n_ganancia_lags'] = df[[f'ganancia_lag_{i}' for i in range(1,5)]].notna().sum(axis=1)
df['n_ingresos_lags'] = df[[f'ingresos_lag_{i}' for i in range(1,5)]].notna().sum(axis=1)

# features adicionales: tasa de crecimiento 1 año (si existe lag_1) y media movil de 3 años
df['ganancia_grow_1y'] = df['ganancias'] / df['ganancia_lag_1'] - 1
df['ganancia_roll_mean_3'] = df.groupby('nit')['ganancias'].transform(lambda s: s.shift(1).rolling(3, min_periods=1).mean())

# preservar flag de outlier ya marcado
if 'is_outlier_kde' not in df.columns:
    df['is_outlier_kde'] = False

# Diagnóstico de target y lags
n_with_target = df['ganancias_next'].notna().sum()
n_with_lag1 = df['ganancia_lag_1'].notna().sum()
n_with_any_lag = (df['n_ganancia_lags'] > 0).sum()

print(f'Lags y target creados.')
print(f'  Filas con ganancias_next válido: {n_with_target}')
print(f'  Filas con ganancia_lag_1 válido: {n_with_lag1}')
print(f'  Filas con al menos 1 lag: {n_with_any_lag}')
print(f'  Distribución de n_ganancia_lags:')
print(df['n_ganancia_lags'].value_counts().sort_index())

Lags y target creados.
  Filas con ganancias_next válido: 26024
  Filas con ganancia_lag_1 válido: 26024
  Filas con al menos 1 lag: 26024
  Distribución de n_ganancia_lags:
n_ganancia_lags
0    13976
1    11026
2     8442
3     6556
Name: count, dtype: int64


In [6]:
# 4) Preparar dataset para modelado (no excluimos empresas; solo filas sin target se ignoran en entrenamiento)
feature_cols = [f'ganancia_lag_{i}' for i in range(1,5)] + [f'ingresos_lag_{i}' for i in range(1,5)] + ['n_ganancia_lags','n_ingresos_lags','ganancia_roll_mean_3','is_outlier_kde']
# añadir algunas columnas numéricas originales si interesa
for add in ['activos','pasivos','patrimonio']:
    if add in df.columns:
        feature_cols.append(add)

df_model = df[df['ganancias_next'].notna()].copy()  # filas con target t+1 disponibles
print('Filas disponibles para entrenamiento/evaluación:', df_model.shape[0])


Filas disponibles para entrenamiento/evaluación: 26024


### Feature Engineering

In [8]:
df_features = df_model.copy()

# A) Add sector dummies (companies in same sector may have similar patterns)
if 'macrosector' in df_features.columns:
    sector_dummies = pd.get_dummies(df_features['macrosector'], prefix='sector', drop_first=True)
    df_features = pd.concat([df_features, sector_dummies], axis=1)
    sector_cols = sector_dummies.columns.tolist()
    print(f'Added {len(sector_cols)} sector dummies')
else:
    sector_cols = []

# B) Add revenue as feature (larger companies may be more predictable)
if 'ingresos' in df_features.columns:
    df_features['ingresos_current'] = df_features['ingresos']

# C) Add profitability ratios from lag_1 (profit margin trend)
if 'ganancia_lag_1' in df_features.columns and 'ingresos_lag_1' in df_features.columns:
    # Avoid division by zero
    df_features['margin_lag_1'] = df_features['ganancia_lag_1'] / (df_features['ingresos_lag_1'].replace(0, np.nan) + 1e-3)
    df_features['margin_lag_1'] = df_features['margin_lag_1'].fillna(0)
    print('Added profit margin from lag_1')

# D) Add volatility indicator (if company had very different profits, harder to predict)
if 'ganancia_lag_1' in df_features.columns and 'ganancia_lag_2' in df_features.columns:
    df_features['ganancia_volatility'] = (df_features['ganancia_lag_1'] - df_features['ganancia_lag_2']).abs()
    df_features['ganancia_volatility'] = df_features['ganancia_volatility'].fillna(0)
    print('Added profit volatility feature')

# E) Add company size category (small/medium/large based on revenue)
if 'ingresos' in df_features.columns:
    df_features['size_category'] = pd.qcut(df_features['ingresos'].rank(method='first'),
                                            q=3, labels=['small','medium','large'], duplicates='drop')
    size_dummies = pd.get_dummies(df_features['size_category'], prefix='size', drop_first=True)
    df_features = pd.concat([df_features, size_dummies], axis=1)
    size_cols = size_dummies.columns.tolist()
    print(f'Added {len(size_cols)} size category dummies')

# Build updated feature list
new_feature_cols = feature_cols.copy()
new_feature_cols.extend(sector_cols)
new_feature_cols.extend(['ingresos_current'])
new_feature_cols.extend(['margin_lag_1', 'ganancia_volatility'])
new_feature_cols.extend(size_cols)

# Prepare updated X with new features
X_new = df_features[new_feature_cols].copy()
for col in X_new.columns:
    if X_new[col].dtype == 'object':
        X_new[col] = X_new[col].astype(int)
    else:
        X_new[col] = pd.to_numeric(X_new[col], errors='coerce')

# Impute NaNs in new features
from sklearn.impute import SimpleImputer
imp_new = SimpleImputer(strategy='median')
X_new_imputed = pd.DataFrame(imp_new.fit_transform(X_new), columns=X_new.columns)

print(f'Updated feature set: {X_new_imputed.shape[0]} rows, {X_new_imputed.shape[1]} features (was {X.shape[1]})')


Added profit margin from lag_1
Added profit volatility feature
Added 2 size category dummies


ValueError: invalid literal for int() with base 10: '21.0'

In [None]:
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler

# Use the engineered features (with sector dummies, volatility, etc.)
# Build comprehensive feature set
df_final = df_model.copy()

# Add sector dummies
sector_dummies = pd.get_dummies(df_final['MACROSECTOR'], prefix='sector', drop_first=True)
df_final = pd.concat([df_final, sector_dummies], axis=1)
sector_cols = sector_dummies.columns.tolist()

# Add revenue
df_final['ingresos_current'] = df_final['INGRESOS OPERACIONALES']

# Add margin and volatility
df_final['margin_lag_1'] = df_final['ganancia_lag_1'] / (df_final['ingresos_lag_1'].replace(0, np.nan) + 1e-3)
df_final['margin_lag_1'] = df_final['margin_lag_1'].fillna(0)
df_final['ganancia_volatility'] = (df_final['ganancia_lag_1'] - df_final['ganancia_lag_2']).abs().fillna(0)

# Prepare final feature list
final_feature_cols = feature_cols.copy() + sector_cols + ['ingresos_current', 'margin_lag_1', 'ganancia_volatility']

# Prepare X with proper imputation
X_final = df_final[final_feature_cols].copy()
for col in X_final.columns:
    X_final[col] = pd.to_numeric(X_final[col], errors='coerce')

# Impute missing values
imp_final = SimpleImputer(strategy='median')
X_final_imputed = pd.DataFrame(imp_final.fit_transform(X_final), columns=X_final.columns)

# Define targets
y_absolute = pd.to_numeric(df_final['GANANCIA_REAL_next'], errors='coerce')

y_margin = (df_final['GANANCIA_REAL_next'] / 
            (df_final['INGRESOS OPERACIONALES'].replace(0, np.nan) + 1e-3)) * 100
y_margin = y_margin.fillna(0)

print(f'\nFeature set: {X_final_imputed.shape[0]} rows, {X_final_imputed.shape[1]} features')
print(f'  Includes: {len(sector_cols)} sector dummies, volatility, margin, revenue features')

# Train with PROFIT MARGIN target using GroupKFold
print('\n' + '='*80)
print('TRAINING WITH PROFIT MARGIN TARGET (GroupKFold)')
print('='*80)

gkf_final = GroupKFold(n_splits=5)
margin_rmses, margin_maes = [], []
fold_num = 0

for train_idx, val_idx in gkf_final.split(X_final_imputed, groups=df_final['NIT'].values):
    fold_num += 1
    X_tr = X_final_imputed.iloc[train_idx]
    X_val = X_final_imputed.iloc[val_idx]
    y_tr = y_margin.values[train_idx]
    y_val = y_margin.values[val_idx]
    
    val_nits = df_final.iloc[val_idx]['NIT'].nunique()
    
    try:
        import lightgbm as lgb
        dtrain = lgb.Dataset(X_tr.values, label=y_tr)
        # Tuned params: lower learning rate, higher regularization for robustness
        params = {
            'objective':'regression',
            'metric':'rmse',
            'verbosity':-1,
            'num_leaves':15,
            'learning_rate':0.02,
            'reg_alpha':0.5,
            'reg_lambda':0.5,
            'min_data_in_leaf':10,
            'max_depth':6
        }
        bst = lgb.train(params, dtrain, num_boost_round=500)
        preds = bst.predict(X_val.values)
    except Exception as e:
        imp_fold = SimpleImputer(strategy='median')
        X_tr_imp = imp_fold.fit_transform(X_tr)
        X_val_imp = imp_fold.transform(X_val)
        rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
        rf.fit(X_tr_imp, y_tr)
        preds = rf.predict(X_val_imp)
    
    preds = np.asarray(preds)
    mask_ok = ~np.isnan(preds) & ~np.isnan(y_val)
    
    if mask_ok.sum() > 0:
        rmse = np.sqrt(mean_squared_error(y_val[mask_ok], preds[mask_ok]))
        mae = mean_absolute_error(y_val[mask_ok], preds[mask_ok])
        margin_rmses.append(rmse)
        margin_maes.append(mae)
        print(f'Fold {fold_num}: RMSE {rmse:.2f}%, MAE {mae:.2f}% | Val NITs: {val_nits}')

if margin_rmses:
    mean_rmse = np.mean(margin_rmses)
    std_rmse = np.std(margin_rmses)
    mean_mae = np.mean(margin_maes)
    std_mae = np.std(margin_maes)
    
    print(f'\n✓ PROFIT MARGIN GroupKFold Results:')
    print(f'  CV Mean RMSE: {mean_rmse:.2f}% (+/- {std_rmse:.2f}%)')
    print(f'  CV Mean MAE:  {mean_mae:.2f}% (+/- {std_mae:.2f}%)')
    print(f'  Improvement: Std Dev {std_rmse:.2f} (vs {np.std(gkf_rmses):.2f} for absolute profit)')
    
    if std_rmse < np.std(gkf_rmses):
        print(f'\n MARGIN TARGET IS MORE STABLE (+{((np.std(gkf_rmses) - std_rmse) / np.std(gkf_rmses) * 100):.1f}% improvement)')
    else:
        print(f'\n Margin still unstable; consider sector-specific models')