## Configs iniciais

In [5]:
# === 0) Settings ===
# GitHub coordinates of the project repository.
GITHUB_USER = "edumcm"
REPO_NAME = "tcc-demand-forecasting"
REPO_URL = "https://github.com/{}/{}.git".format(GITHUB_USER, REPO_NAME)

# === 1) Mount Google Drive ===
# Drive is mounted under /content/drive; everything persistent lives below it.
from google.colab import drive
drive.mount('/content/drive')

# === 2) Define paths ===
import os, pathlib, random, numpy as np, textwrap, shutil

# Base working folder on Drive; the repository checkout sits directly inside it.
DRIVE_BASE = "/content/drive/MyDrive/tcc-modelo"
REPO_DIR = os.path.join(DRIVE_BASE, REPO_NAME)
TMP_CLONE = os.path.join("/content", REPO_NAME)

# Create the base folder (and any missing parents); no-op when it already exists.
os.makedirs(DRIVE_BASE, exist_ok=True)

# === 3) Garantir repo no Drive ===
if os.path.exists(REPO_DIR):
    # Já existe no Drive -> NÃO dar pull automático
    %cd {REPO_DIR}
    BRANCH = "main"  # ajuste se for 'master'
    # (Opcional) só avisa se o remoto avançou
    !git fetch -q origin
    ahead_behind = !git rev-list --left-right --count HEAD...origin/{BRANCH}
    if ahead_behind:
        ahead, behind = map(int, ahead_behind[0].split())
        if behind > 0:
            print(f"Atenção: o remoto está {behind} commit(s) à frente. "
                  "Como o fluxo é só push, ignore ou faça pull manualmente se quiser.")
else:
    # Não existe em lugar nenhum -> clonar direto no Drive
    %cd {DRIVE_BASE}
    !git clone {REPO_URL} {REPO_NAME}
    %cd {REPO_DIR}

# === 4) Install dependencies ===
# requirements.txt is resolved relative to the current working directory,
# which the %cd calls in step 3 leave at REPO_DIR.
%pip install -q -r requirements.txt

# === 5) Configure data/output paths (OUTSIDE the repo) ===
# Derived from DRIVE_BASE so that changing the base folder in one place moves
# data, artifacts and reports together. The values are exported as environment
# variables and the folders are created if missing.
os.environ["DATA_DIR"]      = f"{DRIVE_BASE}/data"
os.environ["ARTIFACTS_DIR"] = f"{DRIVE_BASE}/artifacts"
os.environ["REPORTS_DIR"]   = f"{DRIVE_BASE}/reports"
for d in [os.environ["DATA_DIR"], os.environ["ARTIFACTS_DIR"], os.environ["REPORTS_DIR"]]:
    os.makedirs(d, exist_ok=True)

# === 6) Seeds and threads ===
SEED = 42

# Seed NumPy's legacy global generator and Python's `random` module.
np.random.seed(SEED)
random.seed(SEED)

# Export the hash seed and pin the native math libraries to a single thread.
# NOTE(review): numpy is imported before this point in the cell, so the
# *_NUM_THREADS values may not affect an already-initialised BLAS/OpenMP
# runtime in this process - confirm, or set them before the import.
os.environ.update({
    "PYTHONHASHSEED": str(SEED),
    "OMP_NUM_THREADS": "1",
    "OPENBLAS_NUM_THREADS": "1",
    "MKL_NUM_THREADS": "1",
})

# === 7) Check GPU ===
import tensorflow as tf

# Report the TensorFlow version, how many GPUs it can see, and the data folder.
gpu_devices = tf.config.list_physical_devices('GPU')
print("TensorFlow:", tf.__version__, "| GPUs disponíveis:", len(gpu_devices))
print("DATA_DIR:", os.environ["DATA_DIR"])


Mounted at /content/drive
/content/drive/MyDrive/tcc-modelo/tcc-demand-forecasting
TensorFlow: 2.19.0 | GPUs disponíveis: 1
DATA_DIR: /content/drive/MyDrive/tcc-modelo/data


## configs yaml

In [None]:
import os, yaml, datetime, pathlib, json, sys

# Project layout: source code under src/, YAML configs under configs/.
PROJECT_ROOT = REPO_DIR
SRC_DIR = os.path.join(PROJECT_ROOT, "src")
CONFIG_DIR = os.path.join(PROJECT_ROOT, "configs")

# Make sure both folders exist before anything is written into them.
os.makedirs(SRC_DIR, exist_ok=True)
os.makedirs(CONFIG_DIR, exist_ok=True)

# Expose src/ to the import system once (re-running the cell adds nothing).
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

# 1) data.yaml
# Paths come from the DATA_DIR / ARTIFACTS_DIR / REPORTS_DIR environment
# variables; each dataset stage maps to a sub-folder of DATA_DIR.
data_cfg = {
    "paths": {
        "data_dir": os.environ["DATA_DIR"],
        "artifacts_dir": os.environ["ARTIFACTS_DIR"],
        "reports_dir": os.environ["REPORTS_DIR"],
    },
    "datasets": {
        "olist": {
            f"{stage}_dir": f'{os.environ["DATA_DIR"]}/{stage}'
            for stage in ("raw", "interim", "processed")
        },
    },
    "timezone": "America/Fortaleza",
    "date_format": "%Y-%m-%d",
}

# 2) cv.yaml (backtest)
# Settings for the expanding-window backtest.
cv_cfg = dict(
    strategy="expanding_window",
    horizons=[7, 28],
    n_folds=6,
    gap_days=2,
    stride=7,
    group_col="sku_id",  # adjust after exploratory analysis
    time_col="order_purchase_timestamp",
)

# 3) features.yaml
# Feature-engineering settings: target/id/time columns, calendar flags,
# lags, rolling statistics and price features.
feat_cfg = dict(
    target="qty",  # adjust after exploratory analysis
    id_col="sku_id",
    time_col="order_purchase_timestamp",
    calendar=["dow", "dom", "week", "month", "is_holiday", "bf_flag"],
    lags=[1, 7, 28],
    # Every rolling statistic uses lag 1; only the window and function vary.
    rollings=[
        {"lag": 1, "window": window, "fn": fn}
        for window, fn in ((7, "mean"), (28, "mean"), (7, "std"))
    ],
    price_features=["price_rel_sku", "price_rel_cat"],  # adjust after exploratory analysis
    leakage_guard=True,
)

# Write each config to CONFIG_DIR, skipping any file that is already on disk
# so an existing file is never overwritten.
config_files = {"data.yaml": data_cfg, "cv.yaml": cv_cfg, "features.yaml": feat_cfg}
for name, cfg in config_files.items():
    p = os.path.join(CONFIG_DIR, name)
    if os.path.exists(p):
        continue
    with open(p, "w") as f:
        # sort_keys=False keeps the key order used in the dicts above.
        yaml.safe_dump(cfg, f, sort_keys=False)
    print("Criado:", p)


## git ignore

In [None]:
# Append the project's ignore rules to an existing .gitignore.
# The cell is idempotent: a group of rules is appended only when at least one
# of its lines is not already in the file, so re-running it adds no duplicates.
gi = f"{PROJECT_ROOT}/.gitignore"
lines = [
    "\n# Local/colab\n.ipynb_checkpoints/\n*.ipynb_meta.json\n",
    "# Environments / caches\n__pycache__/\n*.pyc\n",
    "# Data & outputs (mantidos fora do repo, mas por via das dúvidas)\n/data/\n/artifacts/\n/reports/\n",
]
if os.path.exists(gi):
    # Explicit UTF-8: the entries contain non-ASCII characters.
    with open(gi, "r", encoding="utf-8") as f:
        current = f.read()
    present = {entry.strip() for entry in current.splitlines()}
    missing = [
        chunk for chunk in lines
        if any(entry.strip() not in present
               for entry in chunk.splitlines() if entry.strip())
    ]
    if missing:
        with open(gi, "a", encoding="utf-8") as f:
            # Avoid gluing the first new line onto an unterminated last line.
            if current and not current.endswith("\n"):
                f.write("\n")
            f.writelines(missing)
        print("Atualizado .gitignore")
    else:
        print(".gitignore já contém todas as entradas")
else:
    # Same as before: a missing file is not created, but say so explicitly.
    print(f"Aviso: {gi} não existe; nada foi adicionado")

Atualizado .gitignore


## snapshot de ambiente

In [None]:
snap_dir = f'{os.environ["ARTIFACTS_DIR"]}/config_snapshots'
os.makedirs(snap_dir, exist_ok=True)
ts = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
!pip freeze > "{snap_dir}/requirements-{ts}.txt"
print("Snapshot salvo em:", f"{snap_dir}/requirements-{ts}.txt")

Snapshot salvo em: /content/drive/MyDrive/tcc-modelo/artifacts/config_snapshots/requirements-20250923-223540.txt


## seed hard + logging enxuto

In [None]:
import logging, random, numpy as np, os, tensorflow as tf
# Lean root-logger setup: INFO level, time-only timestamps, and a
# "time | level | logger name | message" line layout.
_LOG_OPTIONS = {
    "level": logging.INFO,
    "format": "%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    "datefmt": "%H:%M:%S",
}
logging.basicConfig(**_LOG_OPTIONS)

def fix_seeds(seed=42):
    """Seed Python's ``random``, NumPy and TensorFlow with the same value.

    Also writes ``PYTHONHASHSEED`` (the seed as a string) and
    ``TF_DETERMINISTIC_OPS`` ("1") into the process environment.

    Args:
        seed: Seed passed to every generator. Defaults to 42.
    """
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)

    os.environ.update({
        # NOTE(review): presumably only affects processes started after this
        # point, since the running interpreter has already initialised its
        # hashing - confirm.
        "PYTHONHASHSEED": str(seed),
        # Optional: try for determinism (may reduce performance).
        # NOTE(review): recent TensorFlow versions document
        # tf.config.experimental.enable_op_determinism() for this; confirm the
        # installed version still honours the environment variable.
        "TF_DETERMINISTIC_OPS": "1",
    })


fix_seeds(42)

## criando um loader.py

In [6]:
%%writefile src/data/loader.py
import os
import glob
import pandas as pd
import yaml

def load_dataset(cfg_path: str, dataset: str = "olist", stage: str = "raw") -> dict:
    """
    Read every CSV of a dataset at a given stage (raw, interim, processed).

    The directory is looked up in the YAML config under
    ``datasets -> <dataset> -> <stage>_dir``.

    Args:
        cfg_path: Path to the YAML data config file.
        dataset: Key of the dataset inside the config's ``datasets`` mapping.
        stage: Stage name; ``"<stage>_dir"`` must exist for the dataset.

    Returns:
        A dict ``{file_name: DataFrame}`` ordered by file path; empty when the
        directory holds no ``*.csv`` files.

    Raises:
        KeyError: If the config has no directory entry for the requested
            dataset/stage combination.
    """
    with open(cfg_path, "r", encoding="utf-8") as f:
        data_cfg = yaml.safe_load(f)

    try:
        base_dir = data_cfg["datasets"][dataset][f"{stage}_dir"]
    except KeyError as exc:
        # Same exception type as before, but say which lookup failed.
        raise KeyError(
            f"No '{stage}_dir' entry for dataset '{dataset}' in {cfg_path}"
        ) from exc

    # glob returns matches in filesystem-dependent order; sort so the resulting
    # dict has the same order on every run and machine.
    csv_files = sorted(glob.glob(os.path.join(base_dir, "*.csv")))
    return {os.path.basename(path): pd.read_csv(path) for path in csv_files}

Writing src/data/loader.py
