# 🧪 Notebook d'Intégration : Validation de `process_file`

Ce notebook exécute une batterie de tests d'intégration sur la méthode `process_file` de `BinanceDataProcessorV2`.

Objectifs :
- Vérifier la détection des unités de timestamps (s / ms / µs)
- Vérifier l'idempotence (OVERWRITE_OR_IGNORE)
- Vérifier le partitionnement year/month/day
- Vérifier la présence et le format de `ingest_id`
- Vérifier la robustesse face aux archives corrompues ou invalides
- Mesurer des métriques simples de performance
- Produire un rapport récapitulatif final

---
Exécution : la dernière cellule orchestre tous les tests automatiquement.


In [None]:
# 1. Imports & Paramètres de Test
import os, io, zipfile, datetime, tempfile, shutil, random, time, re
from pathlib import Path
import duckdb, polars as pl

# Fixed seed so that any use of `random` in the notebook is reproducible.
SEED = 42
random.seed(SEED)
pl.Config.set_tbl_rows(30)

# USE_MINIO is read as an integer flag from the environment ('0' / '1').
USE_MINIO = bool(int(os.getenv('USE_MINIO', '0')))
# Timezone-aware replacement for the deprecated datetime.utcnow(); the
# formatted string is identical (UTC wall-clock time, second resolution).
RUN_ID = datetime.datetime.now(datetime.timezone.utc).strftime('%Y%m%dT%H%M%S')
print(f"USE_MINIO={USE_MINIO} | RUN_ID={RUN_ID}")

In [None]:
# 2. Chargement / Définition de la classe cible
try:
    from pipeline.binance import BinanceDataProcessorV2  # type: ignore
    CLASS_IMPORTED = True
except Exception:
    # Deliberately broad: any failure while importing the project class falls
    # back to the local implementation below so the notebook stays runnable.
    CLASS_IMPORTED = False
    import duckdb, polars as pl, datetime, zipfile

    class BinanceDataProcessorV2:
        """Local fallback processor used when `pipeline.binance` cannot be imported.

        Reads a zipped kline CSV, derives a `datetime` column plus
        `year` / `month` / `day` partition columns, and writes the result as
        partitioned Parquet through a DuckDB connection.
        """

        def __init__(self, config: dict):
            """Store the configuration and initialise the run counters.

            Args:
                config: Settings dictionary; `setup_duckdb_connection` reads
                    the `minio_access_key`, `minio_secret_key` and
                    `minio_endpoint` keys from it.
            """
            self.config = config
            # Column names applied to the header-less CSV files.
            self.csv_columns = [
                'open_time', 'open', 'high', 'low', 'close', 'volume', 'close_time',
                'quote_asset_volume', 'number_of_trades', 'taker_buy_base_volume',
                'taker_buy_quote_volume', 'ignore'
            ]
            # One {'file': ..., 'error': ...} entry per failed process_file call.
            self.errors = []
            # Number of files processed successfully.
            self.processed_files = 0

        @staticmethod
        def _sql_quote(value) -> str:
            """Return `value` as text safe to embed between single quotes in SQL."""
            return str(value).replace("'", "''")

        def setup_duckdb_connection(self):
            """Open an in-memory DuckDB connection configured with the S3 settings.

            Returns:
                The configured DuckDB connection.
            """
            quote = self._sql_quote
            access_key = quote(self.config.get('minio_access_key', ''))
            secret_key = quote(self.config.get('minio_secret_key', ''))
            endpoint = quote(self.config.get('minio_endpoint', ''))
            con = duckdb.connect(database=':memory:')
            con.execute(f"""
                SET s3_access_key_id='{access_key}';
                SET s3_secret_access_key='{secret_key}';
                SET s3_endpoint='{endpoint}';
                SET s3_url_style='path';
                SET s3_use_ssl='false';
            """)
            return con

        def process_file(self, zip_path: Path, con: duckdb.DuckDBPyConnection, output_path: str):
            """Convert one zipped CSV into partitioned Parquet under `output_path`.

            The first `.csv` member of the archive is loaded. The timestamp
            unit is guessed from the first `open_time` value: above 1e13 it is
            treated as microseconds, above 1e10 as milliseconds, otherwise as
            seconds. Every row is tagged with an `ingest_id` made of the
            current UTC time and the archive file name.

            Args:
                zip_path: Path of the zip archive to ingest.
                con: DuckDB connection used to run the COPY statement.
                output_path: Destination directory / URL for the Parquet files.

            Returns:
                True on success. False on any failure, in which case the error
                is appended to `self.errors` instead of being raised.
            """
            try:
                with zipfile.ZipFile(str(zip_path)) as z:
                    csv_candidates = [n for n in z.namelist() if n.lower().endswith('.csv')]
                    if not csv_candidates:
                        raise ValueError('Aucun CSV dans le zip')
                    with z.open(csv_candidates[0]) as f:
                        df = pl.read_csv(f, has_header=False, new_columns=self.csv_columns)
                # Non-strict cast: unparsable values become null instead of raising.
                df = df.with_columns([
                    pl.col('open_time').cast(pl.Int64, strict=False),
                    pl.col('close_time').cast(pl.Int64, strict=False),
                ])
                first_ts = df.select(pl.col('open_time').first()).item()
                if first_ts is None:
                    # Without this guard the comparison below fails with an
                    # opaque "'>' not supported" TypeError.
                    raise ValueError('open_time vide ou non numérique')
                if first_ts > 10_000_000_000_000:
                    # Microseconds -> milliseconds.
                    ts_col = (pl.col('open_time') // 1000).cast(pl.Datetime('ms'))
                elif first_ts > 10_000_000_000:
                    # Already milliseconds.
                    ts_col = pl.col('open_time').cast(pl.Datetime('ms'))
                else:
                    # Seconds -> milliseconds.
                    ts_col = (pl.col('open_time') * 1000).cast(pl.Datetime('ms'))
                # Timezone-aware replacement for the deprecated utcnow();
                # the formatted value is unchanged.
                now_utc = datetime.datetime.now(datetime.timezone.utc)
                ingest_id = now_utc.strftime('%Y%m%dT%H%M%SZ') + '_' + zip_path.name
                df = df.with_columns([
                    ts_col.alias('datetime'),
                    pl.lit(ingest_id).alias('ingest_id'),
                ]).with_columns([
                    pl.col('datetime').dt.year().alias('year'),
                    pl.col('datetime').dt.month().alias('month'),
                    pl.col('datetime').dt.day().alias('day'),
                ])
                con.register('tmp_data', df.to_arrow())
                sql = f"""
                    COPY tmp_data
                    TO '{self._sql_quote(output_path)}'
                    WITH (FORMAT PARQUET, PARTITION_BY (year, month, day), OVERWRITE_OR_IGNORE TRUE)
                """
                con.execute(sql)
                self.processed_files += 1
                return True
            except Exception as e:
                self.errors.append({'file': zip_path.name, 'error': str(e)})
                return False
            finally:
                # Best-effort cleanup on both paths; a cleanup failure must not
                # mask the result of the ingestion itself.
                try:
                    con.execute("DROP VIEW IF EXISTS tmp_data")
                except Exception:
                    pass
print('Classe importée:', CLASS_IMPORTED)

In [None]:
# 3. Configuration Backend (MinIO ou FS Local)
from typing import Tuple
try:
    from minio import Minio
except ImportError:
    Minio = None

def build_output_path():
    """Return the output location for this test run.

    With USE_MINIO set, this is an `s3://bronze/...` URL ending in a slash.
    Otherwise it is a directory path under TMP_BASE, returned as a string
    (pathlib drops a trailing empty segment, so it has no trailing separator).
    Both locations end with a `test_int_<RUN_ID>` component.
    """
    run_dir = f'test_int_{RUN_ID}'
    if not USE_MINIO:
        local_dir = TMP_BASE.joinpath(
            'bronze', 'binance', 'data', 'spot', 'monthly',
            'klines', 'BTCUSDT', '4h', run_dir,
        )
        return str(local_dir)
    return f"s3://bronze/binance/data/spot/monthly/klines/BTCUSDT/4h/{run_dir}/"

if not USE_MINIO:
    # Local backend: work inside a fresh temporary directory with a
    # 'bronze' sub-directory.
    TMP_BASE = Path(tempfile.mkdtemp(prefix='process_file_it_'))
    (TMP_BASE / 'bronze').mkdir(parents=True, exist_ok=True)
    print('Répertoire local de test:', TMP_BASE)
else:
    # Explicit raise instead of `assert`: assertions are removed when Python
    # runs with -O, which would hide the missing dependency until a later,
    # less readable failure.
    if Minio is None:
        raise ImportError('minio package requis pour USE_MINIO=1')
    MINIO_ENDPOINT = os.getenv('MINIO_ENDPOINT','127.0.0.1:9000')
    MINIO_ACCESS_KEY = os.getenv('MINIO_ROOT_USER','minioadm')
    MINIO_SECRET_KEY = os.getenv('MINIO_ROOT_PASSWORD','minioadm')
    minio_client = Minio(MINIO_ENDPOINT, access_key=MINIO_ACCESS_KEY, secret_key=MINIO_SECRET_KEY, secure=False)
    # Create the target bucket on first use.
    if not minio_client.bucket_exists('bronze'):
        minio_client.make_bucket('bronze')
    print('Bucket bronze OK')

# Computed only after the backend setup: the local variant of
# build_output_path() reads TMP_BASE.
OUTPUT_PATH = build_output_path()
print('OUTPUT_PATH =', OUTPUT_PATH)


In [None]:
# 4. Utilitaires: Génération OHLCV synthétiques

def make_ohlcv_df(n_rows: int, start_ts_ms: int, step_ms: int) -> pl.DataFrame:
    """Build a synthetic OHLCV frame with deterministic, linearly growing values.

    Args:
        n_rows: Number of rows to generate.
        start_ts_ms: `open_time` of the first row.
        step_ms: Increment added to `open_time` for each following row; each
            row's `close_time` is its `open_time` plus this step.

    Returns:
        A DataFrame with the twelve kline columns, one generated row per index.
    """
    columns = [
        'open_time', 'open', 'high', 'low', 'close', 'volume', 'close_time',
        'quote_asset_volume', 'number_of_trades', 'taker_buy_base_volume',
        'taker_buy_quote_volume', 'ignore'
    ]
    rows = []
    for i in range(n_rows):
        ot = start_ts_ms + i * step_ms
        rows.append([
            ot,                # open_time
            100 + i * 0.1,     # open
            100 + i * 0.15,    # high
            100 + i * 0.05,    # low
            100 + i * 0.12,    # close
            10 + i,            # volume
            ot + step_ms,      # close_time
            1000 + i,          # quote_asset_volume
            50 + i,            # number_of_trades
            5 + i * 0.1,       # taker_buy_base_volume
            500 + i,           # taker_buy_quote_volume
            0                  # ignore
        ])
    # State the orientation explicitly: each inner list is one row. Left
    # unspecified, polars infers it from the data's dimensions, which is
    # ambiguous when n_rows equals the number of columns.
    return pl.DataFrame(rows, schema=columns, orient='row')

# 5. Utilitaires: Création ZIP test
def build_zip(df: pl.DataFrame, ts_unit: str, dest_zip: Path) -> Path:
    """Write `df` as a header-less CSV inside a new zip archive.

    The input frame is expected to carry `open_time` / `close_time` values
    that are rescaled according to `ts_unit`: integer-divided by 1000 for
    's', left untouched for 'ms', multiplied by 1000 for 'us'.

    Args:
        df: Source frame; it is cloned, not modified.
        ts_unit: One of 's', 'ms' or 'us'.
        dest_zip: Path of the archive to create.

    Returns:
        `dest_zip`, once the archive has been written.

    Raises:
        ValueError: If `ts_unit` is not one of the supported values.
    """
    ts_columns = ('open_time', 'close_time')
    if ts_unit == 's':
        rescaled = [(pl.col(name) // 1000).alias(name) for name in ts_columns]
    elif ts_unit == 'us':
        rescaled = [(pl.col(name) * 1000).alias(name) for name in ts_columns]
    elif ts_unit == 'ms':
        rescaled = []
    else:
        raise ValueError('ts_unit invalide')

    payload = df.clone()
    if rescaled:
        payload = payload.with_columns(rescaled)

    # Render the CSV in memory, then store it as the archive's single member.
    buffer = io.StringIO()
    payload.write_csv(buffer, include_header=False)
    csv_text = buffer.getvalue()
    with zipfile.ZipFile(dest_zip, 'w', compression=zipfile.ZIP_DEFLATED) as archive:
        archive.writestr('BTCUSDT-4h-test.csv', csv_text)
    return dest_zip


In [None]:
# 6. Fixture: init environnement

def init_env():
    """Create the processor and its DuckDB connection for the tests.

    Returns:
        A `(processor, con, OUTPUT_PATH)` tuple: the processor instance, the
        connection from its `setup_duckdb_connection()`, and the module-level
        output path.
    """
    dataset_settings = {
        'provider': 'binance',
        'data_type': 'data',
        'market': 'spot',
        'data_frequency': 'monthly',
        'data_category': 'klines',
        'symbol': 'BTCUSDT',
        'interval': '4h',
    }
    # Connection settings come from the environment, with local defaults.
    minio_settings = {
        'minio_access_key': os.getenv('MINIO_ROOT_USER','minioadm'),
        'minio_secret_key': os.getenv('MINIO_ROOT_PASSWORD','minioadm'),
        'minio_endpoint': os.getenv('MINIO_ENDPOINT','127.0.0.1:9000'),
    }
    proc = BinanceDataProcessorV2({**dataset_settings, **minio_settings})
    connection = proc.setup_duckdb_connection()
    return proc, connection, OUTPUT_PATH


# Build the fixtures once at module level.
processor, con, OUTPUT_PATH = init_env()
print('Environnement initialisé')