# 🧪 Feature Store Test - Développement de Nouvelles Features

## Architecture
Ce notebook facilite le développement et le test de nouvelles features pour le backtesting :
- **Source** : Gold Features principales (production)
- **Traitement** : Ajout de nouvelles features expérimentales
- **Sortie** : Table Gold Test pour backtesting avancé
- **Cohérence** : Préservation de la continuité des calculs sur tout l'historique

## Workflow
1. **Configuration** : Paramètres et chemins
2. **Chargement** : Features Gold existantes
3. **Ajout Features** : Nouvelles features expérimentales
4. **Sauvegarde** : Table Gold Test pour backtesting

---

## 1. 📦 Configuration

In [None]:
import polars as pl
import duckdb
import numpy as np
import talib as ta
from datetime import datetime, timedelta
from dataclasses import dataclass
from typing import Optional, Dict, List
import time

@dataclass
class FeatureStoreTestConfig:
    """Settings for the Feature Store test notebook.

    Groups the dataset identifiers, bucket names, chunking parameters and
    MinIO connection values, and derives the table names and S3 paths used
    to read the Gold features and write the Gold Test features.
    """

    # === DATA PARAMETERS ===
    provider: str = "binance"
    market: str = "spot"
    data_frequency: str = "monthly"
    data_category: str = "klines"
    symbol: str = "BTCUSDT"
    interval: str = "4h"

    # === MEDALLION ARCHITECTURE ===
    gold_bucket: str = "gold"
    test_bucket: str = "test"

    # === PERIOD ===
    # Start of the history to process; None means the whole history.
    start_date: Optional[str] = "2017-09-01"

    # === CHUNKING & CONTINUITY ===
    chunk_size: int = 100_000  # rows per chunk
    context_buffer: int = 200  # wider buffer for complex features

    # === NEW FEATURES (to be developed) ===
    # Parameters for experimental features. None is replaced by a default
    # parameter set in __post_init__, hence the Optional annotation.
    experimental_features: Optional[Dict] = None

    # === MINIO ===
    minio_endpoint: str = "127.0.0.1:9000"
    minio_access: str = "minioadm"
    minio_secret: str = "minioadm"

    def __post_init__(self):
        """Fill in the default experimental feature parameters when none are given."""
        if self.experimental_features is None:
            # Built per instance so that instances never share one mutable dict.
            self.experimental_features = {
                # Example parameters for future features
                "volume_profile": {"bins": 20, "lookback": 100},
                "support_resistance": {"window": 50, "min_strength": 3},
                "market_regime": {"volatility_window": 30, "trend_window": 50}
            }

    # === COMPUTED PATHS (Hermes architecture) ===
    @property
    def _table_suffix(self) -> str:
        """Identifier part shared by the source and test table names."""
        return (
            f"{self.market}_{self.data_frequency}_{self.data_category}"
            f"_{self.symbol}_{self.interval}"
        )

    @property
    def feature_store_table(self) -> str:
        """Name of the main Gold Features table."""
        return f"gold_features_{self._table_suffix}"

    @property
    def feature_store_test_table(self) -> str:
        """Name of the Gold Features Test table (with the new features)."""
        return f"gold_features_test_{self._table_suffix}"

    @property
    def source_path(self) -> str:
        """S3 glob matching the parquet files of the main Gold Features table."""
        return f"s3://{self.gold_bucket}/{self.feature_store_table}/**/*.parquet"

    @property
    def output_path(self) -> str:
        """S3 prefix (with trailing slash) under which Gold Features Test is written."""
        return f"s3://{self.test_bucket}/{self.feature_store_test_table}/"

# Build the notebook-wide configuration from its defaults and echo the
# settings and derived paths.
config = FeatureStoreTestConfig()
print(
    f"✅ Configuration chargée - {config.provider} {config.symbol} {config.interval}",
    f"📅 Période: {config.start_date or 'Tout historique'} → maintenant",
    f"🔄 Chunks: {config.chunk_size:,} lignes avec buffer {config.context_buffer}",
    sep="\n",
)
print(
    "\n📁 CHEMINS ARCHITECTURE HERMES:",
    f"   • Source: {config.source_path}",
    f"   • Test Output: {config.output_path}",
    f"   • Table Source: {config.feature_store_table}",
    f"   • Table Test: {config.feature_store_test_table}",
    sep="\n",
)

## 2. 🔌 Connexion & Analyse

In [None]:
# DuckDB + MinIO initialisation: a fresh DuckDB connection whose S3 settings
# use the endpoint and credentials held in `config`.
con = duckdb.connect()
con.execute(f"SET s3_endpoint='{config.minio_endpoint}';")
con.execute(f"SET s3_access_key_id='{config.minio_access}';")
con.execute(f"SET s3_secret_access_key='{config.minio_secret}';")
con.execute("SET s3_url_style='path'; SET s3_use_ssl='false';")
con.execute("SET threads TO 6; SET memory_limit = '4GB';")

# Quick profile of the source data. The date filter is only added when a
# start date is configured.
date_filter = f"AND datetime >= '{config.start_date}'" if config.start_date else ""

try:
    summary = con.execute(f"""
        SELECT 
            COUNT(*) as total_rows,
            MIN(datetime) as start_date,
            MAX(datetime) as end_date,
            COUNT(DISTINCT DATE_TRUNC('day', datetime)) as unique_days
        FROM read_parquet('{config.source_path}')
        WHERE symbol = '{config.symbol}' {date_filter}
    """).fetchone()
    
    total_rows, start_date, end_date, days = summary
    # Ceiling division so the estimate equals the number of iterations of
    # range(0, total_rows, chunk_size); "// chunk_size + 1" over-counted by
    # one whenever total_rows was an exact multiple of chunk_size.
    estimated_chunks = (total_rows + config.chunk_size - 1) // config.chunk_size
    
    print(f"📊 DONNÉES SOURCE DISPONIBLES:")
    print(f"   • Lignes totales: {total_rows:,}")
    print(f"   • Période: {start_date} → {end_date}")
    print(f"   • Jours uniques: {days:,}")
    print(f"   • Chunks estimés: {estimated_chunks:,}")
    # Rough estimate of 3 seconds per chunk.
    print(f"   • Temps estimé: ~{estimated_chunks * 3:.0f} secondes")
    
    # Inspect the available columns (DESCRIBE returns one row per column,
    # with the column name first).
    schema_result = con.execute(f"""
        DESCRIBE SELECT * FROM read_parquet('{config.source_path}') LIMIT 1
    """).fetchall()
    
    available_columns = [row[0] for row in schema_result]
    print(f"\n📊 COLONNES DISPONIBLES ({len(available_columns)}):")
    
    # Split into base OHLCV/identifier columns and everything else.
    base_cols = [col for col in available_columns if col in ['datetime', 'symbol', 'open', 'high', 'low', 'close', 'volume']]
    indicator_cols = [col for col in available_columns if col not in base_cols]
    
    print(f"   • Colonnes de base: {base_cols}")
    print(f"   • Indicateurs disponibles: {len(indicator_cols)}")
    if len(indicator_cols) > 0:
        print(f"     - Premiers indicateurs: {indicator_cols[:10]}")
        if len(indicator_cols) > 10:
            print(f"     - ... et {len(indicator_cols)-10} autres")
            
except Exception as e:
    # Any failure (connection, missing files, query error) is reported and
    # leaves the notebook in a "nothing to process" state for later cells.
    print(f"❌ Erreur lors de l'analyse: {e}")
    total_rows, estimated_chunks = 0, 0

## 3. 🧪 Générateur de Nouvelles Features

In [None]:
class ExperimentalFeatureGenerator:
    """Extension point for adding experimental feature columns.

    New feature expressions are meant to be added in
    ``add_experimental_features``; ``validate_features`` reports null counts
    on a resulting frame.
    """

    def __init__(self, config: FeatureStoreTestConfig):
        self.config = config

    def add_experimental_features(self, df: pl.DataFrame) -> pl.DataFrame:
        """Return a new frame made of ``df`` plus the tracking metadata columns.

        No experimental feature is implemented yet; the commented sections
        below are templates for future ones. The input frame is not modified.
        """
        print(f"🧪 Ajout de features expérimentales sur {len(df)} lignes...")

        # === TEMPLATE FOR FUTURE FEATURES ===
        # Example structure for adding new features (apply to the frame
        # before the metadata step below):

        # 1. Volume-based features
        # enhanced_df = enhanced_df.with_columns([
        #     # Volume profile or relative volume
        #     (pl.col('volume') / pl.col('volume').rolling_mean(20)).alias('volume_relative'),
        #     # Volume momentum
        #     (pl.col('volume') / pl.col('volume').shift(1) - 1).alias('volume_momentum')
        # ])

        # 2. Advanced volatility features
        # enhanced_df = enhanced_df.with_columns([
        #     # Realized volatility
        #     ((pl.col('high') - pl.col('low')) / pl.col('close')).rolling_std(14).alias('realized_volatility'),
        #     # Asymmetric volatility
        #     pl.when(pl.col('close') > pl.col('close').shift(1))
        #       .then((pl.col('close') / pl.col('close').shift(1) - 1))
        #       .otherwise(0).rolling_std(14).alias('upside_volatility')
        # ])

        # 3. Microstructure features
        # enhanced_df = enhanced_df.with_columns([
        #     # Estimated bid-ask spread
        #     ((pl.col('high') - pl.col('low')) / pl.col('close')).alias('estimated_spread'),
        #     # Estimated price impact
        #     (abs(pl.col('close') - pl.col('open')) / pl.col('volume')).alias('price_impact')
        # ])

        # === TRACKING METADATA ===
        tracking_columns = [
            # Generation timestamp
            pl.lit(datetime.now().isoformat()).alias('features_generated_at'),
            # Feature set version
            pl.lit('v1.0.0').alias('features_version'),
            # Table type
            pl.lit('test').alias('table_type'),
        ]
        # with_columns builds a new frame, so the caller's frame stays untouched.
        enhanced_df = df.with_columns(tracking_columns)

        # Report how many columns were gained.
        cols_before = len(df.columns)
        cols_after = len(enhanced_df.columns)

        print(f"   ✅ {cols_after - cols_before} nouvelles features ajoutées")
        print(f"   📊 Total colonnes: {cols_before} → {cols_after}")

        return enhanced_df

    def validate_features(self, df: pl.DataFrame) -> Dict:
        """Summarise row/column counts and null values of ``df``.

        Returns a dict with ``total_rows``, ``total_columns``, ``null_counts``
        (only columns having at least one null) and ``warnings`` (one message
        per column whose nulls exceed 10% of the rows).
        """
        row_total = len(df)
        report = {
            'total_rows': row_total,
            'total_columns': len(df.columns),
            'null_counts': {},
            'warnings': []
        }

        # Warn once a column has more than 10% nulls.
        warn_threshold = row_total * 0.1

        for name in df.columns:
            missing = df.select(pl.col(name).is_null().sum()).item()
            if not missing > 0:
                continue
            report['null_counts'][name] = missing
            if missing > warn_threshold:
                report['warnings'].append(f"Colonne {name}: {missing} valeurs nulles ({missing/row_total*100:.1f}%)")

        return report

# Create the generator used by the chunked processing cell.
feature_generator = ExperimentalFeatureGenerator(config)

print(
    "🧪 Générateur de features expérimentales initialisé",
    "💡 Prêt pour l'ajout de nouvelles features personnalisées",
    "📝 Modifiez la méthode add_experimental_features() pour ajouter vos features",
    sep="\n",
)

## 4. 🔄 Traitement par Chunks avec Continuité

In [None]:
def process_features_chunked() -> List[pl.DataFrame]:
    """Run the experimental feature generator over the source data chunk by chunk.

    Reads module-level state: ``total_rows`` and ``estimated_chunks`` (set by
    the analysis cell), ``config``, ``con`` and ``feature_generator``.

    For every chunk after the first, the last ``config.context_buffer`` source
    rows already seen are prepended before feature generation, so features
    computed over a window see continuous history; those context rows are then
    dropped from the stored result.

    Returns:
        One enriched DataFrame per chunk that produced rows, in processing
        order. Empty when ``total_rows`` is 0. Processing stops at the first
        empty chunk or error; chunks completed before that are still returned.
    """

    if total_rows == 0:
        print("❌ Pas de données à traiter")
        return []

    print("🚀 TRAITEMENT FEATURES PAR CHUNKS")
    print("=" * 45)

    all_results = []
    # Trailing *source* rows of the data seen so far. They are kept with the
    # source schema (not the enriched one) so they can be concatenated with
    # the next raw chunk: keeping enriched rows made pl.concat fail on the
    # second chunk because of the extra metadata columns.
    context_buffer = None
    start_time = time.time()

    # Date filter
    date_filter = f"AND datetime >= '{config.start_date}'" if config.start_date else ""

    # Chunk-by-chunk processing
    for offset in range(0, total_rows, config.chunk_size):
        chunk_num = (offset // config.chunk_size) + 1
        current_size = min(config.chunk_size, total_rows - offset)

        print(f"[{chunk_num:>3}/{estimated_chunks}] Chunk {offset:,}-{offset+current_size:,}", end=" | ")

        try:
            # === LOAD CHUNK ===
            chunk_start = time.time()

            # NOTE(review): LIMIT/OFFSET paging assumes ORDER BY datetime gives
            # the same row order on every query - confirm datetimes are unique
            # per symbol.
            query = f"""
                SELECT *
                FROM read_parquet('{config.source_path}')
                WHERE symbol = '{config.symbol}' {date_filter}
                ORDER BY datetime
                LIMIT {current_size} OFFSET {offset}
            """

            chunk_df = pl.from_arrow(con.execute(query).arrow())

            if len(chunk_df) == 0:
                print("⚠️ Chunk vide")
                break

            # === PREPEND CONTEXT ===
            # Number of leading context rows to drop again after generation;
            # stays 0 when nothing was prepended.
            context_size = 0
            if context_buffer is not None:
                # Avoid temporal duplicates between context and new rows.
                last_context_time = context_buffer['datetime'].max()
                chunk_df = chunk_df.filter(pl.col('datetime') > last_context_time)

                if len(chunk_df) > 0:
                    context_size = len(context_buffer)
                    chunk_df = pl.concat([context_buffer, chunk_df])

            # === GENERATE NEW FEATURES ===
            enhanced_df = feature_generator.add_experimental_features(chunk_df)

            # === EXTRACT RESULTS ===
            # Assumes the generator keeps row count and order, so the context
            # rows are exactly the first `context_size` rows.
            if context_size > 0:
                result_df = enhanced_df.slice(context_size)  # skip context
            else:
                result_df = enhanced_df

            # === UPDATE CONTEXT ===
            # A zero-sized buffer is never stored: its max datetime would be
            # null and the duplicate filter above would then drop every row.
            if config.context_buffer > 0 and len(chunk_df) > 0:
                context_buffer = chunk_df.tail(config.context_buffer)

            # === VALIDATION ===
            validation = feature_generator.validate_features(result_df)

            # === METRICS ===
            chunk_time = time.time() - chunk_start
            rows_per_sec = len(result_df) / max(chunk_time, 0.001)

            print(f"{len(result_df):>5} lignes | {validation['total_columns']:>3} cols | ⚡ {chunk_time:.1f}s | {rows_per_sec:>6.0f} l/s")

            # Show validation warnings
            for warning in validation.get('warnings', []):
                print(f"    ⚠️ {warning}")

            # Collect the result
            if len(result_df) > 0:
                all_results.append(result_df)

        except Exception as e:
            # Best effort: report, stop, and return what was processed so far.
            print(f"❌ Erreur: {e}")
            break

    # === SUMMARY ===
    total_time = time.time() - start_time
    total_processed = sum(len(df) for df in all_results)
    total_columns = len(all_results[0].columns) if all_results else 0

    # `available_columns` is a module-level name set by the analysis cell, so
    # it must be looked up in globals(); inside this function locals() never
    # contains it and the summary always showed 'N/A'.
    source_columns = globals().get('available_columns')
    features_added = total_columns - len(source_columns) if source_columns is not None else 'N/A'

    print("=" * 45)
    print(f"✅ TRAITEMENT TERMINÉ")
    print(f"📊 Lignes traitées: {total_processed:,}")
    print(f"📊 Colonnes finales: {total_columns}")
    print(f"🧪 Features ajoutées: {features_added}")
    print(f"⏱️ Temps total: {total_time:.1f}s")
    print(f"⚡ Performance: {total_processed/max(total_time, 0.1):,.0f} lignes/sec")

    return all_results

# Run the chunked processing only when the analysis cell found source rows.
enhanced_chunks = process_features_chunked() if total_rows > 0 else []

## 5. 💾 Sauvegarde Gold Features Test

In [None]:
def save_to_gold_test(chunks: List[pl.DataFrame]) -> bool:
    """Save the enriched features to the Gold Test layer.

    Concatenates ``chunks``, adds layer/partition metadata columns, and writes
    the result plus a one-row metadata table as parquet files under
    ``config.output_path`` in a ``version_<timestamp>/`` folder, through the
    module-level DuckDB connection ``con``.

    If anything in that process raises, the concatenated chunks (without the
    extra metadata columns) are written to a local parquet file under /tmp
    instead.

    Returns:
        True when the remote save succeeded, or when it failed but the local
        fallback file was written. False when ``chunks`` is empty or both
        attempts failed.
    """

    if not chunks:
        print("❌ Pas de features à sauvegarder")
        return False

    print("💾 SAUVEGARDE GOLD FEATURES TEST")
    print("=" * 35)

    try:
        # Consolidation
        final_features = pl.concat(chunks)
        print(f"📊 Consolidation: {len(final_features):,} lignes")
        print(f"📊 Colonnes totales: {len(final_features.columns)}")

        # Add Gold Test metadata columns
        final_features = final_features.with_columns([
            pl.col('datetime').dt.year().alias('year'),
            pl.col('datetime').dt.month().alias('month'),
            pl.lit("test").alias("layer"),
            pl.lit("feature_store").alias("data_type"),
            pl.lit(config.provider).alias("provider"),
            pl.lit(config.market).alias("market"),
            pl.lit(config.data_frequency).alias("data_frequency"),
            pl.lit(config.data_category).alias("data_category"),
            pl.lit(config.interval).alias("interval")
        ])

        # Versioned Gold Test path: one folder per run timestamp
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        gold_test_path = f"{config.output_path}version_{timestamp}/features.parquet"

        # Export through DuckDB
        con.register("temp_features", final_features.to_arrow())
        con.execute(f"""
            COPY (SELECT * FROM temp_features ORDER BY datetime)
            TO '{gold_test_path}'
            (FORMAT PARQUET, COMPRESSION 'snappy')
        """)

        print(f"✅ Sauvegarde réussie: {gold_test_path}")
        print(f"📊 Période: {final_features['datetime'].min()} → {final_features['datetime'].max()}")
        print(f"🏛️ Architecture: Medallion Gold Test Layer")
        print(f"🧪 Prêt pour backtesting avancé")

        # Save run metadata next to the features
        metadata = {
            'table_info': {
                'name': config.feature_store_test_table,
                'type': 'feature_store_test',
                'version': timestamp,
                'source_table': config.feature_store_table
            },
            'data_info': {
                'total_rows': len(final_features),
                'total_columns': len(final_features.columns),
                'period_start': str(final_features['datetime'].min()),
                'period_end': str(final_features['datetime'].max()),
                'symbol': config.symbol,
                'interval': config.interval
            },
            'features_info': {
                'experimental_features': config.experimental_features,
                'generation_timestamp': datetime.now().isoformat()
            },
            'paths': {
                'gold_test_path': gold_test_path,
                'source_path': config.source_path
            }
        }

        metadata_path = f"{config.output_path}version_{timestamp}/metadata.parquet"
        metadata_df = pl.DataFrame([metadata])
        con.register("temp_metadata", metadata_df.to_arrow())
        con.execute(f"""
            COPY (SELECT * FROM temp_metadata)
            TO '{metadata_path}'
            (FORMAT PARQUET)
        """)

        print(f"📋 Métadonnées sauvegardées: {metadata_path}")

        return True

    except Exception as e:
        print(f"❌ Erreur sauvegarde: {e}")

        # Local fallback save
        try:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            local_path = f"/tmp/features_test_{config.symbol}_{timestamp}.parquet"
            pl.concat(chunks).write_parquet(local_path)
            print(f"💾 Sauvegarde locale: {local_path}")
            return True
        except Exception as fallback_error:
            # Narrowed from a bare except so KeyboardInterrupt/SystemExit still
            # propagate, and the reason for the failure is no longer hidden.
            print(f"❌ Erreur sauvegarde locale: {fallback_error}")
            return False

# Persist the enriched chunks; with nothing to save, record the failure.
if not enhanced_chunks:
    save_success = False
    print("❌ Aucune feature à sauvegarder")
else:
    save_success = save_to_gold_test(enhanced_chunks)

## 6. 📊 Résumé et Instructions

In [None]:
# Final summary of the run: processing figures, feature counts, save status,
# configuration recap and next steps.
print("=" * 60, "🧪 FEATURE STORE TEST - RÉSUMÉ FINAL", "=" * 60, sep="\n")

if not enhanced_chunks:
    print("❌ Aucune feature générée")
else:
    # NOTE(review): this rebinds the module-level `total_rows` that the
    # analysis cell set to the source row count - confirm that re-running
    # the processing cell afterwards is not expected.
    total_rows = sum(len(chunk) for chunk in enhanced_chunks)
    total_columns = len(enhanced_chunks[0].columns)
    # Falls back to 0 when the analysis cell never defined the column list.
    original_columns = len(available_columns) if 'available_columns' in locals() else 0
    new_features = total_columns - original_columns

    print(
        "📊 TRAITEMENT:",
        f"   • Période: {start_date} → {end_date}",
        f"   • Lignes traitées: {total_rows:,}",
        f"   • Chunks: {len(enhanced_chunks)}",
        sep="\n",
    )

    print(
        "\n🧪 FEATURES:",
        f"   • Colonnes originales: {original_columns}",
        f"   • Colonnes finales: {total_columns}",
        f"   • Nouvelles features: {new_features}",
        sep="\n",
    )

    print(
        f"\n💾 SAUVEGARDE: {'✅ Réussie' if save_success else '❌ Échouée'}",
        "🏛️ Architecture: Medallion Gold Test Layer",
        f"📁 Table: {config.feature_store_test_table}",
        sep="\n",
    )

print(
    "\n⚙️ CONFIGURATION:",
    f"   • Provider: {config.provider}",
    f"   • Symbole: {config.symbol} {config.interval}",
    f"   • Market: {config.market}",
    f"   • Frequency: {config.data_frequency}",
    f"   • Chunks: {config.chunk_size:,} + buffer {config.context_buffer}",
    sep="\n",
)

print(
    "\n🚀 PROCHAINES ÉTAPES:",
    "   1. Modifier add_experimental_features() pour ajouter vos features",
    "   2. Tester sur petit échantillon avant historique complet",
    "   3. Utiliser la table test dans strategy_chunked_backtesting",
    "   4. Valider les nouvelles features avec backtesting",
    sep="\n",
)

print(
    "\n💡 EXEMPLES DE FEATURES À AJOUTER:",
    "   • Volume relatif et momentum de volume",
    "   • Volatilité réalisée et asymétrique",
    "   • Support/résistance dynamiques",
    "   • Régime de marché (trend/sideways)",
    "   • Features de microstructure",
    sep="\n",
)

print(
    "=" * 60,
    "✅ FEATURE STORE TEST OPÉRATIONNEL",
    "🧪 Prêt pour développement de nouvelles features",
    "=" * 60,
    sep="\n",
)