# Validation du Dataset - Pipeline de Données Crypto

Ce notebook valide la qualité du dataset généré par le pipeline.

**Vérifications:**
1. Pas de data leakage
2. Intégrité OHLC
3. Distribution des labels
4. Qualité des features normalisées
5. Visualisation du signal filtré

In [None]:
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Configuration
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
%matplotlib inline

print("‚úÖ Imports OK")

## 1. Chargement du Dataset

In [None]:
# Load the dataset produced by the pipeline.
dataset_path = '../data/processed/btc_30m_dataset.csv'

# Both timestamp columns are parsed as datetimes so that min()/max() below
# report a chronological range rather than a string ordering.
df = pd.read_csv(dataset_path, parse_dates=['timestamp', 'candle_30m_timestamp'])

print(f"Dataset chargé: {len(df)} lignes, {len(df.columns)} colonnes")
print(f"Période: {df['timestamp'].min()} à {df['timestamp'].max()}")
print(f"\nPremières colonnes: {list(df.columns[:10])}")

In [None]:
# Preview the first 10 rows of the dataset.
df.head(10)

In [None]:
# Print the DataFrame summary (pandas `DataFrame.info`).
df.info()

## 2. Validation de la Bougie Fantôme

In [None]:
# Tally the rows per step value, ordered by step. For 30-minute candles the
# steps are expected to run from 1 to 6.
step_counts = df['step'].value_counts().sort_index()

print("Distribution des steps:")
print(step_counts)

# Bar chart of the same tally.
ax = step_counts.plot(kind='bar', figsize=(10, 4))
ax.set_title('Distribution des Steps dans les Bougies 30min')
ax.set_xlabel('Step (1-6)')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Check the OHLC integrity of the ghost-candle columns (prefix 'ghost_').
from utils import validate_ohlc_integrity

try:
    # A ValueError from the helper is treated as an integrity failure.
    validate_ohlc_integrity(df, col_prefix='ghost_')
except ValueError as e:
    print(f"❌ Erreur d'intégrité OHLC: {e}")
else:
    print("✅ Intégrité OHLC: OK")

In [None]:
# Plot how a single ghost candle evolves step by step.
# Use the 30-minute candle that owns row 100; the position is clamped to the
# last row so a non-empty dataset shorter than 101 rows does not raise IndexError.
sample_pos = min(100, len(df) - 1)
candle_ts = df['candle_30m_timestamp'].iloc[sample_pos]
sample_candle = df[df['candle_30m_timestamp'] == candle_ts]

fig, axes = plt.subplots(2, 2, figsize=(14, 8))

# (column, legend label, extra plot kwargs), one entry per panel in row-major
# order. 'Open' passes no colour, so it keeps matplotlib's default.
panels = [
    ('ghost_open', 'Open', {}),
    ('ghost_high', 'High', {'color': 'green'}),
    ('ghost_low', 'Low', {'color': 'red'}),
    ('ghost_close', 'Close', {'color': 'blue'}),
]

for ax, (column, label, style) in zip(axes.flat, panels):
    ax.plot(sample_candle['step'], sample_candle[column], marker='o', label=label, **style)
    ax.set_title(f'Ghost {label}')
    ax.set_xlabel('Step')
    ax.legend()

plt.tight_layout()
plt.suptitle('Évolution de la Bougie Fantôme', y=1.02, fontsize=14)
plt.show()

## 3. Validation des Labels

In [None]:
# Label distribution.
# value_counts() alone orders by frequency, so a fixed ['red', 'green'] colour
# list could end up attached to the wrong label. Sorting by index orders the
# counts by label value, and the colours below are derived from that value.
print("Distribution des labels:")
label_counts = df['label'].value_counts().sort_index()
print(label_counts)

total_labels = label_counts.sum()
# Guard against a dataset with no labelled rows (avoids dividing by zero).
positive_pct = label_counts.get(1.0, 0) / total_labels * 100 if total_labels else 0.0
print(f"\nPourcentage de labels positifs: {positive_pct:.2f}%")

# Colour each bar/wedge from its label value: 1 -> green, anything else -> red.
label_colors = ['green' if lbl == 1 else 'red' for lbl in label_counts.index]

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

label_counts.plot(kind='bar', ax=axes[0], color=label_colors)
axes[0].set_title('Distribution des Labels')
axes[0].set_xlabel('Label (0=Baisse, 1=Hausse)')
axes[0].set_ylabel('Count')

label_counts.plot(kind='pie', ax=axes[1], autopct='%1.1f%%', colors=label_colors)
axes[1].set_title('Proportion des Labels')
axes[1].set_ylabel('')

plt.tight_layout()
plt.show()

In [None]:
# Plot the filtered signal, its slope and the labels over a window of rows.
sample = df.iloc[1000:1200].copy()

fig, axes = plt.subplots(3, 1, figsize=(14, 10), sharex=True)

# Filtered signal (drawn only when the column is present).
if 'rsi_filtered' in sample.columns:
    axes[0].plot(sample.index, sample['rsi_filtered'], label='RSI Filtré', color='blue')
    axes[0].set_title('Signal Filtré (RSI avec Filtre d\'Octave)')
    axes[0].legend()
    axes[0].grid(True)

# Slope, with a dashed zero line to make sign changes visible.
if 'slope_shifted' in sample.columns:
    axes[1].plot(sample.index, sample['slope_shifted'], label='Pente Décalée', color='orange')
    axes[1].axhline(y=0, color='black', linestyle='--', alpha=0.5)
    axes[1].set_title('Pente du Signal Filtré (Décalée)')
    axes[1].legend()
    axes[1].grid(True)

# Labels: red for 0, green for anything else.
colors = ['red' if label == 0 else 'green' for label in sample['label']]
axes[2].scatter(sample.index, sample['label'], c=colors, alpha=0.6, s=20)
axes[2].set_title('Labels (0=Baisse, 1=Hausse)')
axes[2].set_xlabel('Index')
axes[2].set_yticks([0, 1])
axes[2].grid(True)

plt.tight_layout()
plt.show()

## 4. Vérification du Data Leakage

In [None]:
from utils import check_data_leakage

# Columns that are not treated as features: timestamps, the label itself,
# the intermediate signals listed here, and the raw step.
exclude_cols = ['timestamp', 'candle_30m_timestamp', 'label', 'slope', 'slope_shifted',
                'rsi_filtered', 'close_filtered', 'step']

feature_cols = [col for col in df.columns if col not in exclude_cols]

print(f"Vérification du data leakage sur {len(feature_cols)} features...")

leakage_results = check_data_leakage(df, feature_cols, label_col='label')

# 'suspicious_features' is iterated below as (feature, correlation) pairs.
if leakage_results['suspicious_features']:
    print(f"\n❌ {len(leakage_results['suspicious_features'])} features suspectes détectées:")
    for feat, corr in leakage_results['suspicious_features']:
        print(f"  - {feat}: corrélation {corr:.3f} avec label[t+1]")
else:
    print("\n✅ Pas de data leakage détecté!")

In [None]:
# Plot the features most correlated with the future label.
future_corrs = leakage_results['future_correlation']

# Drop NaN correlations first: NaN compares false with everything, which makes
# the ordering produced by sorted() unreliable.
finite_corrs = [(feat, corr) for feat, corr in future_corrs.items() if pd.notna(corr)]

# Keep the 20 largest correlations by absolute value.
sorted_corrs = sorted(finite_corrs, key=lambda item: abs(item[1]), reverse=True)[:20]

features = [feat for feat, _ in sorted_corrs]
corrs = [corr for _, corr in sorted_corrs]

plt.figure(figsize=(12, 6))
# Red above the 0.7 suspicion threshold, orange above 0.5, green otherwise.
colors = ['red' if abs(c) > 0.7 else 'orange' if abs(c) > 0.5 else 'green' for c in corrs]
plt.barh(features, corrs, color=colors)
plt.axvline(x=0.7, color='red', linestyle='--', label='Seuil suspect (0.7)')
plt.axvline(x=-0.7, color='red', linestyle='--')
plt.xlabel('Corrélation avec label[t+1]')
plt.title('Top 20 Features - Corrélation Future (Leakage Check)')
plt.legend()
plt.tight_layout()
plt.show()

## 5. Qualité des Features Normalisées

In [None]:
# Normalised features are expected to have mean ~0 and std ~1.
# NOTE(review): the '_norm' substring match also selects 'step_index_norm',
# which the notebook describes as ranging over 0.0-1.0, so it should not be
# judged against the mean~0/std~1 expectation.
normalized_cols = [col for col in df.columns if '_norm' in col]

print(f"Analyse de {len(normalized_cols)} features normalisées:\n")

# Summarise only the first 10 columns; NaN values are ignored.
stats = []
for col in normalized_cols[:10]:
    values = df[col].dropna()
    stats.append({
        'feature': col,
        'mean': values.mean(),
        'std': values.std(),
        'min': values.min(),
        'max': values.max(),
    })

stats_df = pd.DataFrame(stats)
print(stats_df.to_string(index=False))

In [None]:
# Histograms of the first four normalised features, one per panel of a 2x2 grid.
fig, grid = plt.subplots(2, 2, figsize=(14, 8))
axes = grid.flatten()

for ax, col in zip(axes, normalized_cols[:4]):
    non_null = df[col].dropna()
    non_null.hist(bins=50, ax=ax, edgecolor='black', alpha=0.7)
    ax.set_title(f'Distribution: {col}')
    # Dashed reference line at x=0, shown in the legend as 'Mean'.
    ax.axvline(x=0, color='red', linestyle='--', label='Mean')
    ax.legend()

plt.tight_layout()
plt.show()

## 5b. Validation des Features Avancées

**Features ajoutées pour >90% accuracy:**
- Velocity features: velocity, amplitude, acceleration
- Log returns: ghost_high_log, ghost_low_log, ghost_close_log
- Open Z-Score: ghost_open_zscore (contexte de prix)
- Step index normalisé: step_index_norm (0.0-1.0)

In [None]:
# Check the log-return features and the open z-score.
log_return_cols = ['ghost_high_log', 'ghost_low_log', 'ghost_close_log']

if all(col in df.columns for col in log_return_cols):
    fig, axes = plt.subplots(2, 2, figsize=(14, 8))

    # One histogram per log-return column: (axes, column, title, bar colour).
    panels = [
        (axes[0, 0], 'ghost_high_log', 'Distribution: Ghost High Log Returns', 'green'),
        (axes[0, 1], 'ghost_low_log', 'Distribution: Ghost Low Log Returns', 'red'),
        (axes[1, 0], 'ghost_close_log', 'Distribution: Ghost Close Log Returns', 'blue'),
    ]
    for ax, col, title, color in panels:
        df[col].dropna().hist(bins=50, ax=ax, edgecolor='black', alpha=0.7, color=color)
        ax.set_title(title)
        ax.axvline(x=0, color='red', linestyle='--', alpha=0.5)

    # Open z-score, with reference lines at -2 and +2.
    if 'ghost_open_zscore' in df.columns:
        df['ghost_open_zscore'].dropna().hist(bins=50, ax=axes[1, 1], edgecolor='black', alpha=0.7, color='purple')
        axes[1, 1].set_title('Distribution: Ghost Open Z-Score (Contexte Prix)')
        axes[1, 1].axvline(x=0, color='red', linestyle='--', alpha=0.5)
        axes[1, 1].axvline(x=-2, color='orange', linestyle='--', alpha=0.3, label='Survente')
        axes[1, 1].axvline(x=2, color='orange', linestyle='--', alpha=0.3, label='Surachat')
        axes[1, 1].legend()

    plt.tight_layout()
    plt.suptitle('Log Returns & Open Z-Score - Distributions', y=1.02, fontsize=14)
    plt.show()

    # Log returns are expected to be centred on 0.
    # "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n.
    print("\n📊 Statistiques des Log Returns:")
    for col in log_return_cols:
        values = df[col].dropna()
        print(f"{col}: mean={values.mean():.6f} (should be ~0), std={values.std():.6f}")

    if 'ghost_open_zscore' in df.columns:
        values = df['ghost_open_zscore'].dropna()
        print(f"\nghost_open_zscore: mean={values.mean():.6f} (should be ~0), std={values.std():.6f} (should be ~1)")
else:
    print("⚠️  Log returns non trouvés dans le dataset")

In [None]:
# Plot the velocity features for one sample candle.
# Use the 30-minute candle that owns row 200; the position is clamped to the
# last row so a non-empty dataset shorter than 201 rows does not raise IndexError.
sample_pos = min(200, len(df) - 1)
candle_ts = df['candle_30m_timestamp'].iloc[sample_pos]
sample_candle = df[df['candle_30m_timestamp'] == candle_ts]

if 'velocity' in df.columns and 'amplitude' in df.columns:
    fig, axes = plt.subplots(2, 2, figsize=(14, 8))
    steps = sample_candle['step']

    # Velocity, with a zero reference line.
    axes[0, 0].plot(steps, sample_candle['velocity'], marker='o', color='purple')
    axes[0, 0].set_title('Velocity (Vitesse de Formation)')
    axes[0, 0].set_xlabel('Step')
    axes[0, 0].axhline(y=0, color='black', linestyle='--', alpha=0.3)
    axes[0, 0].grid(True, alpha=0.3)

    # Amplitude
    axes[0, 1].plot(steps, sample_candle['amplitude'], marker='o', color='orange')
    axes[0, 1].set_title('Amplitude (Volatilité Relative)')
    axes[0, 1].set_xlabel('Step')
    axes[0, 1].grid(True, alpha=0.3)

    # Acceleration (optional column), with a zero reference line.
    if 'acceleration' in df.columns:
        axes[1, 0].plot(steps, sample_candle['acceleration'], marker='o', color='red')
        axes[1, 0].set_title('Acceleration (Variation entre Steps)')
        axes[1, 0].set_xlabel('Step')
        axes[1, 0].axhline(y=0, color='black', linestyle='--', alpha=0.3)
        axes[1, 0].grid(True, alpha=0.3)

    # Normalised step index (optional column); y-axis padded around [0, 1].
    if 'step_index_norm' in df.columns:
        axes[1, 1].plot(steps, sample_candle['step_index_norm'], marker='o', color='blue')
        axes[1, 1].set_title('Step Index Normalisé (0.0-1.0)')
        axes[1, 1].set_xlabel('Step')
        axes[1, 1].set_ylim(-0.1, 1.1)
        axes[1, 1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.suptitle('Features Avancées - Dynamique de Formation', y=1.02, fontsize=14)
    plt.show()
else:
    print("⚠️  Velocity features non trouvées dans le dataset")

In [None]:
# Report which advanced features are present, with summary statistics for
# each one found.
advanced_features = {
    'Velocity': ['velocity', 'amplitude', 'acceleration'],
    'Log Returns': ['ghost_high_log', 'ghost_low_log', 'ghost_close_log'],
    'Open Context': ['ghost_open_zscore'],
    'Step Normalized': ['step_index_norm'],
}

print("📊 Vérification des Features Avancées:\n")
for category, feature_names in advanced_features.items():
    print(f"{category}:")
    for feat in feature_names:
        exists = feat in df.columns
        symbol = "✅" if exists else "❌"
        print(f"  {symbol} {feat}")
        if exists:
            # Statistics ignore NaN values.
            values = df[feat].dropna()
            print(f"      mean={values.mean():.4f}, std={values.std():.4f}, "
                  f"min={values.min():.4f}, max={values.max():.4f}")
    print()

## 6. Statistiques Descriptives

In [None]:
# Global descriptive statistics for the dataset.
df.describe()

In [None]:
# Missing values: count and percentage of nulls per column.
null_counts = df.isnull().sum()
null_pct = (null_counts / len(df) * 100).round(2)

null_df = pd.DataFrame({
    'column': null_counts.index,
    'null_count': null_counts.values,
    'null_pct': null_pct.values,
})

# Keep only columns that actually have nulls, worst first.
null_df = null_df[null_df['null_count'] > 0].sort_values('null_count', ascending=False)

# Print either the table or the all-clear message, never the header followed
# by an "Empty DataFrame" dump.
if null_df.empty:
    print("\n✅ Aucune valeur manquante!")
else:
    print("Colonnes avec valeurs manquantes:")
    print(null_df.to_string(index=False))

## 7. Prêt pour l'Entraînement

In [None]:
# Drop the rows whose label is NaN.
df_clean = df.dropna(subset=['label'])

# Guard against an empty source DataFrame (avoids dividing by zero).
kept_pct = len(df_clean) / len(df) * 100 if len(df) else 0.0
print(f"Dataset nettoyé: {len(df_clean)} lignes ({kept_pct:.1f}% du total)")
print(f"Colonnes: {len(df_clean.columns)}")

# Split features from the label.
# NOTE(review): 'step' is kept as a feature here, but the leakage-check cell
# earlier in the notebook lists it among the non-feature columns. Confirm
# which of the two is intended.
non_feature_cols = ['timestamp', 'candle_30m_timestamp', 'label',
                    'slope', 'slope_shifted', 'rsi_filtered', 'close_filtered']
feature_cols_final = [col for col in df_clean.columns if col not in non_feature_cols]

X = df_clean[feature_cols_final]
y = df_clean['label']

print(f"\nFeatures (X): {X.shape}")
print(f"Labels (y): {y.shape}")
print("\n✅ Dataset prêt pour l'entraînement!")

In [None]:
# Save the cleaned dataset as CSV, without the index column.
output_clean = '../data/processed/btc_30m_dataset_clean.csv'
df_clean.to_csv(output_clean, index=False)
print(f"Dataset nettoyé sauvegardé: {output_clean}")

## Résumé de la Validation

✅ **Checklist:**
- [ ] Bougie fantôme correctement formée (6 steps)
- [ ] Intégrité OHLC validée
- [ ] Labels équilibrés (40-60%)
- [ ] Pas de data leakage détecté
- [ ] Features normalisées correctement
- [ ] Dataset prêt pour entraînement

**Prochaines étapes:**
1. Créer le modèle CNN-LSTM/TCN
2. Entraînement avec GPU
3. Validation croisée temporelle
4. Backtesting