In [1]:
import argparse
import time
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import dendrogram, fcluster, linkage

# Import all functions from cluster_assets.py
from cluster_assets import (
    load_all_prices,
    handle_missing_values, 
    compute_daily_returns,
    compute_correlation_and_distance,
    ward_hierarchical_clustering,
    plot_and_save_dendrogram,
    save_correlation_matrix,
    save_clusters
)

# Notebook configuration: these values stand in for what would normally be
# supplied as command-line arguments.

# Filesystem locations (relative paths).
input_dir: Path = Path("data") / "raw"
output_dir: Path = Path("results")
dendrogram_filename: str = "dendrogram.png"

# Missing-value handling options, forwarded to handle_missing_values below.
missing: str = "drop"
min_valid_ratio: float = 0.8

# Number of clusters requested from the clustering and dendrogram helpers.
n_clusters: int = 10

In [2]:
# End-to-end run of the clustering pipeline: load prices, clean them, derive
# returns, compute correlation/distance, cluster, then persist and report.

# Make sure the results folder exists before any helper writes into it.
output_dir.mkdir(parents=True, exist_ok=True)

# Destinations of the three artefacts written by this cell.
correlation_path = output_dir / "correlation_matrix.csv"
dendrogram_path = output_dir / dendrogram_filename
clusters_path = output_dir / "clusters.csv"

# --- 1. Load prices from input_dir -------------------------------------------
print("Chargement des prix...")
prices = load_all_prices(input_dir)
print(f"- {prices.shape[1]} tickers chargés, {prices.shape[0]} dates brutes")

# --- 2. Missing values --------------------------------------------------------
# NOTE: `prices` is rebound to the cleaned result; the raw table is no longer
# reachable under this name after this step.
print("Gestion des valeurs manquantes...")
cleaning_options = {"missing": missing, "min_valid_ratio": min_valid_ratio}
prices = handle_missing_values(prices, **cleaning_options)
print(f"- Après nettoyage: {prices.shape[1]} tickers, {prices.shape[0]} dates")

# --- 3. Daily returns ---------------------------------------------------------
print("Calcul des rendements journaliers...")
returns = compute_daily_returns(prices)
print(f"- Matrice rendements: {returns.shape[0]} jours x {returns.shape[1]} tickers")

# --- 4. Correlation and distance ----------------------------------------------
# Only `corr` is persisted; `dist` is not used again within this cell.
print("Calcul de la corrélation et de la distance...")
corr, dist = compute_correlation_and_distance(returns)
save_correlation_matrix(corr, correlation_path)

# --- 5. Ward hierarchical clustering ------------------------------------------
# The helper receives the returns themselves (not `dist`) plus the target
# number of clusters, and hands back the linkage `Z` and per-ticker labels.
print("Clustering hiérarchique (Ward)...")
Z, cluster_labels = ward_hierarchical_clustering(returns, n_clusters=n_clusters)

# --- 6. Dendrogram ------------------------------------------------------------
# Leaf labels are the column names of `returns`, in column order.
print("Génération du dendrogramme...")
tickers: List[str] = list(returns.columns)
plot_and_save_dendrogram(
    Z, labels=tickers, n_clusters=n_clusters, output_path=dendrogram_path
)

# --- 7. Cluster assignments ---------------------------------------------------
print("Sauvegarde des clusters...")
clusters_df = save_clusters(
    tickers=tickers,
    labels=cluster_labels,
    output_path=clusters_path,
)

# --- 8. Report: number of tickers per cluster, ordered by cluster id ----------
print("Tailles des clusters:")
cluster_sizes = clusters_df["cluster"].value_counts().sort_index()
for cluster_id, size in cluster_sizes.items():
    print(f"- Cluster {cluster_id}: {int(size)} tickers")

# Display any figure still open (presumably the dendrogram drawn by the
# plotting helper — confirm that it does not close the figure after saving).
plt.show()


Chargement des prix...
- 501 tickers chargés, 2516 dates brutes
Gestion des valeurs manquantes...
- Après nettoyage: 476 tickers, 2047 dates
Calcul des rendements journaliers...
- Matrice rendements: 2046 jours x 476 tickers
Calcul de la corrélation et de la distance...
Clustering hiérarchique (Ward)...
Génération du dendrogramme...
Sauvegarde des clusters...
Tailles des clusters:
- Cluster 1: 46 tickers
- Cluster 2: 179 tickers
- Cluster 3: 19 tickers
- Cluster 4: 30 tickers
- Cluster 5: 1 tickers
- Cluster 6: 28 tickers
- Cluster 7: 102 tickers
- Cluster 8: 27 tickers
- Cluster 9: 3 tickers
- Cluster 10: 41 tickers
