# In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from scipy.spatial.distance import squareform
from sklearn.preprocessing import StandardScaler
from tslearn.metrics import cdist_dtw
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.metrics import silhouette_score

# Ten-entry categorical colour palette keyed 'color1' .. 'color10'
# (the hex codes are those of Matplotlib's default 'tab10' cycle).
COLORS = dict(
    color1='#1f77b4',
    color2='#ff7f0e',
    color3='#2ca02c',
    color4='#d62728',
    color5='#9467bd',
    color6='#8c564b',
    color7='#e377c2',
    color8='#7f7f7f',
    color9='#bcbd22',
    color10='#17becf',
)

# Input CSV files, one per price series. The entries are positional:
# datasets[i] is processed together with price_cols[i].
_TRANSFORMED_DATA_DIR = '/content/drive/MyDrive/transformed_data'
_DATASET_FILE_NAMES = (
    'amazon_buy_box_pricing.csv',
    'amazon_pricing.csv',
    'lowest_internet_final_df.csv',
)
datasets = [
    f'{_TRANSFORMED_DATA_DIR}/{file_name}' for file_name in _DATASET_FILE_NAMES
]

# Name of the price column associated with each dataset above (same order).
price_cols = [
    'buy_box_price',
    'amazon_price',
    'lowest_internet_price',
]

# Root directory for every artefact written by this script (presumably a
# Colab-mounted Google Drive folder -- confirm before running elsewhere).
base_path = "/content/drive/MyDrive/hierarchical_clustering_results"
os.makedirs(base_path, exist_ok=True)

# Loop-invariant settings, defined once instead of on every iteration.
# Maps a metric label to the CSV column that holds it (currently identical).
daily_metrics = {
    'amazon_frequency': 'amazon_frequency',
    'amazon_realized_variance': 'amazon_realized_variance',
    'amazon_intraday_range': 'amazon_intraday_range',
}
metrics_to_use = list(daily_metrics.values())
unique_key = 'query'  # column identifying one product, i.e. one time series

# One pass per (dataset, price column) pair. `m` remains the dataset index and
# `price_col` stays bound for any code further down in the loop body.
for m, (dataset_file, price_col) in enumerate(zip(datasets, price_cols)):
    # Load and filter data: 2024-09-01 (inclusive) to 2025-01-01 (exclusive)
    df = pd.read_csv(dataset_file)
    df['date'] = pd.to_datetime(df['date'])
    df = df[(df['date'] >= '2024-09-01') & (df['date'] < '2025-01-01')]

    # Create output directory
    # NOTE(review): the folder name does not depend on the dataset, so every
    # iteration shares it -- confirm that output file names are unique.
    dataset_path = os.path.join(base_path, "amazon_hierarchical")
    os.makedirs(dataset_path, exist_ok=True)

    # Normalize data per product (z-score each metric within one product).
    # groupby walks the frame once instead of re-filtering it for every
    # product; sort=False keeps products in first-appearance order and rows
    # with a missing key are dropped, exactly as the equality filter did.
    normalized_data = []
    for product, product_data in df.groupby(unique_key, sort=False):
        product_data = product_data.copy()
        if len(product_data) > 1:
            scaler = StandardScaler()
            for metric in metrics_to_use:
                # Series with zero (or undefined) spread are left unscaled to
                # avoid dividing by zero.
                if metric in product_data.columns and product_data[metric].std() > 0:
                    product_data[metric] = scaler.fit_transform(product_data[[metric]])

        normalized_data.append(product_data)

    # Hierarchical clustering needs at least two series; previously this case
    # crashed inside pd.concat / linkage.
    if len(normalized_data) < 2:
        print(f"Skipping {dataset_file}: need at least 2 products to cluster, "
              f"found {len(normalized_data)}.")
        continue

    df_normalized = pd.concat(normalized_data).drop_duplicates()

    # Create time series matrix: X[i, t, k] is metric k of product i on date t
    products = df_normalized[unique_key].unique()
    time_points = sorted(df_normalized['date'].unique())

    X = np.zeros((len(products), len(time_points), len(metrics_to_use)))

    for k, metric in enumerate(metrics_to_use):
        pivot = df_normalized.pivot(index=unique_key, columns='date', values=metric)
        pivot = pivot.reindex(index=products, columns=time_points, fill_value=0)
        X[:, :, k] = pivot.values

    # Dates without an observation for a product become 0. The fill value must
    # be passed as `nan=`: the second positional argument is `copy`.
    X = np.nan_to_num(X, nan=0.0)

    print(f"\nComputing DTW distance matrix for {len(products)} products...")

    # Compute DTW distance matrix. With a single dataset argument tslearn
    # returns the self-similarity matrix: half the work of cdist_dtw(X, X),
    # and it is expected to be symmetric with a zero diagonal, which
    # squareform and the precomputed silhouette below both require.
    dtw_dist_matrix = cdist_dtw(X, n_jobs=-1)

    # Convert to condensed form for scipy
    condensed_dist = squareform(dtw_dist_matrix)

    # Try different linkage methods
    linkage_methods = ['ward']
    results = []

    for method in linkage_methods:
        print(f"Testing {method} linkage...")

        # Perform hierarchical clustering. SciPy's linkage takes plain
        # (unsquared) distances for every method: its Ward update squares them
        # internally, so pre-squaring would distort the merge heights.
        # Ward formally assumes Euclidean distances; with DTW it is a heuristic.
        Z = linkage(condensed_dist, method=method)

        # Test different numbers of clusters (2 .. 9, capped below the number
        # of products so the silhouette score stays defined)
        silhouette_scores = {}
        for n_clusters in range(2, min(10, len(products))):
            clusters = fcluster(Z, n_clusters, criterion='maxclust')

            # Calculate silhouette score (needs at least two distinct labels)
            if len(np.unique(clusters)) > 1:
                sil_score = silhouette_score(dtw_dist_matrix, clusters, metric='precomputed')
                silhouette_scores[n_clusters] = sil_score

        if silhouette_scores:
            # Keep the cluster count with the highest silhouette score
            best_n = max(silhouette_scores, key=silhouette_scores.get)
            best_score = silhouette_scores[best_n]

            results.append({
                'method': method,
                'best_n_clusters': best_n,
                'silhouette_score': best_score,
                'all_scores': silhouette_scores,
                'linkage_matrix': Z
            })