In [None]:
import pandas as pd
import torch
import matplotlib.pyplot as plt
import os
import numpy as np

# Workaround kept from the original notebook: pin OpenMP to a single thread to
# avoid an "'NoneType' object has no attribute 'split'" error.
# NOTE(review): this runs after torch/numpy have already been imported above;
# confirm whether it needs to move ahead of those imports to take effect.
os.environ.update({'OMP_NUM_THREADS': '1'})

# --- Run configuration ---
FILE_PATH = 'your_file.csv'  # CSV to load; must contain columns ending in '_weight'
K_MAX = 15                   # Largest K evaluated by the elbow sweep (starts at K=2)
N_INIT = 10                  # Random restarts per K; the lowest-inertia run is kept
MAX_ITER = 100               # Iteration cap for each individual K-Means run

# pytorch k-means implementation

class KMeansTorch:
    """K-Means clustering (Lloyd's algorithm) implemented with PyTorch tensors.

    Parameters
    ----------
    n_clusters : int
        Number of centroids to fit.
    max_iter : int, default 100
        Maximum number of assign/update iterations per initialization.
    n_init : int, default 10
        Number of independent random initializations. The run with the
        lowest inertia is kept.
    random_state : int or None, default 42
        Base seed. Initialization ``i`` is seeded with ``random_state + i``.
        ``None`` draws a fresh, non-reproducible seed for every run.

    Attributes
    ----------
    inertia_ : float or None
        Within-cluster sum of squared distances of the best run (set by ``fit``).
    cluster_centers_ : numpy.ndarray or None
        Centroids of the best run, shape ``(n_clusters, n_features)``.
    """

    def __init__(self, n_clusters, max_iter=100, n_init=10, random_state=42):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.n_init = n_init
        self.random_state = random_state
        self.inertia_ = None
        self.cluster_centers_ = None

    def _initialize_centroids(self, X, seed=None):
        """Pick ``n_clusters`` distinct rows of ``X`` as starting centroids.

        ``seed`` defaults to ``self.random_state``. A private ``RandomState``
        is used so the global NumPy RNG is left untouched; for a given seed it
        yields the same indices as ``np.random.seed`` + ``np.random.choice``.

        Raises
        ------
        ValueError
            If ``n_clusters`` exceeds the number of rows in ``X``.
        """
        if seed is None:
            seed = self.random_state
        n_samples = X.shape[0]
        if self.n_clusters > n_samples:
            raise ValueError(
                f"n_clusters={self.n_clusters} exceeds the number of samples ({n_samples})."
            )
        rng = np.random.RandomState(seed)
        rand_indices = rng.choice(n_samples, self.n_clusters, replace=False)
        return X[rand_indices]

    @staticmethod
    def _squared_distances(X, centroids):
        """Return the (n_samples, n_clusters) matrix of squared Euclidean distances."""
        return torch.sum((X.unsqueeze(1) - centroids.unsqueeze(0)) ** 2, dim=2)

    def _fit_single(self, X, seed=None):
        """Run one K-Means fit from a single random initialization.

        Parameters
        ----------
        X : torch.Tensor
            2-D float tensor of samples.
        seed : int or None
            Seed for the initial centroid draw; falls back to ``self.random_state``.

        Returns
        -------
        tuple
            ``(inertia, centroids)`` where ``inertia`` is a Python float and
            ``centroids`` is a ``(n_clusters, n_features)`` tensor.
        """
        centroids = self._initialize_centroids(X, seed).clone().to(X.device)

        for _ in range(self.max_iter):
            # E-step: assign every sample to its nearest centroid.
            cluster_assignments = torch.argmin(self._squared_distances(X, centroids), dim=1)

            # M-step: per-cluster feature sums and member counts.
            sums = torch.zeros_like(centroids)
            sums.scatter_add_(0, cluster_assignments.view(-1, 1).repeat(1, X.shape[1]), X)
            counts = torch.bincount(cluster_assignments, minlength=self.n_clusters)

            # A cluster that lost all its members keeps its previous centroid
            # instead of collapsing to the origin (zero sum / clamped count).
            new_centroids = centroids.clone()
            occupied = counts > 0
            new_centroids[occupied] = sums[occupied] / counts[occupied].view(-1, 1).to(sums.dtype)

            # Check for convergence
            if torch.allclose(centroids, new_centroids):
                break

            centroids = new_centroids

        # Inertia (WCSS): squared distance from each sample to its closest centroid.
        final_distances = self._squared_distances(X, centroids)
        inertia = final_distances.min(dim=1)[0].sum().item()

        return inertia, centroids

    def fit(self, X):
        """Run K-Means ``n_init`` times and keep the lowest-inertia result.

        Parameters
        ----------
        X : array-like
            2-D numeric data of shape ``(n_samples, n_features)``.

        Returns
        -------
        KMeansTorch
            ``self``, with ``inertia_`` and ``cluster_centers_`` populated.

        Raises
        ------
        ValueError
            If ``n_init`` is less than 1, ``X`` is not 2-D, or ``n_clusters``
            exceeds the number of samples.
        """
        if self.n_init < 1:
            raise ValueError("n_init must be at least 1.")

        X_tensor = torch.as_tensor(X, dtype=torch.float32)
        if X_tensor.dim() != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features).")

        best_inertia = float('inf')
        best_centroids = None

        for init in range(self.n_init):
            # Derive a distinct seed per run from the user's base seed without
            # overwriting self.random_state.
            seed = None if self.random_state is None else self.random_state + init
            inertia, centroids = self._fit_single(X_tensor, seed)

            # Always keep the first run so a NaN inertia cannot leave us empty-handed.
            if best_centroids is None or inertia < best_inertia:
                best_inertia = inertia
                best_centroids = centroids

        self.inertia_ = best_inertia
        self.cluster_centers_ = best_centroids.detach().cpu().numpy()
        return self

# Load data and prepare for clustering.
# Fatal input problems raise SystemExit with a message: this gives a non-zero
# exit status in a script and, unlike the interactive-only exit() helper, does
# not shut down a notebook kernel.
try:
    df = pd.read_csv(FILE_PATH)
except FileNotFoundError:
    raise SystemExit(f"Error: The file at '{FILE_PATH}' was not found.") from None

# Feature columns are identified purely by their '_weight' name suffix.
weight_cols = [col for col in df.columns if str(col).endswith('_weight')]
if not weight_cols:
    raise SystemExit("Error: No columns ending with '_weight' were found in the CSV.")

# Missing values would propagate into every distance and yield NaN WCSS.
if df[weight_cols].isna().to_numpy().any():
    raise SystemExit("Error: The '_weight' columns contain missing values; clean the data first.")

X = df[weight_cols].values
print(f"Successfully loaded {len(X)} rows with {len(weight_cols)} weight features.")

# K cannot exceed the number of samples, so cap the sweep accordingly.
k_upper = min(K_MAX, len(X))
if k_upper < 2:
    raise SystemExit("Error: At least 2 rows are required to evaluate K >= 2.")

wcss = []
K_range = range(2, k_upper + 1)

print(f"Calculating WCSS for K = 2 to {k_upper} using {N_INIT} initializations...")

# One full multi-start fit per candidate K; record the best inertia of each.
for k in K_range:
    kmeans = KMeansTorch(n_clusters=k, n_init=N_INIT, max_iter=MAX_ITER)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

# Plotting the Elbow Method
plt.figure(figsize=(10, 6))
plt.plot(K_range, wcss, marker='o', linestyle='-', color='darkcyan')
plt.title('Elbow Method for Optimal K (PyTorch Implementation)', fontsize=16)
plt.xlabel('Number of Clusters (K)', fontsize=12)
plt.ylabel('WCSS (Within-Cluster Sum of Squares)', fontsize=12)
plt.grid(True, linestyle='--', alpha=0.6)
plt.xticks(K_range)

print("\nPlotting results. Look for the 'elbow' where the curve bends sharply.")
plt.show()

print("\n--- Next Steps ---")
print("Select the optimal K from the Elbow plot and run the final K-MeansTorch model with that K.")

In [10]:
import pandas as pd
import torch
import numpy as np
import os

# Workaround kept from the original notebook: pin OpenMP to a single thread to
# avoid an "'NoneType' object has no attribute 'split'" error.
# NOTE(review): this runs after torch/numpy have already been imported above;
# confirm whether it needs to move ahead of those imports to take effect.
os.environ.update({'OMP_NUM_THREADS': '1'})

# --- Run configuration ---
# CSV to load; must contain columns ending in '_weight'.
FILE_PATH = '/Users/drewrogers/Desktop/ETFsAndOT/etf_nav_and_weights.csv'
K_FINAL = 9     # Number of clusters (regimes) used for the final fit
N_INIT = 10     # Random restarts; the lowest-inertia run is kept
MAX_ITER = 100  # Iteration cap for each individual K-Means run

# PyTorch K-Means implementation

class KMeansTorch:
    """K-Means clustering (Lloyd's algorithm) implemented with PyTorch tensors.

    Parameters
    ----------
    n_clusters : int
        Number of centroids to fit.
    max_iter : int, default 100
        Maximum number of assign/update iterations per initialization.
    n_init : int, default 10
        Number of independent random initializations. The run with the
        lowest inertia is kept.
    random_state : int or None, default 42
        Base seed. Initialization ``i`` is seeded with ``random_state + i``.
        ``None`` draws a fresh, non-reproducible seed for every run.

    Attributes
    ----------
    inertia_ : float or None
        Within-cluster sum of squared distances of the best run.
    cluster_centers_ : numpy.ndarray or None
        Centroids of the best run, shape ``(n_clusters, n_features)``.
    labels_ : numpy.ndarray or None
        Index of the nearest final centroid for every sample of the best run.
    """

    def __init__(self, n_clusters, max_iter=100, n_init=10, random_state=42):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.n_init = n_init
        self.random_state = random_state
        self.inertia_ = None
        self.cluster_centers_ = None
        self.labels_ = None  # To store cluster assignments

    def _initialize_centroids(self, X, seed=None):
        """Pick ``n_clusters`` distinct rows of ``X`` as starting centroids.

        ``seed`` defaults to ``self.random_state``. A private ``RandomState``
        is used so the global NumPy RNG is left untouched; for a given seed it
        yields the same indices as ``np.random.seed`` + ``np.random.choice``.

        Raises
        ------
        ValueError
            If ``n_clusters`` exceeds the number of rows in ``X``.
        """
        if seed is None:
            seed = self.random_state
        n_samples = X.shape[0]
        if self.n_clusters > n_samples:
            raise ValueError(
                f"n_clusters={self.n_clusters} exceeds the number of samples ({n_samples})."
            )
        rng = np.random.RandomState(seed)
        rand_indices = rng.choice(n_samples, self.n_clusters, replace=False)
        return X[rand_indices]

    @staticmethod
    def _squared_distances(X, centroids):
        """Return the (n_samples, n_clusters) matrix of squared Euclidean distances."""
        return torch.sum((X.unsqueeze(1) - centroids.unsqueeze(0)) ** 2, dim=2)

    def _fit_single(self, X, seed=None):
        """Run one K-Means fit from a single random initialization.

        Parameters
        ----------
        X : torch.Tensor
            2-D float tensor of samples.
        seed : int or None
            Seed for the initial centroid draw; falls back to ``self.random_state``.

        Returns
        -------
        tuple
            ``(inertia, centroids, cluster_assignments)``. The assignments are
            computed against the returned centroids, so they always agree with
            the reported inertia.
        """
        centroids = self._initialize_centroids(X, seed).clone().to(X.device)

        for _ in range(self.max_iter):
            # E-step: assign every sample to its nearest centroid.
            step_assignments = torch.argmin(self._squared_distances(X, centroids), dim=1)

            # M-step: per-cluster feature sums and member counts.
            sums = torch.zeros_like(centroids)
            sums.scatter_add_(0, step_assignments.view(-1, 1).repeat(1, X.shape[1]), X)
            counts = torch.bincount(step_assignments, minlength=self.n_clusters)

            # A cluster that lost all its members keeps its previous centroid
            # instead of collapsing to the origin (zero sum / clamped count).
            new_centroids = centroids.clone()
            occupied = counts > 0
            new_centroids[occupied] = sums[occupied] / counts[occupied].view(-1, 1).to(sums.dtype)

            # Check for convergence
            if torch.allclose(centroids, new_centroids):
                break

            centroids = new_centroids

        # Labels and inertia are both taken from the final centroids; the
        # in-loop assignments are one update behind when max_iter is exhausted.
        final_distances = self._squared_distances(X, centroids)
        min_distances, cluster_assignments = final_distances.min(dim=1)
        inertia = min_distances.sum().item()

        return inertia, centroids, cluster_assignments

    def fit_predict(self, X):
        """Run K-Means ``n_init`` times, keep the best run, and return its labels.

        Parameters
        ----------
        X : array-like
            2-D numeric data of shape ``(n_samples, n_features)``.

        Returns
        -------
        numpy.ndarray
            Cluster index per sample (also stored as ``labels_``).

        Raises
        ------
        ValueError
            If ``n_init`` is less than 1, ``X`` is not 2-D, or ``n_clusters``
            exceeds the number of samples.
        """
        if self.n_init < 1:
            raise ValueError("n_init must be at least 1.")

        X_tensor = torch.as_tensor(X, dtype=torch.float32)
        if X_tensor.dim() != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features).")

        best_inertia = float('inf')
        have_result = False

        for init in range(self.n_init):
            # Derive a distinct seed per run from the user's base seed without
            # overwriting self.random_state.
            seed = None if self.random_state is None else self.random_state + init
            inertia, centroids, assignments = self._fit_single(X_tensor, seed)

            # Always keep the first run so a NaN inertia cannot leave labels_ unset.
            if not have_result or inertia < best_inertia:
                have_result = True
                best_inertia = inertia
                self.cluster_centers_ = centroids.detach().cpu().numpy()
                self.labels_ = assignments.cpu().numpy()

        self.inertia_ = best_inertia
        return self.labels_


# Load data and prepare for clustering.
# Fatal input problems raise SystemExit with a message: this gives a non-zero
# exit status in a script and, unlike the interactive-only exit() helper, does
# not shut down a notebook kernel.
try:
    df = pd.read_csv(FILE_PATH)
except FileNotFoundError:
    raise SystemExit(
        f"Error: The file at '{FILE_PATH}' was not found. Please double-check the path."
    ) from None

# Feature columns are identified purely by their '_weight' name suffix.
weight_cols = [col for col in df.columns if str(col).endswith('_weight')]
if not weight_cols:
    raise SystemExit("Error: No columns ending with '_weight' were found in the CSV.")

# Missing values would propagate into every distance and yield NaN results.
if df[weight_cols].isna().to_numpy().any():
    raise SystemExit("Error: The '_weight' columns contain missing values; clean the data first.")

X = df[weight_cols].values

if K_FINAL > len(X):
    raise SystemExit(f"Error: K_FINAL={K_FINAL} exceeds the number of rows ({len(X)}).")

# Run the final K-Means model with K=K_FINAL
kmeans_final = KMeansTorch(n_clusters=K_FINAL, n_init=N_INIT, max_iter=MAX_ITER)
cluster_labels = kmeans_final.fit_predict(X)

print(f"Clustering complete. Identified {K_FINAL} portfolio regimes.")
print(f"Total inertia (WCSS) for K={K_FINAL}: {kmeans_final.inertia_:.4f}\n")


# Summarize and identify the most common state

centers_df = pd.DataFrame(kmeans_final.cluster_centers_, columns=weight_cols)
centers_df.index.name = 'Cluster'

# Add cluster labels to the original DataFrame
df['Cluster'] = cluster_labels

# Size (row count) of each regime. Reindexing against the centroid table gives
# clusters with no members an explicit 0 instead of NaN after the join.
cluster_sizes = (
    df.groupby('Cluster').size()
    .reindex(centers_df.index, fill_value=0)
    .rename('Size (Days)')
)
summary = centers_df.join(cluster_sizes)

# Identify the most frequent regime
most_common_id = summary['Size (Days)'].idxmax()
most_common_state = summary.loc[most_common_id]
# Read the count from the integer Series: the mixed-dtype summary row is upcast
# to float and would print as e.g. "381.0".
most_common_days = int(cluster_sizes.loc[most_common_id])

# Print the summary and most common regime details

print(f"--- Portfolio Regime Analysis (K={K_FINAL}) ---")
print(f"The most common regime is Cluster {most_common_id}, observed on {most_common_days} days.")
print("-" * 50)

print("\n**Most Common Portfolio Regime Weights**")
print(f"(Cluster {most_common_id} Centroid - Weights in %):")
print("-" * 40)

# Centroid weights of the largest cluster, scaled to percent and labelled by
# ticker (column name with only its trailing '_weight' suffix removed).
suffix_len = len('_weight')
most_common_weights = most_common_state[weight_cols] * 100
most_common_weights.index = [str(col)[:-suffix_len].upper() for col in weight_cols]

# Print weights in the requested format
for etf, weight in most_common_weights.items():
    print(f"{etf}: {weight:.2f}%")

print("-" * 40)

Clustering complete. Identified 9 portfolio regimes.
Total inertia (WCSS) for K=9: 0.9411

--- Portfolio Regime Analysis (K=9) ---
The most common regime is Cluster 1, observed on 381.0 days.
--------------------------------------------------

**Most Common Portfolio Regime Weights**
(Cluster 1 Centroid - Weights in %):
----------------------------------------
XLK: 18.33%
XLF: 13.44%
XLV: 16.78%
XLY: 6.53%
XLP: 7.10%
XLE: 16.24%
XLI: 5.87%
XLU: 6.69%
XLRE: 2.06%
XLB: 2.47%
XLC: 4.49%
----------------------------------------
