In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
import pandas as pd
import numpy as np

# Standardize every column of df_tot except 'Time_Index', which is left unscaled
features_only = df_tot.drop(columns=['Time_Index'])
scaler = StandardScaler()
df_tot_normalized = scaler.fit_transform(features_only)

# Same standardized values as a labelled DataFrame (default 0..n-1 row index)
df_tot_normalized_df = pd.DataFrame(df_tot_normalized, columns=features_only.columns)

# Put the untouched 'Time_Index' column back in front of the standardized features;
# its index is reset so both pieces align row-by-row on the default index
time_index_column = df_tot[['Time_Index']].reset_index(drop=True)
df_tot_norm_w_tindex = pd.concat([time_index_column, df_tot_normalized_df], axis=1)

# Maps each tested epsilon to the DataFrame of rows that belong to a pair of
# consecutive hours inside the same DBSCAN cluster
consecutive_hours_dfs = {}

# Epsilon values to test
epsilon_values = [0.8, 0.2]

for eps in epsilon_values:
    # Cluster the standardized features; DBSCAN labels noise points as -1
    dbscan = DBSCAN(eps=eps, min_samples=2)
    labels = dbscan.fit_predict(df_tot_normalized)

    # Attach the cluster label to a copy of the original (unscaled) data
    df_epsilon = df_tot.copy()
    df_epsilon['Cluster'] = labels

    valid_clusters = []    # clusters that contain at least one pair of consecutive hours
    consecutive_rows = []  # index labels of every row that is part of such a pair

    # np.unique returns the labels in ascending order, so the row order of the
    # resulting DataFrame is deterministic
    for label in np.unique(labels):
        if label == -1:  # Exclude noise points (-1)
            continue

        cluster_group = df_epsilon[df_epsilon['Cluster'] == label]
        hours = cluster_group['Time_Index'].to_numpy()

        # starts_pair[i] is True when row i + 1 of the cluster is exactly one
        # hour after row i.
        # NOTE(review): adjacency is checked in df_tot row order, so this assumes
        # df_tot is already sorted by 'Time_Index' -- confirm.
        starts_pair = np.asarray(hours[:-1] + 1 == hours[1:], dtype=bool)

        # Flag BOTH members of every pair. Recording only the first member
        # would drop the second hour of a pair whenever it is followed by a
        # non-consecutive row of the same cluster (e.g. hours [1, 2, 5, 6]
        # must keep 1, 2, 5 and 6 -- not just 1, 5 and 6).
        in_pair = np.zeros(len(hours), dtype=bool)
        in_pair[:-1] |= starts_pair
        in_pair[1:] |= starts_pair

        consecutive_indices = cluster_group.index[in_pair].tolist()

        # If there are consecutive indices, save the cluster as valid
        if consecutive_indices:
            valid_clusters.append(label)
            consecutive_rows.extend(consecutive_indices)

    # Keep only the rows that are part of a consecutive-hour pair
    df_consecutive = df_epsilon.loc[consecutive_rows].copy()

    # Store the DataFrame in the dictionary with epsilon as key
    consecutive_hours_dfs[eps] = df_consecutive

# Display the consecutive-hours DataFrame for every epsilon that was tested.
# Iterating over the dictionary (rather than indexing hard-coded 0.8 / 0.2 keys)
# keeps the output in sync with `epsilon_values` and avoids a KeyError when
# that list is edited.
for position, (eps, df_consecutive) in enumerate(consecutive_hours_dfs.items()):
    # Blank line between successive tables, none before the first one
    separator = "\n" if position else ""
    print(f"{separator}Consecutive hours DataFrame for epsilon = {eps}:")
    print(df_consecutive)


## Plot the frequency distributions of cluster size and average within-cluster variance for clusters that contain consecutive hours

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Standardize every column of df_tot except 'Time_Index', which is left unscaled
features_only = df_tot.drop(columns=['Time_Index'])
scaler = StandardScaler()
df_tot_normalized = scaler.fit_transform(features_only)

# Same standardized values as a labelled DataFrame (default 0..n-1 row index)
df_tot_normalized_df = pd.DataFrame(df_tot_normalized, columns=features_only.columns)

# Put the untouched 'Time_Index' column back in front of the standardized features;
# its index is reset so both pieces align row-by-row on the default index
time_index_column = df_tot[['Time_Index']].reset_index(drop=True)
df_tot_norm_w_tindex = pd.concat([time_index_column, df_tot_normalized_df], axis=1)

# Per-epsilon summary records and per-epsilon DataFrames of consecutive-hour rows
epsilon_results = []
consecutive_hours_dfs = {}  # To store DataFrames for consecutive hours for each epsilon

# Epsilon values to test
epsilon_values = [0.8, 0.2]

for eps in epsilon_values:
    # Cluster the standardized features; DBSCAN labels noise points as -1
    dbscan = DBSCAN(eps=eps, min_samples=2)
    labels = dbscan.fit_predict(df_tot_normalized)

    # Attach the cluster label to a copy of the original (unscaled) data
    df_epsilon = df_tot.copy()
    df_epsilon['Cluster'] = labels

    valid_clusters = []    # clusters that contain at least one pair of consecutive hours
    consecutive_rows = []  # index labels of every row that is part of such a pair

    # np.unique returns the labels in ascending order, so the row order of the
    # resulting DataFrame is deterministic
    for label in np.unique(labels):
        if label == -1:  # Exclude noise points (-1)
            continue

        cluster_group = df_epsilon[df_epsilon['Cluster'] == label]
        hours = cluster_group['Time_Index'].to_numpy()

        # starts_pair[i] is True when row i + 1 of the cluster is exactly one
        # hour after row i.
        # NOTE(review): adjacency is checked in df_tot row order, so this assumes
        # df_tot is already sorted by 'Time_Index' -- confirm.
        starts_pair = np.asarray(hours[:-1] + 1 == hours[1:], dtype=bool)

        # Flag BOTH members of every pair. Recording only the first member
        # would drop the second hour of a pair whenever it is followed by a
        # non-consecutive row of the same cluster (e.g. hours [1, 2, 5, 6]
        # must keep 1, 2, 5 and 6 -- not just 1, 5 and 6).
        in_pair = np.zeros(len(hours), dtype=bool)
        in_pair[:-1] |= starts_pair
        in_pair[1:] |= starts_pair

        consecutive_indices = cluster_group.index[in_pair].tolist()

        # If there are consecutive indices, save the cluster as valid
        if consecutive_indices:
            valid_clusters.append(label)
            consecutive_rows.extend(consecutive_indices)

    # Keep only the rows that are part of a consecutive-hour pair
    df_consecutive = df_epsilon.loc[consecutive_rows].copy()

    # Store the DataFrame in the dictionary with epsilon as key
    consecutive_hours_dfs[eps] = df_consecutive

    # For the plots, keep ALL rows of every valid cluster (not only the paired rows)
    filtered_df = df_epsilon[df_epsilon['Cluster'].isin(valid_clusters)]

    # Number of rows per valid cluster
    cluster_sizes = filtered_df['Cluster'].value_counts()
    # Per-cluster variance of each column, averaged across columns.
    # NOTE(review): this is computed on the unscaled df_tot columns and still
    # includes 'Time_Index' -- confirm that is the intended variance measure.
    cluster_variances = filtered_df.groupby('Cluster').var().mean(axis=1)

    # One figure per epsilon: cluster sizes on the left, variances on the right
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    fig.suptitle(f'DBSCAN Analysis for epsilon = {eps} (Clusters with Consecutive Hours)', fontsize=16)

    # Histogram of cluster sizes (log-scaled x axis, linear y axis)
    sns.histplot(cluster_sizes, kde=False, bins=35, ax=axes[0], log_scale=(True, False))
    axes[0].set_title('Cluster Size Frequency (Log Scale)')
    axes[0].set_xlabel('Cluster Size')
    axes[0].set_ylabel('Frequency')
    axes[0].grid(True)
    total_clusters = len(cluster_sizes)
    # Annotate the number of valid clusters in the top-right corner of the axes
    axes[0].text(
        0.95, 0.95,
        f'Total Clusters with Consecutive Hours: {total_clusters}',
        transform=axes[0].transAxes,
        fontsize=10,
        color='black',
        ha='right',
        va='top',
        bbox=dict(boxstyle="round,pad=0.3", edgecolor="gray", facecolor="white", alpha=0.8)
    )

    # Histogram of the average within-cluster variance (log-scaled x axis)
    sns.histplot(cluster_variances, kde=False, bins=35, ax=axes[1], log_scale=(True, False))
    axes[1].set_title('Avg. Variance within Clusters (Log Scale)')
    axes[1].set_xlabel('Cluster Avg. Variance')
    axes[1].set_ylabel('Frequency')
    axes[1].grid(True)

    # Leave room at the top for the suptitle, then render the figure
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.show()

    # Store the results for each epsilon
    result = {
        'Epsilon': eps,
        'Num_Clusters_With_Consecutive_Hours': len(cluster_sizes),
        'Mean_Cluster_Variance': cluster_variances.mean()
    }
    epsilon_results.append(result)

# One summary row per epsilon, built from the records collected in the loop
epsilon_summary_df = pd.DataFrame(epsilon_results)
print(epsilon_summary_df)

# Print the consecutive-hours table of each epsilon under its own heading
for eps in consecutive_hours_dfs:
    df_consecutive = consecutive_hours_dfs[eps]
    heading = f"\nConsecutive hours DataFrame for epsilon = {eps}:"
    print(heading, df_consecutive, sep="\n")
