# Read the DataFrame that has "PVtot_correct (kW)" or "Irradiance (W/m2)"

In [1]:
import pandas as pd

# Load the 5-minute PV/irradiance table indexed by its parsed 'datetime' column,
# keep only rows between 07:00 and 18:00, and tag each row with its calendar
# date so later cells can group by day.
merge_df = (
    pd.read_csv(
        '../processed_data/pv_8kW_5minresample_concat.csv',
        parse_dates=['datetime'],
        index_col='datetime',
    )
    .between_time('07:00:00', '18:00:00')
    .assign(Date=lambda frame: frame.index.date)
)
merge_df

Unnamed: 0_level_0,PVtot (kW),Irradiance (W/m2),alpha (m2),k,PVtot_correct (kW),Date
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-01-01 07:00:00,,31.4,,1.000000,,2023-01-01
2023-01-01 07:05:00,,38.8,,1.000000,,2023-01-01
2023-01-01 07:10:00,,42.8,,1.000000,,2023-01-01
2023-01-01 07:15:00,,46.6,,1.000000,,2023-01-01
2023-01-01 07:20:00,,49.0,,1.000000,,2023-01-01
...,...,...,...,...,...,...
2024-12-31 17:40:00,0.016760,51.6,2.738367,1.776434,0.029773,2024-12-31
2024-12-31 17:45:00,0.055212,40.6,2.738367,1.776434,0.098080,2024-12-31
2024-12-31 17:50:00,0.067134,30.2,2.738367,1.776434,0.119260,2024-12-31
2024-12-31 17:55:00,0.067570,19.4,2.738367,1.776434,0.120034,2024-12-31


# Calculate daily "PVtot_correct_avg (kW)" and "Irradiance_avg (W/m2)"

In [2]:
# Average every column per calendar day, turn 'Date' back into a regular
# column, and mark the averaged measurement columns with an '_avg' suffix.
avg_column_names = {
    'PVtot (kW)': 'PVtot_avg (kW)',
    'Irradiance (W/m2)': 'Irradiance_avg (W/m2)',
    'PVtot_correct (kW)': 'PVtot_correct_avg (kW)',
}
dailymean_merge_df = (
    merge_df
    .groupby('Date')
    .mean()
    .reset_index()
    .rename(columns=avg_column_names)
)
dailymean_merge_df

Unnamed: 0,Date,PVtot_avg (kW),Irradiance_avg (W/m2),alpha (m2),k,PVtot_correct_avg (kW)
0,2023-01-01,,438.073684,,1.000000,
1,2023-01-02,,366.290226,,1.000000,
2,2023-01-03,,257.353383,,1.000000,
3,2023-01-04,,384.810526,,1.000000,
4,2023-01-05,,434.398496,,1.000000,
...,...,...,...,...,...,...
718,2024-12-27,0.917370,385.640602,2.674778,1.776434,1.629647
719,2024-12-28,0.902674,379.111278,2.686129,1.776434,1.603541
720,2024-12-29,0.892041,379.986466,2.768692,1.776434,1.584653
721,2024-12-30,1.099271,448.264662,2.696047,1.776434,1.952784


# Cluster PV

## 2-D Cluster

In [3]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.cluster import KMeans
import numpy as np

# 2-D clustering of daily averages: irradiance vs. corrected PV power.
var1 = 'Irradiance_avg (W/m2)'
var2 = 'PVtot_correct_avg (kW)'
n_clusters = 3

# Keep only the days on which both features are available.
X_2D = dailymean_merge_df[[var1, var2, 'Date']].dropna()

# Fit KMeans on the two features with a fixed seed.
# NOTE(review): the features are clustered unscaled; irradiance is in the
# hundreds of W/m2 while PV is a few kW, so irradiance dominates the Euclidean
# distance -- confirm this is intended.
kmeans = KMeans(n_clusters=n_clusters, random_state=103)
kmeans.fit(X_2D[[var1, var2]])

# Add cluster labels to the DataFrame
X_2D['Cluster'] = kmeans.labels_

# KMeans label numbers are arbitrary. Relabel so that cluster 0 is the one
# whose centroid has the smallest Euclidean norm, cluster 1 the next, etc.
norms = np.linalg.norm(kmeans.cluster_centers_, axis=1)
sorted_indices = np.argsort(norms)
label_mapping = {old_label: new_label for new_label, old_label in enumerate(sorted_indices)}
X_2D['Cluster'] = X_2D['Cluster'].map(label_mapping)

# Centroids reordered to match the new labels. The names are generated from
# n_clusters so that changing the cluster count does not break this cell.
centers = pd.DataFrame(kmeans.cluster_centers_[sorted_indices], columns=[var1, var2])
centers['Cluster'] = [f'Cluster {i}' for i in range(n_clusters)]

print(X_2D)

# Create a subplot figure with 1 row and 2 columns
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=[
        f"Fitted clusters of daily-averaged values of {var1} and {var2}",
        f"Histogram of daily-averaged {var1}"
    ]
)

# Subplot 1: scatter of the two features coloured by cluster. The traces are
# built with plotly express and then copied into the subplot figure.
scatter = px.scatter(
    X_2D,
    x=var1,
    y=var2,
    color='Cluster',
    labels={var1: f'daily-averaged {var1}', var2: f'daily-averaged {var2}'},
    hover_data={'Date': True, var1: True, var2: True},
).data

for trace in scatter:
    fig.add_trace(trace, row=1, col=1)

# Overlay the cluster centroids as red crosses
fig.add_trace(
    go.Scatter(
        x=centers[var1],
        y=centers[var2],
        mode='markers',
        marker=dict(size=10, color='red', symbol='x'),
        name='Centroids'
    ),
    row=1, col=1
)

# Subplot 2: histogram of the first feature
fig.add_trace(
    go.Histogram(
        x=X_2D[var1],
        name=f'Histogram of {var1}',
        marker_color='blue',
        opacity=0.75
    ),
    row=1, col=2
)

# Update layout
fig.update_layout(
    template='plotly_white',
    width=1500,
    height=600,
    showlegend=True,
    xaxis_title=f'daily-averaged {var1}',
    yaxis_title=f'daily-averaged {var2}',
    xaxis2_title=f'daily-averaged {var1}',
    yaxis2_title='Frequency',
)

fig.show()




     Irradiance_avg (W/m2)  PVtot_correct_avg (kW)        Date  Cluster
65              375.075188                1.734432  2023-03-10        1
66              365.514286                1.701585  2023-03-11        1
67              364.151880                1.676070  2023-03-12        1
68              370.514286                1.776015  2023-03-13        1
69              495.736842                2.413711  2023-03-14        2
..                     ...                     ...         ...      ...
718             385.640602                1.629647  2024-12-27        1
719             379.111278                1.603541  2024-12-28        1
720             379.986466                1.584653  2024-12-29        1
721             448.264662                1.952784  2024-12-30        1
722             443.425564                1.936768  2024-12-31        1

[538 rows x 4 columns]


## 1-D Cluster

In [4]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.cluster import KMeans
import numpy as np

# 1-D clustering of the daily-averaged corrected PV power.
var1 = 'PVtot_correct_avg (kW)'
n_clusters = 3

# Keep only the days on which the PV average is available.
X_1D = dailymean_merge_df[[var1, 'Date']].dropna()

# Fit KMeans on the single feature with a fixed seed.
kmeans = KMeans(n_clusters=n_clusters, random_state=103)
kmeans.fit(X_1D[[var1]])

# Add cluster labels to the DataFrame
X_1D['Cluster'] = kmeans.labels_

# KMeans label numbers are arbitrary. Order the clusters by the absolute value
# of their (1-D) centroid so that cluster 0 has the smallest centroid.
centers = pd.DataFrame(kmeans.cluster_centers_, columns=[var1])
centers['Norm'] = centers[var1].abs()
sorted_indices = centers.sort_values(by='Norm').index

# Map old cluster labels to new ones based on the sorted order
label_mapping = {old_label: new_label for new_label, old_label in enumerate(sorted_indices)}
X_1D['Cluster'] = X_1D['Cluster'].map(label_mapping)

# Reorder the centroids to match the new labels
centers = centers.loc[sorted_indices].reset_index(drop=True)
centers['Cluster'] = [f'Cluster {i}' for i in range(n_clusters)]

# Create a subplot figure with 1 row and 2 columns
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=[
        f"Fitted clusters of daily-averaged {var1}",
        f"Histogram of daily-averaged {var1}"
    ]
)

# Subplot 1: the 1-D values drawn along a line at y = 0, coloured by cluster.
# The constant 'y' column exists only on a temporary copy made by .assign(),
# so X_1D itself is never modified for plotting and nothing has to be dropped
# afterwards (even if a later statement in this cell fails).
scatter = px.scatter(
    X_1D.assign(y=0),
    x=var1,
    y='y',
    color='Cluster',
    labels={var1: f'daily-averaged {var1}', 'y': ''},
    hover_data={'Date': True, var1: True},
).data

for trace in scatter:
    fig.add_trace(trace, row=1, col=1)

# Overlay the cluster centroids as red crosses on the same y = 0 line
fig.add_trace(
    go.Scatter(
        x=centers[var1],
        y=[0] * len(centers),
        mode='markers',
        marker=dict(size=10, color='red', symbol='x'),
        name='Centroids'
    ),
    row=1, col=1
)

# Subplot 2: Histogram of the selected variable
fig.add_trace(
    go.Histogram(
        x=X_1D[var1],
        name=f'Histogram of {var1}',
        marker_color='blue',
        opacity=0.75
    ),
    row=1, col=2
)

# Update layout
fig.update_layout(
    template='plotly_white',
    width=1500,
    height=600,
    showlegend=True,
    xaxis_title=f'daily-averaged {var1}',
    yaxis_title='',
    xaxis2_title=f'daily-averaged {var1}',
    yaxis2_title='Frequency',
)

print(X_1D)

fig.show()




KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=3.



     PVtot_correct_avg (kW)        Date  Cluster
65                 1.734432  2023-03-10        1
66                 1.701585  2023-03-11        1
67                 1.676070  2023-03-12        1
68                 1.776015  2023-03-13        1
69                 2.413711  2023-03-14        2
..                      ...         ...      ...
718                1.629647  2024-12-27        1
719                1.603541  2024-12-28        1
720                1.584653  2024-12-29        1
721                1.952784  2024-12-30        1
722                1.936768  2024-12-31        1

[607 rows x 3 columns]


## Merge the 1-D and 2-D clusters

- If Cluster_PI is NaN -> use the cluster from Cluster_P
- Otherwise -> use the cluster from Cluster_PI

In [5]:
# Outer-join the 1-D and 2-D results on 'Date'. Both frames carry a 'Cluster'
# column, so the merge renames them to 'Cluster_P' (from X_1D) and
# 'Cluster_PI' (from X_2D).
label_columns = ['Cluster_P', 'Cluster_PI', 'Cluster']
cluster_df = pd.merge(
    X_1D,
    X_2D[['Date', 'Cluster', 'Irradiance_avg (W/m2)']],
    on='Date',
    how='outer',
    suffixes=('_P', '_PI'),
)

# Final label: take the 2-D label where it exists, otherwise the 1-D label.
has_2d_label = cluster_df['Cluster_PI'].notna()
cluster_df['Cluster'] = cluster_df['Cluster_PI'].where(has_2d_label, cluster_df['Cluster_P'])

# Convert every present label to int and leave missing ones untouched.
cluster_df[label_columns] = cluster_df[label_columns].map(
    lambda value: value if pd.isna(value) else int(value)
)

cluster_df = cluster_df.loc[
    :, ['Date', 'PVtot_correct_avg (kW)', 'Irradiance_avg (W/m2)', 'Cluster_P', 'Cluster_PI', 'Cluster']
]
cluster_df

Unnamed: 0,Date,PVtot_correct_avg (kW),Irradiance_avg (W/m2),Cluster_P,Cluster_PI,Cluster
0,2023-03-10,1.734432,375.075188,1,1.0,1
1,2023-03-11,1.701585,365.514286,1,1.0,1
2,2023-03-12,1.676070,364.151880,1,1.0,1
3,2023-03-13,1.776015,370.514286,1,1.0,1
4,2023-03-14,2.413711,495.736842,2,2.0,2
...,...,...,...,...,...,...
602,2024-12-27,1.629647,385.640602,1,1.0,1
603,2024-12-28,1.603541,379.111278,1,1.0,1
604,2024-12-29,1.584653,379.986466,1,1.0,1
605,2024-12-30,1.952784,448.264662,1,1.0,1


## Save Cluster PV

In [6]:
import os

# Export table: date, daily-averaged corrected PV and the final cluster label,
# with the two value columns renamed for the output file.
save_df = cluster_df[['Date', 'PVtot_correct_avg (kW)', 'Cluster']].rename(
    columns={'PVtot_correct_avg (kW)': 'PV_avg (kW)', 'Cluster': 'PV_Cluster'}
)
print(save_df)

output_dir = '../processed_data/'
output_file = 'pv_8kW_cluster.csv'
output_path = os.path.join(output_dir, output_file)

# Create the target folder if needed, then write the CSV without the index.
os.makedirs(output_dir, exist_ok=True)
save_df.to_csv(output_path, index=False)

print(f"Dataframe saved to {output_path}")

           Date  PV_avg (kW)  PV_Cluster
0    2023-03-10     1.734432           1
1    2023-03-11     1.701585           1
2    2023-03-12     1.676070           1
3    2023-03-13     1.776015           1
4    2023-03-14     2.413711           2
..          ...          ...         ...
602  2024-12-27     1.629647           1
603  2024-12-28     1.603541           1
604  2024-12-29     1.584653           1
605  2024-12-30     1.952784           1
606  2024-12-31     1.936768           1

[607 rows x 3 columns]
Dataframe saved to ../processed_data/pv_8kW_cluster.csv


## Box plot of each cluster

In [7]:
# One box plot per quantity, grouped by the final cluster label.
# Each entry is (column to plot, figure title, axis label).
box_specs = [
    ('PVtot_correct_avg (kW)', 'Box Plot of Ptot_correct by Cluster', 'Ptot_correct (kW)'),
    ('Irradiance_avg (W/m2)', 'Box Plot of Irradiance by Cluster', 'Irradiance (W/m2)'),
]

for column, title, axis_label in box_specs:
    fig = px.box(
        cluster_df,
        x='Cluster',
        y=column,
        points='all',  # Show all data points on the box plot
        title=title,
        # The label key must be the plotted column name for the mapping to
        # take effect (it is used for the axis title and hover text).
        labels={'Cluster': 'Cluster', column: axis_label},
    )
    fig.update_layout(
        xaxis=dict(title='Cluster'),
        yaxis=dict(title=axis_label),
        boxmode='group'  # Group box plots by clusters
    )
    fig.show()