In [1]:
import pandas as pd

from cluster import cluster

In [2]:
def identify_clusters(X_data_df, threshold, method="pearson"):
    """Group the columns of a data frame into clusters of correlated features.

    The pairwise correlation matrix of ``X_data_df`` is computed, and every
    pair of distinct columns whose absolute correlation is strictly greater
    than ``threshold`` becomes a ``(col, row, correlation)`` tuple. Each tuple
    is offered to the clusters built so far; a tuple that no cluster accepts
    starts a new cluster. Clusters that share at least one node are then
    merged so that the returned clusters are pairwise disjoint.

    Parameters
    ----------
    X_data_df : pandas.DataFrame
        Frame whose columns are the candidate features.
    threshold : float
        Cut-off applied to the absolute value of the correlation.
    method : str, default "pearson"
        Forwarded unchanged to ``DataFrame.corr``.

    Returns
    -------
    list
        ``cluster`` objects, each renamed ``cluster_<position>`` according to
        its position in the returned list.

    Notes
    -----
    Relies on the project-local ``cluster`` class (``can_accept``,
    ``update_with``, ``merge_with_cluster``, ``nodes``, ``name``).
    NOTE(review): the merge step assumes ``merge_with_cluster`` folds the
    other cluster's nodes into the receiver -- confirm against ``cluster``.
    """
    cor = X_data_df.corr(method=method)

    # Stage 1: walk the upper triangle of the correlation matrix (i < j) and
    # hand every sufficiently correlated pair to the clusters seen so far.
    clusters = []
    for j, col in enumerate(cor.columns):
        for i, row in enumerate(cor.columns[0:j]):
            correlation = cor.iloc[i, j]
            # NaN correlations compare False here and are therefore skipped.
            if abs(correlation) > threshold:
                current_pair = (col, row, correlation)
                current_pair_added = False
                for _c in clusters:
                    if _c.can_accept(current_pair):
                        _c.update_with(current_pair)
                        current_pair_added = True
                if not current_pair_added:
                    clusters.append(cluster(pairs=[current_pair]))

    # Stage 2: merge clusters that share nodes. A cluster may overlap several
    # already-final clusters at once; in that case all of them are folded into
    # a single host, otherwise the result would still contain clusters with
    # common nodes.
    final_clusters = []
    for _cluster in clusters:
        overlapping = [
            final_c
            for final_c in final_clusters
            if _cluster.nodes.intersection(final_c.nodes)
        ]
        if not overlapping:
            final_clusters.append(_cluster)
            continue

        host = overlapping[0]
        absorbed = overlapping[1:]
        host.merge_with_cluster(_cluster)
        for other in absorbed:
            host.merge_with_cluster(other)
        if absorbed:
            # Drop absorbed clusters by identity so a custom __eq__ on the
            # cluster class cannot remove the wrong element.
            final_clusters = [
                final_c
                for final_c in final_clusters
                if not any(final_c is gone for gone in absorbed)
            ]

    for i, _cluster in enumerate(final_clusters):
        _cluster.name = f"cluster_{i}"
    return final_clusters

## Parameters

In [3]:
# Column of the dataset that is split off as the response variable.
target = "PCEC96_perc_diff_lag_1"
# CSV file that is read into the working data frame.
dpath = "PCEC96_perc_diff_lag_1.csv"

## Data

In [4]:
# Load the CSV, parse the reference dates and use them as a sorted index.
data = (
    pd.read_csv(dpath)
    .assign(reference_date=lambda frame: pd.to_datetime(frame["reference_date"]))
    .set_index("reference_date")
    .sort_index()
)

# Split the frame into the candidate features and the response column.
X_data = data.drop(columns=target)
y_data = data[target]

## Execution section

In [5]:
# Features are grouped when their absolute pairwise correlation exceeds this cut-off.
correlation_threshold = 0.7
final_clusters = identify_clusters(X_data_df=X_data, threshold=correlation_threshold)

In [6]:
# Number of clusters returned by identify_clusters.
len(final_clusters)

17

In [7]:
# Each cluster has the following attributes: name, nodes and pairs
# (i.e. tuples of nodes with their pairwise correlation).
cluster_example = final_clusters[2]

# One print call; the "\n" separator reproduces the blank line after each entry.
print(
    f"Name: {cluster_example.name}\n",
    f"Nodes: {cluster_example.nodes}\n",
    f"Pairs: {cluster_example.pairs}\n",
    sep="\n",
)

Name: cluster_2

Nodes: {'M1REAL_diff_lag_3', 'LNS17100000', 'LNS17500000_perc_diff_lag_3', 'M1SL_diff_lag_3', 'LNS17100000_perc_diff_lag_12', 'M1SL_perc_diff_lag_3', 'LNS17100000_perc_diff_lag_3', 'M1NS_diff_lag_3', 'LNS17100000_diff_lag_3', 'M1REAL_perc_diff_lag_3', 'M1NS_perc_diff_lag_3'}

Pairs: {('M1REAL_perc_diff_lag_3', 'LNS17100000_perc_diff_lag_3', 0.9607073865703695), ('M1SL_diff_lag_3', 'M1NS_perc_diff_lag_3', 0.988662563658455), ('M1REAL_diff_lag_3', 'LNS17500000_perc_diff_lag_3', 0.9241630321302535), ('M1REAL_diff_lag_3', 'M1NS_diff_lag_3', 0.9967204369984529), ('M1REAL_diff_lag_3', 'LNS17100000_perc_diff_lag_3', 0.9460008117186878), ('M1SL_diff_lag_3', 'M1NS_diff_lag_3', 0.9994591807453851), ('M1REAL_perc_diff_lag_3', 'M1NS_perc_diff_lag_3', 0.9992524872497489), ('M1SL_diff_lag_3', 'M1REAL_diff_lag_3', 0.9971017358440378), ('M1NS_perc_diff_lag_3', 'LNS17100000_diff_lag_3', 0.816275809476747), ('M1NS_perc_diff_lag_3', 'LNS17100000_perc_diff_lag_12', 0.8409484472633635), ('