In [1]:
import piccard as pc
import piccard2 as pc2

# install other dependencies, pip install first if needed
from tscluster.tsplot import tsplot
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Location of the piccard2 testing data; one GeoJSON file per census year.
_HOUSEHOLDS_URL = "https://raw.githubusercontent.com/ecorbin567/piccard2/refs/heads/main/docs/piccard2_testing_data/households_data_{year}.geojson"

# The raw census column names differ from year to year. Map each year's names
# onto one shared set of short names so every year exposes the same columns.
_HOUSEHOLDS_COLUMNS = {
    2021: {
        'v_CA21_434: Occupied private dwellings by structural type of dwelling data': 'occupied_private_dwellings',
        'v_CA21_435: Single-detached house': 'single_detached_house',
        'v_CA21_440: Apartment in a building that has five or more storeys': 'apt_five_or_more',
    },
    2016: {
        'v_CA16_408: Occupied private dwellings by structural type of dwelling data': 'occupied_private_dwellings',
        'v_CA16_409: Single-detached house': 'single_detached_house',
        'v_CA16_410: Apartment in a building that has five or more storeys': 'apt_five_or_more',
    },
    2011: {
        'v_CA11F_199: Total number of occupied private dwellings by structural type of dwelling': 'occupied_private_dwellings',
        'v_CA11F_200: Single-detached house': 'single_detached_house',
        'v_CA11F_201: Apartment, building that has five or more storeys': 'apt_five_or_more',
    },
    2006: {
        'v_CA06_119: Total number of occupied private dwellings by structural type of dwelling - data': 'occupied_private_dwellings',
        'v_CA06_120: Single-detached house': 'single_detached_house',
        'v_CA06_124: Apartment, building that has five or more storeys': 'apt_five_or_more',
    },
}


def _load_households(year):
    """Read one census year's households GeoJSON and apply the shared column names."""
    raw = gpd.read_file(_HOUSEHOLDS_URL.format(year=year))
    return raw.rename(columns=_HOUSEHOLDS_COLUMNS[year])


households_data_2021 = _load_households(2021)
households_data_2016 = _load_households(2016)
households_data_2011 = _load_households(2011)
households_data_2006 = _load_households(2006)

In [3]:
# Pair every census year with its frame, oldest first, then derive the two
# parallel lists that the piccard helpers take.
households_by_year = {
    '2006': households_data_2006,
    '2011': households_data_2011,
    '2016': households_data_2016,
    '2021': households_data_2021,
}
years = list(households_by_year.keys())
census_dfs = list(households_by_year.values())

# Both calls identify geographic areas by their 'GeoUID' column.
network_table = pc.create_network_table(census_dfs, years, 'GeoUID')
# NOTE(review): 0.05 is the fourth positional argument of create_network --
# presumably a threshold; confirm its meaning against the piccard docs.
G = pc.create_network(census_dfs, years, 'GeoUID', 0.05)

In [4]:
# The columns passed to clustering_prep are named "<feature>_<year>".
# Generate them year by year, with the three dwelling features in a fixed
# order inside each year, rather than typing all twelve names by hand.
dwelling_features = ['occupied_private_dwellings', 'single_detached_house', 'apt_five_or_more']
clustering_columns = [
    f"{feature}_{year}"
    for year in years
    for feature in dwelling_features
]

arr, label_dict = pc2.clustering_prep(network_table, 'name', clustering_columns)

In [5]:
# Fit the clustering model on the prepared array; progress is printed below.
# NOTE(review): the literal 4 is presumably the number of clusters --
# confirm against the piccard2.cluster signature.
tsc = pc2.cluster(
    network_table,
    G,
    'GeoUID',
    4,
    arr=arr,
    label_dict=label_dict,
)

Initialization with kmeans++, Sum of Distance: 2378037516.6667, Max Distance: 6859.2583
Iteration 0, Sum of distance: 1783766278.9400, Max distance: 7594.8886, Number of change: 149
Iteration 5, Sum of distance: 1562560998.9736, Max distance: 7594.8886, Number of change: 46
Iteration 10, Sum of distance: 1553885796.4576, Max distance: 7594.8886, Number of change: 0
Converged at iteration 15, Sum of distance: 1553885796.4576, Max distance: 7594.8886


In [None]:
import numpy as np
import plotly.graph_objects as go
import plotly
from typing import Union, List, Tuple
from tscluster.opttscluster import OptTSCluster
from tscluster.greedytscluster import GreedyTSCluster
from itertools import cycle, islice
import pandas as pd

def plot_plotly(
    tsc: Union[OptTSCluster, GreedyTSCluster],
    network_table: pd.DataFrame,
    arr: np.ndarray[np.float64],
    label_dict: dict,
    dynamic_entities_only: bool = True,
    entities_to_show: List[int] | None = None,
    clusters_to_show: List[int] | None = None,
    clusters_to_exclude: List[int] | None = None,
    cluster_centres_to_show: List[int] | None = None,
    figsize: Tuple[float, float] | None = None,
    shape_of_subplot: Tuple[int, int] | None = None,
    cluster_labels: List[str] | None = None,
    title_list: List[str] | None = None,
    x_rotation: float | int = 45,
    hover_labels: bool = False,
    ) -> go.Figure:
    """Plot clustered time series with Plotly, one subplot per feature.

    Each selected entity is drawn as a dotted black line with one marker per
    timestep, coloured by the cluster the entity belongs to at that timestep.
    Cluster centres are drawn on top as solid coloured lines.

    Parameters
    ----------
    tsc
        Fitted clustering model. Its ``cluster_centers_`` and ``labels_``
        attributes and its ``get_dynamic_entities()`` method are used.
    network_table
        Table with one row per entity, in the same order as axis 1 of ``arr``.
        The last ``T`` columns of each row are read as that entity's cluster
        ids (assumed to be one column per timestep -- TODO confirm against
        the function that produces the table).
    arr
        Data indexed as ``arr[timestep, entity, feature]``. May be ``None``,
        in which case only the cluster centres are drawn.
    label_dict
        Dictionary with keys ``'T'`` (x-axis values), ``'N'`` (list of entity
        labels) and ``'F'`` (feature names used for the default titles).
    dynamic_entities_only
        If True, keep only entities returned by ``tsc.get_dynamic_entities()``.
    entities_to_show
        Positional indices (along axis 1 of ``arr``) of candidate entities.
        Defaults to every entity.
    clusters_to_show
        Keep an entity only if it is in at least one of these clusters at
        some timestep. Defaults to every cluster.
    clusters_to_exclude
        Drop an entity if it is in any of these clusters at some timestep.
        Defaults to excluding nothing.
    cluster_centres_to_show
        Indices of the cluster centres to draw. Defaults to every cluster.
    figsize
        ``(width, height)`` in pixels. Defaults to 700 per subplot column and
        500 per subplot row.
    shape_of_subplot
        ``(rows, cols)`` of the subplot grid; features fill it row by row.
        Defaults to ``(F, 1)``.
    cluster_labels
        Display name of each cluster, indexed by cluster id. Defaults to the
        cluster ids.
    title_list
        Subplot titles. Defaults to ``"Feature <name>"`` for each feature.
    x_rotation
        Tick angle of the x axis, in degrees.
    hover_labels
        If True, lines are drawn with markers (``'lines+markers'``) so that
        every point can be hovered.

    Returns
    -------
    go.Figure
        The assembled figure; call ``.show()`` to render it.

    Raises
    ------
    ValueError
        If ``shape_of_subplot`` has fewer cells than there are features.
    """
    # Imported explicitly: `import plotly` on its own does not guarantee that
    # the `plotly.subplots` submodule has been loaded.
    from plotly.subplots import make_subplots

    # define cluster centres and labels from tsc
    cluster_centres = tsc.cluster_centers_
    labels = tsc.labels_

    # number of timesteps, entities, features, and clusters
    T = arr.shape[0] if arr is not None else cluster_centres.shape[0]
    N = arr.shape[1] if arr is not None else 0
    F = arr.shape[2] if arr is not None else cluster_centres.shape[2]
    K = cluster_centres.shape[1] if cluster_centres is not None else (np.unique(labels).size if labels is not None else 1)

    # set default values
    if entities_to_show is None:
        # Entities are addressed by position everywhere below (arr[:, i, f],
        # network_table.iloc[i]), so default to positions, not to labels.
        entities_to_show = list(range(N))
    if clusters_to_show is None:
        clusters_to_show = list(range(K))
    if clusters_to_exclude is None:
        clusters_to_exclude = []
    if cluster_centres_to_show is None:
        cluster_centres_to_show = list(range(K))
    if shape_of_subplot is None:
        shape_of_subplot = (F, 1)
    if cluster_labels is None:
        cluster_labels = list(range(K))
    if title_list is None:
        title_list = [f"Feature {f}" for f in label_dict['F']]

    n_rows, n_cols = shape_of_subplot
    if n_rows * n_cols < F:
        raise ValueError(
            f"shape_of_subplot {shape_of_subplot} has {n_rows * n_cols} cells "
            f"but there are {F} features to plot"
        )

    # set colours, repeating the palette if there are more clusters than colours
    colors = plotly.colors.qualitative.Plotly
    if K > len(colors):
        colors = list(islice(cycle(colors), K))

    # define subplots for each feature
    fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=title_list, shared_xaxes=False, vertical_spacing=0.06)

    # figure out which entities to show: an entity is kept when it visits at
    # least one wanted cluster and never visits an excluded one
    wanted_clusters = set(clusters_to_show)
    excluded_clusters = set(clusters_to_exclude)
    selected = []
    for item in entities_to_show:
        # the trailing T columns hold the entity's cluster id at each timestep
        visited = {int(c) for c in network_table.iloc[item, -T:]}
        if visited & wanted_clusters and visited.isdisjoint(excluded_clusters):
            selected.append(item)
    entities_to_show = selected

    if dynamic_entities_only:
        # get_dynamic_entities() yields entity labels; convert them to positions
        entity_labels = label_dict['N']
        dynamic_entities = {entity_labels.index(e) for e in tsc.get_dynamic_entities()[0]}
        entities_to_show = [item for item in entities_to_show if item in dynamic_entities]

    line_mode = 'lines+markers' if hover_labels else 'lines'
    x_values = label_dict['T']

    # iterate through features, filling the subplot grid row by row so that
    # traces line up with the (row-major) subplot titles
    for f in range(F):
        row, col = divmod(f, n_cols)
        row += 1
        col += 1

        if arr is not None:
            # iterate through each path
            for i in entities_to_show:
                # plot lines indicating values
                fig.add_trace(
                    go.Scatter(
                        x=x_values,
                        y=arr[:, i, f],
                        mode=line_mode,
                        line=dict(color='black', dash='dot'),
                        showlegend=False
                    ),
                    row=row, col=col
                )
                # plot coloured dots indicating cluster
                if labels is not None:
                    if labels.ndim == 1:
                        # one label per entity: same colour at every timestep
                        marker_colors = colors[int(labels[i])]
                    else:
                        # one label per (entity, timestep): colour each dot by
                        # the cluster the entity is in at that timestep
                        marker_colors = [colors[int(c)] for c in labels[i]]
                    fig.add_trace(
                        go.Scatter(
                            x=x_values,
                            y=arr[:, i, f],
                            mode='markers',
                            marker=dict(color=marker_colors, size=6),
                            name=f"Path {i}",
                            showlegend=False
                        ),
                        row=row, col=col
                    )

        # plot cluster centres
        if cluster_centres is not None:
            for j in range(K):
                if j not in cluster_centres_to_show:
                    continue
                fig.add_trace(
                    go.Scatter(
                        x=x_values,
                        y=cluster_centres[:, j, f],
                        mode=line_mode,
                        line=dict(color=colors[j]),
                        # named in every subplot so hover text is meaningful;
                        # listed in the legend only once (first feature)
                        name=f"Cluster {cluster_labels[j]}",
                        showlegend=(f == 0)
                    ),
                    row=row, col=col
                )

        # add axis labels
        fig.update_xaxes(title_text='Year' if f == F - 1 else "", tickangle=x_rotation, row=row, col=col)
        fig.update_yaxes(title_text='Value', row=row, col=col)

    # set default figsize from the grid shape
    if figsize is None:
        figsize = (700 * n_cols, 500 * n_rows)

    # add title and legend
    fig.update_layout(
        width=figsize[0],
        height=figsize[1],
        title="Clustering Results",
        legend_title="Legend",
        showlegend=True,
    )
    return fig


In [None]:
# Note: when working in an .ipynb, run `%pip install nbformat` first if it is not installed.
# TODO (next steps):
# - reduce the dependency on tsc, arr, and label_dict if possible
# - write documentation for the new plot function
# - update the testing notebook
# - make sure all four colours always show up in the legend
# - maybe use three separate legends? how much more time would three separate plots take?
# - pass the same colours in to all plots so they are consistent and work with colourblindness
# - make the other plots more customizable: titles, x-axis rotation, cluster labels, figsize
# Show only entities that pass through cluster 3, draw the centres of
# clusters 2 and 3, and add markers to the lines so each point can be hovered.
plot_options = {
    'clusters_to_show': [3],
    'cluster_centres_to_show': [2, 3],
    'hover_labels': True,
}
fig = plot_plotly(tsc, network_table, arr, label_dict, **plot_options)
fig.show()