In [None]:
# Standard library imports
import datetime
from datetime import date, timedelta
import os
import sys
from pathlib import Path

# Set up environment variables and system paths
os.environ['USE_PYGEOS'] = '0'
sys.path.append('./utils')

# Third-party imports
import contextily as ctx
import geopandas as gpd
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import pandas as pd
import pysal
import seaborn as sns
import pysda
import scipy
import scipy.stats
from scipy.stats import entropy
from scipy import interpolate
from sklearn.preprocessing import MinMaxScaler
from geofeather import from_geofeather, to_geofeather
from statsmodels.nonparametric.smoothers_lowess import lowess as  sm_lowess
from mpl_toolkits.axes_grid1 import make_axes_locatable
from matplotlib import patches as mpatches
from matplotlib_scalebar.scalebar import ScaleBar
from shapely.geometry import Point, Polygon
from shapely.ops import nearest_points
from tableone import TableOne

# Local application/library specific imports
import pyspace
import utils


# Display: render up to 500 columns when showing wide DataFrames
pd.options.display.max_columns = 500

# Figure styling shared by every plot in the notebook
# (mpl.rcParams and plt.rcParams are the same object)
mpl.rcParams.update({
    'font.family': 'Avenir',
    'font.size': 18,
    'axes.linewidth': 2,
})

# Project-relative input and output directories
data_folder = Path('..') / 'data'
res_folder = Path('..') / 'results'

# DATA IMPORT placeholder
# data_filename = input("Data file's name:")

# Load data

## Municipalities - Load municipalities from the canton of Vaud

In [None]:
# Read the swissBOUNDARIES3D municipality polygons
gdf_communes = gpd.read_file(
    data_folder / 'Administrative units' / 'swissBOUNDARIES3D_1_3_TLM_HOHEITSGEBIET.shp',
    engine='pyogrio',
)

# Keep canton number 22 (Vaud), discard the lake polygons and rows without geometry
_vd_lake_names = ['Lac Léman (VD)', 'Lac de Neuchâtel (VD)', 'Lac de Morat (VD)']
_is_vaud = gdf_communes['KANTONSNUM'] == 22
gdf_communes_vd = gdf_communes[_is_vaud]
gdf_communes_vd = gdf_communes_vd[~gdf_communes_vd['NAME'].isin(_vd_lake_names)]
gdf_communes_vd = gdf_communes_vd.dropna(subset=['geometry']).reset_index(drop=True)

# Reproject to EPSG:2056 and tag every row with a constant key ...
gdf_communes_vd = gdf_communes_vd.to_crs(2056).assign(dummy=1)

# ... so that dissolving on that key merges all municipalities into one canton outline
gdf_canton_vd = gdf_communes_vd.dissolve(by='dummy', as_index=False)

## Population data
### Load inhabited hectares of the canton of Vaud

In [None]:
# Read the STATPOP 2020 table and turn every row into a point (EPSG:2056)
df_statpop = pd.read_csv(data_folder / 'ag-b-00.03-vz2020statpop/STATPOP2020.csv', sep=';')
geometry_points = [
    Point(easting, northing)
    for easting, northing in zip(df_statpop['E_KOORD'], df_statpop['N_KOORD'])
]
statpop_gdf_point = gpd.GeoDataFrame(df_statpop, crs=2056, geometry=geometry_points)

# Inner spatial join: only points intersecting a Vaud municipality are kept
gdf_statpop_communes_vd_point = (
    gpd.sjoin(statpop_gdf_point, gdf_communes_vd, predicate='intersects')
    .drop(columns='index_right')
)

# Build a 100 x 100 square per point, with (E_KOORD, N_KOORD) as its minimum-x/minimum-y corner
geometry_squares = [
    Polygon([(west, south), (west, south + 100), (west + 100, south + 100), (west + 100, south)])
    for west, south in zip(gdf_statpop_communes_vd_point['E_KOORD'],
                           gdf_statpop_communes_vd_point['N_KOORD'])
]
gdf_statpop_communes_vd_ha = gpd.GeoDataFrame(
    gdf_statpop_communes_vd_point, crs=2056, geometry=geometry_squares
)

In [None]:
# Sum of B20BTOT over all joined hectare points, per commune NAME
# (used as the commune population denominator further down)
pop_commune_ha = gdf_statpop_communes_vd_point.groupby('NAME').B20BTOT.sum()

## COVID testing data
### Load CHUV testing data

In [None]:
# Location of the (non-public) testing database. Override it with the GEOCOVID_DB_PATH
# environment variable; the original author's local path is kept as the fallback.
db_covid_path = Path(os.environ.get('GEOCOVID_DB_PATH', '/Users/david/Switchdrive/GEOCOVID/'))
df_covid = gpd.read_file(
    db_covid_path / 'processed_data' / 'COVID_3eme_round_archive' / 's3_20dfiltered_tests_geo.gpkg',
    engine='pyogrio',
)
# Alternative: load the toy COVID data shipped with the repository
# df_covid = pd.read_csv('../data/covid_data_example.csv')
# df_covid = gpd.GeoDataFrame(df_covid, crs=4326, geometry=gpd.points_from_xy(df_covid.longitude, df_covid.latitude))
# df_covid = df_covid.to_crs(2056)

### Date processing

In [None]:
# Normalise 'date_reception' to a 'YYYY-MM-DD' string (later cells compare it with strings)
df_covid['date_reception'] = pd.to_datetime(df_covid['date_reception']).dt.strftime('%Y-%m-%d')

# Parse once and derive every calendar component from the same datetime series
_reception_day = pd.to_datetime(df_covid['date_reception'])
_iso_calendar = _reception_day.dt.isocalendar()

df_covid['month'] = _reception_day.dt.strftime('%Y-%m')
df_covid['week'] = _iso_calendar.week
df_covid['year'] = _iso_calendar.year

# Monday of the ISO week the test falls in.
# 'year'/'week' are ISO-8601 values, so they must not be re-parsed with '%Y-%W-%w':
# %W counts weeks from the first Monday of the calendar year, which moved every 2020
# week start one week forward. Subtracting the weekday (Monday == 0) gives the Monday directly.
df_covid['week_str'] = _reception_day - pd.to_timedelta(_reception_day.dt.weekday, unit='D')

In [None]:
# Count of distinct id_patient_study2 values
print('Total number of individuals :', df_covid.id_patient_study2.nunique())

In [None]:
# One row of df_covid per test
print('Total number of tests :', df_covid.shape[0])

### Period definition

In [None]:
# Boundaries of the five analysis periods ('YYYY-MM-DD' strings, same format as date_reception)
p1_start = '2020-01-10'
p2_start = '2020-06-30'
p3_start = '2020-12-16'
p4_start = '2021-05-07'
p5_start = '2021-11-28'
p5_end = '2022-04-16'

# Label each test with its period. `between` includes both ends, and the windows are
# applied in order, so a test dated exactly on a shared boundary ends up in the later period.
_period_windows = {
    'p1': (p1_start, p2_start),
    'p2': (p2_start, p3_start),
    'p3': (p3_start, p4_start),
    'p4': (p4_start, p5_start),
    'p5': (p5_start, p5_end),
}
for _period_label, (_window_start, _window_end) in _period_windows.items():
    _in_window = df_covid['date_reception'].between(_window_start, _window_end)
    df_covid.loc[_in_window, 'period'] = _period_label

### Data filtering

In [None]:
def map_data(target_gdf, source_gdf, join_column, columns_to_map):
    """Copy columns from `source_gdf` into `target_gdf` by key lookup.

    For every name in `columns_to_map`, the value of `target_gdf[join_column]` is looked
    up in `source_gdf` (indexed by `join_column`) and the matching value is written to
    `target_gdf[name]`. Keys absent from `source_gdf` produce missing values.

    Parameters
    ----------
    target_gdf : DataFrame-like, modified in place and also returned.
    source_gdf : DataFrame-like holding `join_column` and every column to map.
    join_column : name of the key column present in both frames.
    columns_to_map : iterable of column names to copy.

    Returns
    -------
    The same `target_gdf` object, with the mapped columns added or overwritten.
    """
    if not columns_to_map:
        return target_gdf

    # Build the lookup table once instead of re-indexing the source for every column
    lookup = source_gdf.set_index(join_column)
    keys = target_gdf[join_column]
    for col in columns_to_map:
        target_gdf[col] = keys.map(lookup[col])
    return target_gdf

def process_covid_data(df_covid, gdf_statpop, gdf_communes):
    """Attach each test in `df_covid` to a STATPOP hectare and its commune attributes.

    Parameters
    ----------
    df_covid : GeoDataFrame of tests; must hold the columns selected in the join below.
    gdf_statpop : GeoDataFrame of hectares with 'RELI', 'B20BTOT', 'NAME', 'EINWOHNERZ'.
    gdf_communes : not used by the body.

    Returns
    -------
    GeoDataFrame with one row per 'id_demande_study2', plus 'RELI', 'B20BTOT', 'NAME',
    'EINWOHNERZ', 'pop_commune' and an integer 'period'.

    Notes
    -----
    Reads the module-level `pop_commune_ha` series, so that variable must exist
    before the function is called.
    """
    # Right join: the result is built from the test rows (right-hand frame)
    gdf_covid_ha = gpd.sjoin(
        gdf_statpop[['RELI', 'B20BTOT', 'NAME', 'EINWOHNERZ', 'geometry']],
        df_covid[['id_demande_study2', 'date_reception', 'week', 'week_str', 'month', 'year', 'period', 'geometry']],
        predicate='intersects',
        how='right'
    ).sort_values('B20BTOT', ascending=False)
    
    # Rows are sorted by descending B20BTOT, so keep='first' retains, for a test matched
    # to several hectares, the match with the largest B20BTOT
    gdf_covid_ha = gdf_covid_ha.drop_duplicates(subset=['id_demande_study2'], keep='first').drop('index_left', axis=1)
    # NOTE(review): utils.apply_min_dist is defined outside this notebook; presumably it
    # returns the RELI of the nearest hectare for tests that matched none — confirm in utils
    gdf_covid_ha['RELI'] = gdf_covid_ha.apply(lambda row: utils.apply_min_dist(row, gdf_statpop), axis=1)
    # 'p3' -> 3 (second character of the label); tests without a period become 0
    gdf_covid_ha['period'] = gdf_covid_ha['period'].str[1].fillna('0').astype(int)
    
    # Refresh the hectare attributes from the (possibly reassigned) RELI
    columns_to_map = ['B20BTOT', 'NAME', 'EINWOHNERZ']
    gdf_covid_ha = map_data(gdf_covid_ha, gdf_statpop, 'RELI', columns_to_map)
    gdf_covid_ha['pop_commune'] = gdf_covid_ha['NAME'].map(pop_commune_ha)
    
    return gdf_covid_ha

# Split the tests by result (labels are upper-cased first so the comparison is case-insensitive)
df_covid['res_cov_txt'] = df_covid['res_cov_txt'].str.upper()
# .copy() makes the subsets independent frames: they are modified in place later on
# (date conversion), which would otherwise raise pandas' SettingWithCopyWarning.
gdf_cases = df_covid[df_covid.res_cov_txt == 'POSITIVE'].copy()
print('Total number of positive tests :', gdf_cases.shape[0])
gdf_controls = df_covid[df_covid.res_cov_txt == 'NEGATIVE'].copy()
print('Total number of negative tests :', gdf_controls.shape[0])

# Attach every test / case / control to its hectare
gdf_covid_ha = process_covid_data(df_covid, gdf_statpop_communes_vd_ha, gdf_communes_vd)
gdf_cases_ha = process_covid_data(gdf_cases, gdf_statpop_communes_vd_ha, gdf_communes_vd)
gdf_controls_ha = process_covid_data(gdf_controls, gdf_statpop_communes_vd_ha, gdf_communes_vd)

# Testing rates per hectare, for each period and for the whole study ('full')
gdf_covid_ha_all_periods = pd.concat([
    gdf_covid_ha,
    gdf_covid_ha.assign(period='full')
])

df_testing_rates = gdf_covid_ha_all_periods.groupby(['RELI', 'period']).agg({
    'id_demande_study2': 'size',
    'B20BTOT': 'first'
}).reset_index()
df_testing_rates.columns = ['RELI', 'period', 'n_tests', 'population']
# Tests per 100 inhabitants of the hectare
df_testing_rates['testing_rate'] = (df_testing_rates['n_tests'] / df_testing_rates['population']) * 100

# Save testing rates next to the other input data
df_testing_rates.to_csv(data_folder / 'testing_rates.csv', index=False)

# Add per-hectare case/control counts (0 where none) and the commune population
# to both the point and the square-polygon hectare layers
_cases_per_hectare = gdf_cases_ha.groupby('RELI').size()
_controls_per_hectare = gdf_controls_ha.groupby('RELI').size()
for gdf in [gdf_statpop_communes_vd_point, gdf_statpop_communes_vd_ha]:
    gdf['n_cases'] = gdf['RELI'].map(_cases_per_hectare).fillna(0)
    gdf['n_controls'] = gdf['RELI'].map(_controls_per_hectare).fillna(0)
    gdf['pop_commune'] = gdf['NAME'].map(pop_commune_ha)

# Commune NAME of every test (left join: tests outside all communes are kept)
gdf_covid_communes = gpd.sjoin(df_covid, gdf_communes_vd[['geometry', 'NAME']], predicate='intersects', how='left')

## MSTDBSCAN results

In [None]:
# Directory holding the precomputed MSTDBSCAN outputs read by the next cells
mstdbscan_folder = data_folder / 'MSTDBSCAN'

### Load clusters of the whole period

In [None]:
# Load the whole-period clusters from a saved pickle file.
# NOTE(review): unpickling executes arbitrary code — only load files from a trusted source.
clusters_wholeperiod = pd.read_pickle(mstdbscan_folder / 'clusters_wholeperiod - 200m.pkl')

### Load clusters of the combined periods

In [None]:
# Clusters of the combined periods; 'mstDate' is parsed to datetime for the date matching below
df_clusters_seq = gpd.read_parquet(mstdbscan_folder/'combined_clusters.parquet')
df_clusters_seq['mstDate'] = pd.to_datetime(df_clusters_seq['mstDate']) 

### Associate cases to the MSTDBSCAN clusters

In [None]:
# Inner spatial join: one row per (cluster geometry, positive test) pair that intersects
df_clusters_seq_wcases = gpd.sjoin(df_clusters_seq, gdf_cases[['date_reception', 'geometry']], predicate='intersects')

### Calculate the number of cases by cluster persistence

In [None]:
# Keep only the cluster/case pairs whose reception day equals the cluster's MST date.
# 'date_reception' was stored as a 'YYYY-MM-DD' string during date processing whereas
# 'mstDate' is datetime64: comparing them directly never matches, so parse it first
# (same approach as the whole-period procedure further down).
_reception_day = pd.to_datetime(df_clusters_seq_wcases['date_reception'])
cases_in_clusters_byperiod = df_clusters_seq_wcases[
    _reception_day == df_clusters_seq_wcases['mstDate']
].copy()  # independent frame: the column added next must not target a slice
cases_in_clusters_byperiod['Number of positive tests'] = 1

In [None]:
# Group by clusterID and sum the per-row indicator to get the positive tests per cluster
grouped = cases_in_clusters_byperiod.groupby(['clusterID'])
n_cases_by_cluster_by_period = grouped['Number of positive tests'].sum()

In [None]:
# Persistence of one cluster, measured on its 'mstTime' values
def date_diff(group):
    """Return the inclusive span of `group['mstTime']`: (max - min) + 1."""
    times = group['mstTime']
    latest = times.max()
    earliest = times.min()
    return (latest - earliest) + 1

# Persistence of every cluster: apply date_diff to each clusterID group.
# include_groups=False keeps the grouping column out of the frame handed to date_diff
# (it only reads 'mstTime'), avoiding the pandas deprecation warning and matching the
# whole-period computation below.
duration_by_cluster_by_period = grouped.apply(date_diff, include_groups=False)

In [None]:
# Turn both per-cluster series into DataFrames with 'clusterID' as a regular column,
# so they can be merged on it
duration_by_cluster_by_period = pd.DataFrame(duration_by_cluster_by_period, columns=['Cluster persistence (days)']).reset_index()
n_cases_by_cluster_by_period = pd.DataFrame(n_cases_by_cluster_by_period).reset_index()

In [None]:
# Inner merge on clusterID: one row per cluster with its case count and its persistence
df_combined_by_period = pd.merge(n_cases_by_cluster_by_period, duration_by_cluster_by_period, on=['clusterID'])

### Same procedure but for MSTDBSCAN resulting from the analysis on the whole period

In [None]:
# Pair every whole-period cluster geometry with the positive tests it intersects
clusters_cases_join = gpd.sjoin(clusters_wholeperiod, gdf_cases[['date_reception', 'geometry']], predicate='intersects')

# Bring both date columns to datetime so they can be compared
clusters_cases_join['mstDate'] = pd.to_datetime(clusters_cases_join['mstDate'])
clusters_cases_join['date_reception'] = pd.to_datetime(clusters_cases_join['date_reception'])

# Keep the pairs whose reception date equals the MST date.
# .copy() makes the result an independent frame, so adding the indicator column below
# does not write into a slice (SettingWithCopyWarning).
_same_day = clusters_cases_join['date_reception'] == clusters_cases_join['mstDate']
cases_in_clusters = clusters_cases_join[_same_day].copy()
cases_in_clusters['Number of positive tests'] = 1

In [None]:
# Positive tests per whole-period cluster
n_cases_by_cluster = (
    cases_in_clusters
    .groupby(['clusterID'])['Number of positive tests']
    .sum()
)

# Persistence per whole-period cluster (inclusive 'mstTime' span), as a two-column frame
grouped_clusters = clusters_wholeperiod.groupby(['clusterID'])
duration_by_cluster = pd.DataFrame(
    grouped_clusters.apply(date_diff, include_groups=False),
    columns=['Cluster persistence (days)'],
).reset_index()


In [None]:
# Left merge keeps every cluster; clusters without matched positive tests get 0
df_combined = pd.merge(duration_by_cluster, n_cases_by_cluster, on = ['clusterID'], how = 'left').fillna(0)

## Prepare data for epidemic curves

In [None]:
# Parse the reception day to datetime so the frames can be resampled on a DatetimeIndex
gdf_cases['date_reception'] = pd.to_datetime(gdf_cases['date_reception'])
gdf_controls['date_reception'] = pd.to_datetime(gdf_controls['date_reception'])


def _weekly_test_counts(tests, label):
    """Count the rows of `tests` per weekly ('W') bin; return a one-column frame named `label`."""
    per_week = tests.set_index('date_reception').resample('W').size()
    return per_week.rename(label).to_frame()


# Weekly number of positive and negative tests
cases_by_week = _weekly_test_counts(gdf_cases, 'Positive tests')
controls_by_week = _weekly_test_counts(gdf_controls, 'Negative tests')

In [None]:
# Align weekly positives on the weekly negatives (left join on the week index);
# weeks without positive tests get 0
weekly_cases_controls = controls_by_week.join(cases_by_week, how='left').fillna(0)

# Total tests and share of positive tests (percent, one decimal)
_positives = weekly_cases_controls['Positive tests']
_negatives = weekly_cases_controls['Negative tests']
weekly_cases_controls['Total tests'] = _positives + _negatives
weekly_cases_controls['Positive rate'] = (
    weekly_cases_controls['Positive tests'] / weekly_cases_controls['Total tests'] * 100
).round(1)
weekly_cases_controls.index = pd.to_datetime(weekly_cases_controls.index)

## Graphical abstract

In [None]:
# One geometry per cluster: the row with the highest 'mstTime' of each clusterID
_last_cluster_shapes = (
    clusters_wholeperiod
    .sort_values(['clusterID', 'mstTime'])
    .drop_duplicates(subset='clusterID', keep='last')
)

# Clusters in red on top (zorder 3), canton outline, then all tests as faint black dots
ax = _last_cluster_shapes.plot(color='red', alpha=0.2, figsize=(10, 10), zorder=3)
gdf_canton_vd.geometry.boundary.plot(ax=ax, linewidth=0.9, edgecolor='darkgrey')
ax.set_axis_off()
df_covid.plot(markersize=0.5, color='black', alpha=0.1, ax=ax, zorder=1)

# Scale bar: one map unit equals one metre
scalebar = ScaleBar(1, units="m", location="lower right")
ax.add_artist(scalebar)

plt.savefig('../figures/Graphical abstract - Part A.png', dpi=300, bbox_inches='tight')

## Figure 2

In [None]:
fig, ax = plt.subplots(figsize=(12, 6))

# Weekly positive and total test counts
weekly_cases_controls[['Positive tests', 'Total tests']].plot(legend=True, ax=ax)

# Vertical marker and label for each period: (marker date, label x-position, label).
# Periods 2-5 reuse the boundaries from the period definition so the markers cannot drift
# from the dates used to label the tests (the third marker used to sit at 2020-12-15
# although p3_start is 2020-12-16). The first marker is deliberately 2020-03-02,
# not p1_start.
periods = [('2020-03-02', '2020-03-10', 'First period'),
           (p2_start, '2020-07-18', 'Second period'),
           (p3_start, '2020-12-31', 'Third period'),
           (p4_start, '2021-06-12', 'Fourth period'),
           (p5_start, '2021-12-15', 'Fifth period')]

for start, label_pos, label in periods:
    ax.axvline(pd.to_datetime(start), color='black', linestyle=':', lw=2)
    # Labels sit at y = 9500, just below the fixed upper axis limit set below
    ax.text(pd.to_datetime(label_pos), 9500, label, color='#636363', fontsize=15)

# Axis labels
ax.set_ylabel('Weekly count', labelpad=10, fontsize=14)
ax.set_xlabel('Weeks', labelpad=10, fontsize=14)

# One minor x-tick per week, with vertical grid lines on minor and major ticks
xtick = pd.date_range(start=weekly_cases_controls.index.min(), end=weekly_cases_controls.index.max(), freq='W')
ax.set_xticks(xtick, minor=True)
ax.grid(True, which='minor', axis='x')
ax.grid(True, which='major', axis='x')
ax.set_ylim([0, 10000])

# Save the figure
plt.savefig('../figures/Figure 2.png', dpi=300, bbox_inches='tight')
plt.show()


## Figure 3

In [None]:
# Bounding box around Lausanne and its neighbouring municipalities
_lausanne_area_names = [
    'Lausanne', 'Renens (VD)', 'Prilly', 'Ecublens (VD)',
    'Pully', 'Le Mont-sur-Lausanne', 'Paudex', 'Lutry',
]
_lausanne_area = gdf_communes[gdf_communes.NAME.isin(_lausanne_area_names)]
xmin, ymin, xmax, ymax = _lausanne_area.total_bounds

# Hectare points falling inside that bounding box
gdf_statpop_communes_laus_point = gdf_statpop_communes_vd_point.cx[xmin:xmax, ymin:ymax]

In [None]:
# df_covid_communes_p1 = df_covid_communes[df_covid_communes.period == 'p1']
# df_covid_communes_p2 = df_covid_communes[df_covid_communes.period == 'p2']
# df_covid_communes_p3 = df_covid_communes[df_covid_communes.period == 'p3']
# df_covid_communes_p4 = df_covid_communes[df_covid_communes.period == 'p4']
# df_covid_communes_p5 = df_covid_communes[df_covid_communes.period == 'p5']

In [None]:
# Two identical hectare tables (one row per distinct RELI/NAME/pop_commune/B20BTOT
# combination) that receive the daily case and control counts respectively
_hectare_columns = ['RELI', 'NAME', 'pop_commune', 'B20BTOT']
df_lausanne_cases = gdf_statpop_communes_laus_point[_hectare_columns].drop_duplicates()
df_lausanne_controls = df_lausanne_cases.copy()

In [None]:
# Add one column per day (keyed by the datetime.date) holding the number of
# cases / controls recorded in each hectare on that day
start_date = date(2020, 3, 2)
end_date = date(2022, 4, 15)
delta = timedelta(days=1)

# Iterate with a separate variable so start_date keeps its value and the cell can be re-run
n_days = (end_date - start_date).days + 1
for current_day in (start_date + offset * delta for offset in range(n_days)):
    day_label = str(current_day)  # 'YYYY-MM-DD', the format stored in date_reception
    _df_cases = gdf_cases_ha[gdf_cases_ha.date_reception == day_label]
    # Controls come from the negative tests (this previously re-used gdf_cases_ha,
    # which made the control counts a copy of the case counts)
    _df_controls = gdf_controls_ha[gdf_controls_ha.date_reception == day_label]
    n_cases_ha = _df_cases.groupby('RELI').size()
    n_controls_ha = _df_controls.groupby('RELI').size()
    # Hectares without any test that day are absent from the counts and become 0;
    # this also covers days without any test at all (empty counts)
    df_lausanne_cases[current_day] = df_lausanne_cases['RELI'].map(n_cases_ha).fillna(0)
    df_lausanne_controls[current_day] = df_lausanne_controls['RELI'].map(n_controls_ha).fillna(0)

In [None]:
# Wide -> long: keep the day columns (labels containing 2020/2021/2022) and stack them
# into one row per (RELI, day). The same statement is repeated at the top of the next cell.
df_lausanne_cases_stacked = (
    df_lausanne_cases.set_index('RELI').filter(regex='2020|2021|2022', axis=1)
    .stack()
    .reset_index(name='cases')
)

In [None]:
def _stack_daily_counts(wide_counts, value_name):
    """Reshape a per-hectare table with one column per day into long (RELI, day, count) rows."""
    long_counts = (
        wide_counts.set_index('RELI').filter(regex='2020|2021|2022', axis=1)
        .stack()
        .reset_index(name=value_name)
    )
    long_counts.columns = ['RELI', 'day', value_name]
    long_counts['day'] = pd.to_datetime(long_counts['day']).dt.strftime('%Y-%m-%d')
    long_counts[value_name] = long_counts[value_name].astype(int)
    return long_counts


# Long-format daily cases and controls
df_lausanne_cases_stacked = _stack_daily_counts(df_lausanne_cases, 'cases')
df_lausanne_controls_stacked = _stack_daily_counts(df_lausanne_controls, 'controls')

# Longitude / latitude (EPSG:4326) of every hectare point, keyed by RELI
geometry_crs = gdf_statpop_communes_vd_point.to_crs(epsg=4326)
_points_by_reli = geometry_crs.set_index('RELI')['geometry']
reli_lon = _points_by_reli.x.to_dict()
reli_lat = _points_by_reli.y.to_dict()

# Attach the coordinates to both long tables
for _long_counts in (df_lausanne_cases_stacked, df_lausanne_controls_stacked):
    _long_counts['lon'] = _long_counts['RELI'].map(reli_lon)
    _long_counts['lat'] = _long_counts['RELI'].map(reli_lat)

# Outer merge: one row per hectare and day, with both the controls and the cases
df_lausanne_daily = df_lausanne_controls_stacked.merge(
    df_lausanne_cases_stacked,
    on=['RELI', 'day', 'lon', 'lat'],
    how='outer',
)

In [None]:
# Row-wise sum of cases and controls (missing values are skipped by sum)
df_lausanne_daily['total'] = df_lausanne_daily[['cases','controls']].sum(axis = 1)

In [None]:
def _daily_counts_between(first_day, last_day):
    """Rows of df_lausanne_daily whose 'day' lies in [first_day, last_day] (both ends included)."""
    in_window = df_lausanne_daily.day.between(first_day, last_day)
    return df_lausanne_daily[in_window]


# Daily counts split by analysis period
df_lausanne_daily_p1 = _daily_counts_between(p1_start, p2_start)
df_lausanne_daily_p2 = _daily_counts_between(p2_start, p3_start)
df_lausanne_daily_p3 = _daily_counts_between(p3_start, p4_start)
df_lausanne_daily_p4 = _daily_counts_between(p4_start, p5_start)
df_lausanne_daily_p5 = _daily_counts_between(p5_start, p5_end)

In [None]:
# Whole days elapsed between 2020-01-10 and each test, stored as a categorical
_reception_day = pd.to_datetime(gdf_covid_communes['date_reception'])
_days_elapsed = (_reception_day - pd.to_datetime('2020-01-10')).dt.days
gdf_covid_communes['time_since_start_epidemic'] = pd.Categorical(_days_elapsed)

In [None]:
# Positive tests only, with their commune NAME
gdf_covid_communes_positive = gdf_covid_communes[gdf_covid_communes.res_cov_txt == 'POSITIVE']

In [None]:
# Share of each commune's positive tests recorded on each day since the start of the epidemic
_cases_per_commune_day = gdf_covid_communes_positive.groupby(['NAME', 'time_since_start_epidemic']).size()
_cases_per_commune = gdf_covid_communes_positive.groupby(['NAME']).size()
df_peakedness_commune = pd.DataFrame(
    _cases_per_commune_day.div(_cases_per_commune, axis=0),
    columns=['frac_cases'],
).reset_index()

# Back from categorical to plain integers
df_peakedness_commune['time_since_start_epidemic'] = (
    df_peakedness_commune['time_since_start_epidemic'].astype(int)
)

In [None]:
# Same daily shares, computed within each (commune, period) instead of each commune
_cases_per_commune_period_day = (
    gdf_covid_communes_positive.groupby(['NAME', 'period', 'time_since_start_epidemic']).size()
)
_cases_per_commune_period = gdf_covid_communes_positive.groupby(['NAME', 'period']).size()
df_peakedness_commune_period = pd.DataFrame(
    _cases_per_commune_period_day.div(_cases_per_commune_period, axis=0),
    columns=['frac_cases'],
).reset_index()

# Back from categorical to plain integers
df_peakedness_commune_period['time_since_start_epidemic'] = (
    df_peakedness_commune_period['time_since_start_epidemic'].astype(int)
)

In [None]:
def plot_epidemic_density(df_peakedness_commune, city_name, x_text, y_text):
    """Plot the LOWESS-smoothed daily share of cases of one commune and save it as a PNG.

    Parameters
    ----------
    df_peakedness_commune : frame with 'NAME', 'time_since_start_epidemic', 'frac_cases'.
    city_name : commune to plot; also written on the figure and used in the file name.
    x_text : x-position of the commune name label.
    y_text : accepted for call compatibility; not used by the body.
    """
    commune_rows = df_peakedness_commune[df_peakedness_commune.NAME == city_name]
    days = commune_rows['time_since_start_epidemic']
    shares = commune_rows['frac_cases']

    fig, ax = plt.subplots(figsize=(8, 4))

    # Smoothed curve (columns of the LOWESS result: sorted x, fitted y)
    sm_x, sm_y = sm_lowess(shares, days, frac=0.05, it=15, return_sorted=True).T
    plt.plot(sm_x, sm_y, color='#2b8cbe', linewidth=2)

    # Period markers, in days since the start of the epidemic; all labels share one height
    ax.axvline(52, color='black', linestyle=':', lw=2)
    label_height = max(sm_y)*1.2
    period_markers = [
        (172, 55, 'First period'),
        (341, 180, 'Second period'),
        (483, 350, 'Third period'),
        (688, 520, 'Fourth period'),
        (827, 700, 'Fifth period'),
    ]
    for boundary_day, label_x, label in period_markers:
        ax.axvline(boundary_day, color='black', linestyle=':', lw=2)
        ax.text(label_x, label_height, label, color='#636363', fontsize=10)

    ax.set_xlabel('Time since the start of the epidemic (days)')
    ax.set_ylabel('Density')
    ax.set_ylim(0, max(sm_y)*1.5)
    # Commune name, placed slightly above the upper axis limit
    ax.text(x_text, max(sm_y)*1.6, city_name, fontsize=12, color='black', ha='center', va='center')

    plt.savefig(f'../figures/Figure 3A - Epidemic_density_{city_name}.png', dpi = 300, bbox_inches = 'tight')

In [None]:
# One density figure per highlighted commune: (name, label x-position, y_text)
_density_panels = [
    ('Nyon', 750, 0.014),
    ('Yverdon-les-Bains', 720, 0.012),
    ('Montreux', 80, 0.02),
    ('Lausanne', 80, 0.006),
]
for _commune_name, _label_x, _label_y in _density_panels:
    plot_epidemic_density(df_peakedness_commune, _commune_name, _label_x, _label_y)

### Inverse Shannon entropy

In [None]:
# Inverse Shannon entropy (base 10) of the daily case shares, per commune
dict_entropy = {
    commune_name: entropy(commune_rows['frac_cases'], base=10)**-1
    for commune_name, commune_rows in df_peakedness_commune.groupby('NAME', sort=False)
}

In [None]:
# Number of distinct id_demande_study2 values among positive tests, per commune NAME
dict_cases_commune = gdf_covid_communes_positive.groupby('NAME').id_demande_study2.nunique().to_dict()

In [None]:
# Attach the inverse entropy to the commune polygons
gdf_communes_vd['inv_shannon_entropy'] = gdf_communes_vd['NAME'].map(dict_entropy)
# Infinite values (from an entropy of 0) are treated as missing, across the whole frame
gdf_communes_vd.replace([np.inf, -np.inf], np.nan, inplace=True)
# Total attack rate: positive tests divided by EINWOHNERZ
gdf_communes_vd['n_cases'] = gdf_communes_vd['NAME'].map(dict_cases_commune)
gdf_communes_vd['total_attack_rate'] = gdf_communes_vd['n_cases'].div(gdf_communes_vd['EINWOHNERZ'])

In [None]:
# Attack rate against inverse entropy with a LOWESS trend, both axes on a log scale
g = sns.regplot(
    data=gdf_communes_vd,
    x='inv_shannon_entropy',
    y='total_attack_rate',
    lowess=True,
)
g.set(xscale="log", yscale="log")
g.set_xlabel('Inverse Shannon entropy (log10)', size=12)
g.set_ylabel('Final total attack rate (log10)', size=12)

In [None]:
# Positive tests per commune and period
df_attack_rate_commune_period = (
    gdf_covid_communes_positive
    .groupby(['NAME', 'period'])
    .size()
    .reset_index(name='n_cases')
)

# Add EINWOHNERZ per commune and derive the attack rate of each period
df_attack_rate_commune_period = df_attack_rate_commune_period.merge(
    gdf_communes_vd[['NAME', 'EINWOHNERZ']], on='NAME', how='left'
)
df_attack_rate_commune_period['total_attack_rate'] = (
    df_attack_rate_commune_period['n_cases'].div(df_attack_rate_commune_period['EINWOHNERZ'])
)

In [None]:
def calculate_entropy_by_period(group):
    """Return the inverse of the base-10 Shannon entropy of `group['frac_cases']`."""
    shares = group['frac_cases']
    shannon = entropy(shares, base=10)
    return shannon**-1

# Group the daily shares by commune and period
grouped = df_peakedness_commune_period.groupby(['NAME', 'period'])

# Inverse entropy per (commune, period). include_groups=False keeps the grouping columns
# out of the frame passed to the function (it only reads 'frac_cases'), which avoids the
# pandas deprecation warning and matches the other groupby.apply calls in this notebook.
df_entropy_by_period = grouped.apply(calculate_entropy_by_period, include_groups=False).reset_index()
df_entropy_by_period.columns = ['NAME', 'period', 'inverse_shannon_entropy']
# Infinite values (from an entropy of 0) are treated as missing
df_entropy_by_period.replace([np.inf, -np.inf], np.nan, inplace=True)

In [None]:
# One row per (commune, period): inverse entropy plus case count and attack rate
df_entropy_and_attack_rate = pd.merge(df_entropy_by_period, df_attack_rate_commune_period[['NAME','period','n_cases','total_attack_rate']], on=['NAME','period'], how = 'left')

In [None]:
# Log10 versions of both measures, and the inverse entropy min-max scaled
# over all (commune, period) rows together
scaler = MinMaxScaler()
df_entropy_and_attack_rate['log_shannon'] =  np.log10(df_entropy_and_attack_rate['inverse_shannon_entropy'])
df_entropy_and_attack_rate['log_attack_rate'] =  np.log10(df_entropy_and_attack_rate['total_attack_rate'])
df_entropy_and_attack_rate[['inv_shannon_entropy_scaled']] = scaler.fit_transform(df_entropy_and_attack_rate[['inverse_shannon_entropy']])

In [None]:
# Mean and standard deviation of the scaled inverse entropy per period (periods as columns)
df_entropy_and_attack_rate.groupby('period').inv_shannon_entropy_scaled.agg(['mean','std']).round(2).T

In [None]:
# Correlation between scaled inverse entropy and log attack rate, within each period.
# The per-period subset gets a real name: assigning to `_` would overwrite IPython's
# "last output" variable.
for p in df_entropy_and_attack_rate.period.unique():
    period_rows = df_entropy_and_attack_rate[df_entropy_and_attack_rate.period == p]
    print(p, period_rows[['inv_shannon_entropy_scaled','log_attack_rate']].corr())

In [None]:
# Summary table of the scaled inverse entropy by period; the variable is declared
# non-normal and p-values are Bonferroni-adjusted
columns = ['inv_shannon_entropy_scaled']
mytable = TableOne(
    df_entropy_and_attack_rate,
    columns,
    groupby='period',
    nonnormal=columns,
    pval=True,
    pval_adjust='bonferroni',
    htest_name=True,
    normal_test=True,
    missing=True,
    display_all=True,
    decimals=2,
    label_suffix=True,
)
mytable

In [None]:
# One LOWESS trend per period; legend=False because a custom, lines-only legend is added below
g = sns.lmplot(
    data=df_entropy_and_attack_rate,
    lowess=True,
    hue='period',
    x='log_shannon',
    y='log_attack_rate',
    scatter_kws={'alpha': 0.1},
    legend=False,
)

# One label per hue level, in order of first appearance (the order seaborn uses for
# non-categorical string hues). Labels were previously built from every row of the frame,
# so they matched the lines only if the first rows happened to list each period once.
period_order = [level for level in df_entropy_and_attack_rate['period'].unique() if pd.notnull(level)]
line_labels = [str(level).replace('p', 'P') for level in period_order]
lines = g.axes[0][0].get_lines()
legend = plt.legend(handles=lines, labels=line_labels, loc='upper right', title='Period')

g.set_axis_labels("Inverse Shannon entropy (log10)", "Total attack rate (log10)")
plt.savefig('../figures/Figure 3B.png', dpi=300, bbox_inches='tight')

### Lloyd's mean crowding

In [None]:
# Communes labelled on the maps
highlight_communes = ['Lausanne', 'Nyon', 'Montreux', 'Yverdon-les-Bains']

# Their rows (population size must be present), copied so the commune layer stays untouched
_is_highlighted = gdf_communes_vd['NAME'].isin(highlight_communes)
_has_population = gdf_communes_vd['EINWOHNERZ'].notnull()
communes_four_main = gdf_communes_vd[_is_highlighted & _has_population].copy()

# Replace each polygon by its centroid and expose the coordinates as columns
communes_four_main['geometry'] = communes_four_main['geometry'].centroid
_centroid_x = communes_four_main.geometry.x
_centroid_y = communes_four_main.geometry.y
communes_four_main['x'] = _centroid_x
communes_four_main['y'] = _centroid_y

# Hand-placed label anchor for Lausanne
_is_lausanne = communes_four_main['NAME'] == 'Lausanne'
communes_four_main.loc[_is_lausanne, ['x', 'y']] = [2539101, 1154000]

# Rebuild the point geometry from the (adjusted) coordinates
communes_four_main['geometry'] = gpd.points_from_xy(communes_four_main['x'], communes_four_main['y'])


In [None]:
# Commune population on every hectare row
gdf_statpop_communes_vd_ha['pop_commune'] = gdf_statpop_communes_vd_ha['NAME'].map(pop_commune_ha)

# One row per hectare with its commune, population and case/control counts
df_lloyd = gdf_statpop_communes_vd_ha[['RELI', 'NAME', 'pop_commune', 'n_cases', 'n_controls', 'B20BTOT', 'geometry']].drop_duplicates()

# Cases and controls summed per commune
n_cases_commune = df_lloyd.groupby('NAME')['n_cases'].sum()
n_controls_commune = df_lloyd.groupby('NAME')['n_controls'].sum()

# Cases per hectare and per commune.
# The per-hectare count is the 'n_cases' column computed over the whole study. This used
# to map `n_cases_ha`, a variable left over from the daily loop that only holds the
# counts of the loop's last day.
df_lloyd['n_cases_ha'] = df_lloyd['n_cases']
df_lloyd['n_cases_commune'] = df_lloyd['NAME'].map(n_cases_commune).fillna(0)

# Share of the commune's population and of the commune's cases located in each hectare
df_lloyd['relative_pop'] = df_lloyd['B20BTOT'] / df_lloyd['pop_commune']
df_lloyd['relative_cases'] = df_lloyd['n_cases_ha'] / df_lloyd['n_cases_commune']


In [None]:
# Crowding of one commune, computed from the B20BTOT values of its hectares
def calculate_crowding(group):
    """Return sum(x * (x - 1)) / sum(x) over `group['B20BTOT']`, or 0 when the sum is 0."""
    counts = group['B20BTOT']
    total = counts.sum()
    if total == 0:
        return 0
    return (counts * (counts - 1)).sum() / total

# Crowding per commune. include_groups=False keeps the grouping column ('NAME') out of
# the frame passed to calculate_crowding, which only reads 'B20BTOT'; passing True is
# deprecated in pandas and contradicted the intent stated here.
dict_lloyd = df_lloyd.groupby('NAME', as_index=True).apply(calculate_crowding, include_groups=False).to_dict()

In [None]:
# Crowding value of each commune
gdf_communes_vd['lloyd_mean_crowding'] = gdf_communes_vd['NAME'].map(dict_lloyd)

# log10 of the crowding; zeros are masked before the log and every undefined result is set to 0
_crowding_without_zeros = gdf_communes_vd['lloyd_mean_crowding'].replace(0, np.nan)
gdf_communes_vd['log_lloyd_mean_crowding'] = np.log10(_crowding_without_zeros).fillna(0)

# log10 of the inverse entropy, floored at 1e-10 so the log is always defined
_entropy_floored = gdf_communes_vd['inv_shannon_entropy'].clip(lower=1e-10)
gdf_communes_vd['log_inv_shannon_entropy'] = np.log10(_entropy_floored)

# Inverse entropy rescaled with a freshly fitted MinMaxScaler
scaler = MinMaxScaler()
gdf_communes_vd['inv_shannon_entropy_scaled'] = scaler.fit_transform(gdf_communes_vd[['inv_shannon_entropy']])

In [None]:
fig, ax = plt.subplots(figsize=(10, 10))

# Choropleth of the crowding value; communes without a value are drawn in light grey
gdf_communes_vd.plot(
    'lloyd_mean_crowding',
    ax=ax,
    cmap='coolwarm',
    legend=True,
    legend_kwds={'label': 'Crowding', 'shrink': 0.5},
    missing_kwds={'color': 'lightgrey'},
)

# Black markers and name labels for the highlighted communes
communes_four_main.geometry.plot(markersize=3, color='black', ax=ax)
for _, commune in communes_four_main.iterrows():
    anchor = commune.geometry.centroid.coords[0]
    ax.annotate(text=commune['NAME'], xy=anchor, ha='left', size=8)

# Hide the axes frame
ax.set_axis_off()

plt.savefig('../figures/Figure 3C.png', dpi=300, bbox_inches='tight')


## Figure S3

In [None]:
# Colour map shared by the four panels
cmap = 'coolwarm'

# Common colour scale: B20BTOT range over the hectares of the highlighted communes
_highlight_hectares = gdf_statpop_communes_vd_ha[gdf_statpop_communes_vd_ha.NAME.isin(highlight_communes)]
norm = plt.Normalize(
    vmin=_highlight_hectares['B20BTOT'].min(),
    vmax=_highlight_hectares['B20BTOT'].max(),
)

# 2 x 2 grid, flattened so it can be paired with the commune list
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 15))
axes = axes.flatten()

# One panel per highlighted commune
for ax, commune in zip(axes, highlight_communes):
    # Hectares of this commune, coloured by B20BTOT
    df_commune = gdf_statpop_communes_vd_ha[gdf_statpop_communes_vd_ha.NAME == commune]
    df_commune.plot(column='B20BTOT', cmap=cmap, norm=norm, missing_kwds={'color': 'lightgrey'}, ax=ax)

    # Commune outline on top
    _outline = gdf_communes_vd[gdf_communes_vd.NAME == commune].geometry.boundary
    _outline.plot(ax=ax, linewidth=0.6, color='black')

    # Panel title, no axes frame
    ax.set_title(f'{commune}')
    ax.set_axis_off()

# Single horizontal colour bar for all panels, in a dedicated axis at figure
# coordinates [0.3, 0.95, 0.3, 0.02] (left, bottom, width, height)
fig.subplots_adjust(bottom=0.1, top=0.9, left=0.1, right=0.8, wspace=0.2, hspace=0.2)
cbar_ax = fig.add_axes([0.3, 0.95, 0.3, 0.02])
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
sm.set_array([])  # the mappable carries no data of its own, only cmap and norm
cbar = fig.colorbar(sm, cax=cbar_ax, orientation='horizontal')
cbar.set_label('Population')

plt.savefig('../figures/Figure S3.png', dpi=300, bbox_inches='tight')

## Figure 4

In [None]:
# Human-readable column names for the figure legends
_display_names = {
    'log_inv_shannon_entropy': 'Inverse Shannon entropy (log10)',
    'EINWOHNERZ': 'Population size',
    'inv_shannon_entropy_scaled': 'Peakedness',
}
gdf_communes_vd = gdf_communes_vd.rename(columns=_display_names)

# Polygon area in km2 (geometry area divided by 1e6)
gdf_communes_vd['area_sq_km'] = gdf_communes_vd['geometry'].area / 1e6

# Area summed over all rows sharing a NAME, mapped back to each row
total_area_by_commune = gdf_communes_vd.groupby('NAME')['area_sq_km'].sum()
gdf_communes_vd['total_area'] = gdf_communes_vd['NAME'].map(total_area_by_commune)

# Inhabitants per km2, plus a short alias of the population column
_population = gdf_communes_vd['Population size']
gdf_communes_vd['Pop. density (hab/km2)'] = _population / gdf_communes_vd['total_area']
gdf_communes_vd['Pop. (hab)'] = gdf_communes_vd['Population size']


In [None]:
fig, ax = plt.subplots(figsize=(12, 12))

# Inverse entropy against crowding (both log10); colour = peakedness, marker size = density
g = sns.scatterplot(
    data=gdf_communes_vd,
    x='log_lloyd_mean_crowding',
    y='Inverse Shannon entropy (log10)',
    hue='Peakedness',
    size='Pop. density (hab/km2)',
    sizes=(10, 200),
    palette='RdYlGn_r',
)
g.grid(True)
g.set_ylabel('Inverse Shannon entropy (log10)', size=12)
g.set_xlabel("Lloyd's mean crowding (log10)", size=12)
plt.savefig('../figures/Figure 4A.png', dpi=300, bbox_inches='tight')

In [None]:
# Attach the commune polygons so the per-period values can be mapped.
# Any 'geometry' column left by a previous run of this cell is dropped first; without
# this, re-running the cell produces 'geometry_x'/'geometry_y' and set_geometry fails.
df_entropy_and_attack_rate = pd.merge(
    df_entropy_and_attack_rate.drop(columns='geometry', errors='ignore'),
    gdf_communes_vd[['NAME', 'geometry']],
    how='left',
    on='NAME',
)
df_entropy_and_attack_rate = df_entropy_and_attack_rate.set_geometry('geometry')

In [None]:
def plot_period_entropy(df, period):
    """Draw and save the 'epidemic peakedness' map of one period (Figure 4B).

    Parameters
    ----------
    df : geo-enabled frame
        Must provide a ``period`` column, an ``inv_shannon_entropy_scaled`` column
        and a geometry, and support ``.plot(column, ...)``.
    period : value compared against ``df.period``
        Selects the rows to draw; it is also embedded in the output file name.

    Notes
    -----
    Reads the notebook-level ``communes_four_main`` to mark and label reference
    communes. The colour scale is fixed to [0, 1] so maps of different periods are
    comparable; rows with a missing value are drawn in light grey.
    Writes ``../figures/Figure 4B - Period {period}.png`` and returns None.
    """
    fig, ax = plt.subplots(figsize=(8, 8))
    colour_scale = plt.Normalize(vmin=0, vmax=1)

    period_rows = df[df.period == period]
    period_rows.plot(
        'inv_shannon_entropy_scaled',
        cmap='RdYlGn_r',
        norm=colour_scale,
        missing_kwds={'color': 'lightgrey'},
        legend=True,
        legend_kwds={'label': 'Epidemic peakedness', 'shrink': 0.5},
        ax=ax,
    )
    ax.set_axis_off()

    # Reference communes: a black marker plus the commune name next to it.
    communes_four_main.geometry.plot(markersize=2, color='black', ax=ax)
    for _, commune in communes_four_main.iterrows():
        ax.annotate(text=commune.NAME, xy=commune.geometry.centroid.coords[0], ha='left', size=6)

    fig.savefig(f'../figures/Figure 4B - Period {period}.png', dpi=300, bbox_inches='tight')


# Render one Figure 4B panel per study period.
for period_label in ('p1', 'p2', 'p3', 'p4', 'p5'):
    plot_period_entropy(df_entropy_and_attack_rate, period_label)

## Figure 5 - Hoover index

In [None]:
# Point-level table used for the Hoover index: identifier (RELI), commune NAME,
# population count (B20BTOT), cases, controls and geometry.
hoover_columns = ['RELI', 'NAME', 'B20BTOT', 'n_cases', 'n_controls', 'geometry']
df_hoover = gdf_statpop_communes_vd_point[hoover_columns]

# Commune totals of cases, controls and population; the population total is renamed
# so that it gets a self-explanatory name after the merge below.
grouped_df_hoover = (
    df_hoover
    .groupby('NAME')
    .agg({'n_cases': 'sum', 'n_controls': 'sum', 'B20BTOT': 'sum'})
    .rename(columns={'B20BTOT': 'pop_commune'})
)

# Bring the commune totals back onto every point row. Clashing columns coming from the
# totals table receive the '_commune' suffix (n_cases_commune, n_controls_commune).
df_hoover = df_hoover.merge(grouped_df_hoover, on='NAME', suffixes=('', '_commune'))

In [None]:
# Share of the commune total held by each point, for population, cases and controls.
share_definitions = [
    ('relative_pop', 'B20BTOT', 'pop_commune'),
    ('relative_cases', 'n_cases', 'n_cases_commune'),
    ('relative_controls', 'n_controls', 'n_controls_commune'),
]
for share_name, part_col, total_col in share_definitions:
    df_hoover[share_name] = df_hoover[part_col] / df_hoover[total_col]

# Per-point contribution to the Hoover index: |population share - case share|.
share_gap = df_hoover['relative_pop'] - df_hoover['relative_cases']
df_hoover['hoover_index'] = share_gap.abs()

# Commune-level index in percent: half the summed absolute gaps, times 100 (hence 50).
# NOTE(review): a commune whose case total is 0 yields 0/0 = NaN shares; sum() skips NaN,
# so such a commune ends up with an index of 0 rather than a missing value - confirm
# this is intended (sum(min_count=1) would keep it missing).
hoover_index_full_period = (
    df_hoover
    .groupby('NAME')['hoover_index']
    .sum()
    .mul(50)
    .sort_values()
)

# Store the full-period index on the commune polygons (looked up by NAME).
gdf_communes_vd['hoover_index_wholeperiod'] = gdf_communes_vd['NAME'].map(hoover_index_full_period)

In [None]:
# Choropleth of the full-period Hoover index, classified into quantiles.
fig, ax = plt.subplots(figsize=(10, 10))
gdf_communes_vd.plot(
    column='hoover_index_wholeperiod',
    cmap='Reds',
    scheme='quantiles',
    legend=True,
    ax=ax,
)
ax.set_axis_off()
ax.set_title('Hoover index by municipality - Full period')

### Hoover index by week

In [None]:
# Weekly locational Hoover index: for every week, compare how cases and residents are
# spread over the RELI cells of each commune, then summarise the index across communes.
hoover_stats_weekly = []

# Population table per RELI cell; it does not depend on the week, so it is built once
# and copied inside the loop.
hectare_population = gdf_statpop_communes_vd_point[['RELI', 'NAME', 'pop_commune', 'B20BTOT']].drop_duplicates()

for week in gdf_cases_ha.week_str.astype(str).sort_values().unique():
    df_cases = gdf_cases_ha[gdf_cases_ha.week_str == week]
    n_cases_commune = df_cases.groupby('NAME').size()
    n_cases_ha = df_cases.groupby('RELI').size()

    df_hoover = hectare_population.copy()
    # Cells / communes without any case this week are absent from the counts -> 0.
    df_hoover['n_cases_ha'] = df_hoover['RELI'].map(n_cases_ha).fillna(0)
    df_hoover['n_cases_commune'] = df_hoover['NAME'].map(n_cases_commune).fillna(0)

    df_hoover['relative_pop'] = df_hoover['B20BTOT'].div(df_hoover['pop_commune'])
    df_hoover['relative_cases'] = df_hoover['n_cases_ha'].div(df_hoover['n_cases_commune'])
    df_hoover['hoover_index'] = df_hoover['relative_pop'].sub(df_hoover['relative_cases']).abs()

    # Commune index in percent (half the summed absolute gaps, times 100).
    # NOTE(review): communes with no case this week give 0/0 = NaN shares, which sum()
    # skips, so they enter the statistics below with an index of 0 - confirm intended.
    # NOTE(review): this reuses the name of the full-period series and overwrites it.
    hoover_index_full_period = df_hoover.groupby('NAME')['hoover_index'].sum().mul(50)
    gdf_communes_vd['hoover_index'] = gdf_communes_vd['NAME'].map(hoover_index_full_period)

    weekly_index = gdf_communes_vd['hoover_index']
    hoover_stats_weekly.append({
        'Week': week,
        'mean': weekly_index.mean(),
        'std': weekly_index.std(),
        'min': weekly_index.min(),
        'max': weekly_index.max(),
        '_05': weekly_index.quantile(0.05),
        '_95': weekly_index.quantile(0.95),
    })

In [None]:
# Turn the per-week records into a table, parse the week labels as datetimes and give
# the mean column its display name.
hoover_stats_weekly = (
    pd.DataFrame(hoover_stats_weekly)
    .assign(Week=lambda stats: pd.to_datetime(stats['Week']))
    .rename(columns={'mean': 'Locational Hoover index (%)'})
)

In [None]:
# Number of rows of gdf_cases per `week_str` value (one row is presumably one case - confirm).
casebyweek = gdf_cases.groupby('week_str').size()

In [None]:
# Figure 5A: weekly mean locational Hoover index (left axis) against the weekly number
# of cases (right axis), with the start of each study period marked.
fig, ax = plt.subplots(figsize=(12, 6))
hoover_stats_weekly.set_index('Week')['Locational Hoover index (%)'].plot(grid=True, legend=True, ax=ax)

ax2 = ax.twinx()
casebyweek.plot(color='Red', legend=True, ax=ax2, label='Number of cases')

ax.set_xlabel('Weeks', labelpad=10, fontsize=14)
ax.set_ylabel('Locational Hoover index (%)', labelpad=10, fontsize=14)
ax2.set_ylabel("Number of cases", labelpad=10, fontsize=14)

# Rotate the tick labels of the primary axes explicitly. plt.xticks() acts on pyplot's
# current axes, and twinx() has just created a second axes, so it is not a reliable
# way to reach `ax`.
ax.tick_params(axis='x', labelrotation=45)

ax.legend(loc='upper left', fontsize=12)
ax2.legend(loc='upper right', fontsize=12)

# (date of the dotted boundary line, date where the label starts, label text)
period_markers = [
    ('2020-03-02', '2020-03-20', 'First period'),
    ('2020-06-30', '2020-07-25', 'Second period'),
    ('2020-12-15', '2021-01-08', 'Third period'),
    ('2021-05-07', '2021-06-25', 'Fourth period'),
    ('2021-11-28', '2021-12-15', 'Fifth period'),
]
label_height = 80  # in Hoover-index units (left axis, which spans 0-100)
for boundary_date, label_date, label_text in period_markers:
    ax.axvline(pd.to_datetime(boundary_date), color='black', linestyle=':', lw=2)
    ax.text(pd.to_datetime(label_date), label_height, label_text, color='#636363', fontsize=15)

# ax.axvline(pd.to_datetime('2020-03-16'), color='black', linestyle=':', lw=2, label='Lockdown period (March 16 - April 27 2020)')
# ax.axvline(pd.to_datetime('2020-04-27'), color='black', linestyle=':', lw=2)
# ax.axvline(pd.to_datetime('2020-11-06'), color='black', linestyle=':', lw=2, label='Second wave (November 6 - April 18 2021)')
# ax.axvline(pd.to_datetime('2021-04-18'), color='black', linestyle=':', lw=2)
ax.set_ylim([0, 100])
ax2.set_ylim([0, 5000])

# One minor tick per week; vertical gridlines are drawn on those weekly ticks only.
xtick = pd.date_range(start=hoover_stats_weekly.Week.min(), end=hoover_stats_weekly.Week.max(), freq='W')
ax.set_xticks(xtick, minor=True)
# Axes.grid expects a boolean: the strings 'on' / 'off' are both truthy, so the previous
# ax.grid('off', ...) call did not hide the major gridlines.
ax.grid(True, which='minor', axis='x')
ax.grid(False, which='major', axis='x')

# Explicit extension, consistent with the other figures (PNG is also matplotlib's default).
plt.savefig('../figures/Figure 5A.png', dpi=300, bbox_inches='tight')

### Hoover index by period

In [None]:
# Locational Hoover index per study period: summary statistics across communes plus one
# map per period (Figure 5B).
hoover_stats = []

# Population table per RELI cell; identical for every period, so it is built once and
# copied inside the loop.
hectare_population = gdf_statpop_communes_vd_point[['RELI', 'NAME', 'pop_commune', 'B20BTOT']].drop_duplicates()

for p in sorted(gdf_cases_ha.period.unique()):
    print(p)  # progress indicator
    _cases = gdf_cases_ha[gdf_cases_ha.period == p]

    # Case counts of the period per commune and per RELI cell.
    n_cases_commune = _cases.groupby('NAME').size()
    n_cases_ha = _cases.groupby('RELI').size()

    df_hoover = hectare_population.copy()
    # Cells / communes without any case in the period are absent from the counts -> 0.
    df_hoover['n_cases_ha'] = df_hoover['RELI'].map(n_cases_ha).fillna(0)
    df_hoover['n_cases_commune'] = df_hoover['NAME'].map(n_cases_commune).fillna(0)

    # Shares of the commune total, and their absolute gap.
    df_hoover['relative_pop'] = df_hoover['B20BTOT'].div(df_hoover['pop_commune'])
    df_hoover['relative_cases'] = df_hoover['n_cases_ha'].div(df_hoover['n_cases_commune'])
    df_hoover['hoover_index'] = df_hoover['relative_pop'].sub(df_hoover['relative_cases']).abs()

    # Commune index in percent (half the summed absolute gaps, times 100).
    # NOTE(review): communes with no case in the period give 0/0 = NaN shares, which sum()
    # skips, so they get an index of 0 (and are not drawn as 'missing') - confirm intended.
    hoover_index_full_period = df_hoover.groupby('NAME')['hoover_index'].sum().mul(50)
    gdf_communes_vd['hoover_index'] = gdf_communes_vd['NAME'].map(hoover_index_full_period)

    period_index = gdf_communes_vd['hoover_index']
    hoover_stats.append({
        'Period': p,
        'mean': period_index.mean(),
        'std': period_index.std(),
        'min': period_index.min(),
        'max': period_index.max(),
        '_05': period_index.quantile(0.05),
        '_95': period_index.quantile(0.95),
    })

    # Map of the period, on a fixed 0-100 colour scale so that periods are comparable.
    fig, ax = plt.subplots(figsize=(5, 5))
    norm = plt.Normalize(vmin=0, vmax=100)
    gdf_communes_vd.plot(
        'hoover_index',
        linewidth=0.1,
        edgecolor='grey',
        cmap='Blues',
        norm=norm,
        missing_kwds={'color': 'lightgrey'},
        legend=True,
        legend_kwds={'shrink': 0.6, 'label': 'Locational Hoover index (%)'},
        ax=ax,
    )

    # Reference communes: black marker plus name.
    communes_four_main.geometry.plot(markersize=2, color='black', ax=ax)
    for _, commune in communes_four_main.iterrows():
        ax.annotate(text=commune.NAME, xy=commune.geometry.centroid.coords[0], ha='left', size=6)
    ax.set_axis_off()

    filename = f'Period {p}'
    plt.tight_layout()
    plt.savefig(f'../figures/Figure 5B - {filename}.png', dpi=300, bbox_inches='tight')

# One row of summary statistics per period.
stats_hoover_period = pd.DataFrame(hoover_stats)

In [None]:
# Give the 'mean' column its display name; the other statistic columns keep their names.
stats_hoover_period = stats_hoover_period.rename(columns = {'mean':'Locational Hoover index (%)'})

## Figure S2

In [None]:
# Figure S2: 'Positive rate' column of weekly_cases_controls over time.
fig, ax = plt.subplots(figsize=(6, 6))
weekly_cases_controls.plot(y='Positive rate', ax=ax, legend=False)

# One minor tick per week; vertical gridlines are drawn on those weekly ticks only.
xtick = pd.date_range(start=weekly_cases_controls.index.min(), end=weekly_cases_controls.index.max(), freq='W')
ax.set_xticks(xtick, minor=True)
# Axes.grid expects a boolean: the strings 'on' / 'off' are both truthy, so passing
# 'off' does not hide the major gridlines.
ax.grid(True, which='minor', axis='x')
ax.grid(False, which='major', axis='x')

# Rotate the labels that the date formatter already produced. The previous code passed
# the full list of weekly date strings to set_xticklabels(), which relabels the MAJOR
# ticks in order; their positions are not weekly, so the dates shown were wrong.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

ax.set_xlabel('Weeks', labelpad=10, fontsize=14)
ax.set_ylabel('Positive rate (%)', labelpad=10, fontsize=14)

plt.savefig('../figures/Figure S2.png', dpi=300, bbox_inches='tight')


## Figure S4

In [None]:
# Figure S4: cluster persistence against number of positive tests, on log-log axes.
fig, ax = plt.subplots(figsize=(6, 6))
df_combined.plot.scatter(
    x='Number of positive tests',
    y='Cluster persistence (days)',
    logx=True,
    logy=True,
    legend=True,
    alpha=0.5,
    ax=ax,
)
# Empty proxy scatter: it draws nothing and only provides the labelled legend entry.
ax.scatter([], [], label='MST-DBSCAN clusters', s=20)
ax.legend(fontsize=12)
fig.savefig('../figures/Figure S4.png', dpi=300, bbox_inches='tight')

In [None]:
# Pearson correlation between cluster size and cluster persistence, with a confidence
# interval obtained through Fisher's z transformation.
tests_per_cluster = df_combined['Number of positive tests']
persistence_days = df_combined['Cluster persistence (days)']

# Coefficient from pandas, coefficient + p-value from scipy.
# NOTE(review): pandas drops missing pairs whereas scipy does not, and the sample size
# below counts every row - the three only agree if neither column has missing values.
pearson_corr = tests_per_cluster.corr(persistence_days)
r, p_value = scipy.stats.pearsonr(tests_per_cluster, persistence_days)

print("Pearson correlation coefficient:", pearson_corr)
print("Associated p-value:", p_value)

# Two-sided confidence level of the interval.
confidence_level = 0.95
alpha = 1 - confidence_level

# Fisher z of r and its standard error, 1 / sqrt(n - 3).
z = np.arctanh(r)
sample_size = len(df_combined)
standard_error = 1 / np.sqrt(sample_size - 3)

# Normal-quantile half-width of the interval on the z scale.
z_margin = scipy.stats.norm.ppf(1 - alpha/2) * standard_error
z_ci_lower = z - z_margin
z_ci_upper = z + z_margin

# Back-transform the bounds to the correlation scale.
r_ci_lower, r_ci_upper = np.tanh(z_ci_lower), np.tanh(z_ci_upper)

print("Pearson correlation coefficient:", round(r,2))
print("Confidence interval (lower):", round(r_ci_lower,2))
print("Confidence interval (upper):", round(r_ci_upper,2))

## Figure 6

In [None]:
# Smallest and largest mstDate observed for every cluster, one row per clusterID.
grouped_clusters_dates = (
    clusters_wholeperiod
    .groupby('clusterID')['mstDate']
    .agg(['min', 'max'])
    .reset_index()
)
for bound in ('min', 'max'):
    grouped_clusters_dates[bound] = pd.to_datetime(grouped_clusters_dates[bound])

# Duration in days, counting both the first and the last day (hence the + 1).
cluster_span = grouped_clusters_dates['max'] - grouped_clusters_dates['min']
grouped_clusters_dates['duration'] = cluster_span.dt.days + 1


In [None]:
# Parse mstDate once and expose it as both startDate and endDate.
# NOTE(review): both columns receive the same value (the row's mstDate) - confirm that
# endDate is not meant to be something else.
mst_dates = pd.to_datetime(clusters_wholeperiod['mstDate'])
clusters_wholeperiod['startDate'] = mst_dates
clusters_wholeperiod['endDate'] = mst_dates

# Rows belonging to long-lasting clusters (duration of 30 days or more).
long_cluster_ids = grouped_clusters_dates.loc[grouped_clusters_dates['duration'] >= 30, 'clusterID']
# .copy() makes long_clusters an independent frame: it is modified below and in the
# next cell, and assigning into a boolean slice of clusters_wholeperiod triggers
# pandas' SettingWithCopyWarning (the write may or may not reach the parent).
long_clusters = clusters_wholeperiod[clusters_wholeperiod['clusterID'].isin(long_cluster_ids)].copy()

# Simplify the geometries with a tolerance of 10 (expressed in CRS units).
long_clusters['geometry'] = long_clusters['geometry'].simplify(10)

In [None]:
# Collapse the detailed cluster types into the generic ones used for display.
type_aliases = {
    'Directional growth': 'Growth',
    'Directional reduction': 'Reduction',
    'SplitMerge': 'Move',
}
for detailed_type, generic_type in type_aliases.items():
    long_clusters.loc[long_clusters['type'] == detailed_type, 'type'] = generic_type

# Display colour of every cluster type; types missing from the mapping get NaN.
colors_cl = {
    'Emerge': '#fb9a99',
    'Growth': '#e31a1c',
    'Steady': '#33a02c',
    'Merge': '#ffffb3',
    'Move': '#fdb462',
    'Split': '#bebada',
    'Reduction': '#1f78b4',
}
long_clusters['color'] = long_clusters['type'].map(colors_cl)


In [None]:
# Parse the reception dates into pandas datetimes (the column is overwritten in place)
gdf_cases['date_reception'] = pd.to_datetime(gdf_cases['date_reception'])

# Constant colour column. NOTE(review): presumably consumed by a later case map - confirm it is still used
gdf_cases['color'] = 'red'

In [None]:
# Figure 6: lifetime of every cluster, drawn as a horizontal segment at y = cluster ID.
# Order the clusters by ID (the previous comment said 'start_date', but the sort key is
# and was clusterID).
grouped_clusters_dates = grouped_clusters_dates.sort_values(by='clusterID', ascending=True)

fig, ax = plt.subplots(figsize=(12, 10))

# All clusters: thin grey segment from first to last date, with a dot at both ends.
for start_date, end_date, cluster_id in zip(
    grouped_clusters_dates['min'], grouped_clusters_dates['max'], grouped_clusters_dates['clusterID']
):
    ax.plot([start_date, end_date], [cluster_id, cluster_id],
            lw=0.3, markersize=1.5, marker='o', color='grey', linestyle='-')

# Long-lasting clusters (30 days or more) are drawn again on top, thicker and in red.
long_lasting = grouped_clusters_dates[grouped_clusters_dates.duration >= 30]
for start_date, end_date, cluster_id in zip(
    long_lasting['min'], long_lasting['max'], long_lasting['clusterID']
):
    ax.plot([start_date, end_date], [cluster_id, cluster_id],
            lw=0.6, markersize=3, marker='o', color='red', linestyle='-')

# (date of the dotted boundary line, date where the label starts, label text)
period_markers = [
    ('2020-03-02', '2020-03-20', 'First period'),
    ('2020-06-30', '2020-07-25', 'Second period'),
    ('2020-12-15', '2021-01-08', 'Third period'),
    ('2021-05-07', '2021-06-25', 'Fourth period'),
    ('2021-11-28', '2021-12-15', 'Fifth period'),
]
# y position of the labels, in cluster-ID units.
# NOTE(review): hard-coded; presumably chosen to sit near the top of the ID range - confirm.
label_height = 3200
for boundary_date, label_date, label_text in period_markers:
    ax.axvline(pd.to_datetime(boundary_date), color='black', linestyle=':', lw=2)
    ax.text(pd.to_datetime(label_date), label_height, label_text, color='#636363', fontsize=15)

# Axes: weekly minor ticks, month-year labels on the major ticks.
ax.set_ylabel('Cluster ID')
xtick = pd.date_range(start=grouped_clusters_dates['min'].min(), end=grouped_clusters_dates['max'].max(), freq='W')
ax.set_xticks(xtick, minor=True)
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b-%Y'))

plt.xticks(rotation=90)
plt.savefig('../figures/Figure 6.png', dpi=300, bbox_inches='tight')


In [None]:
# Study periods as (label, first day, first day of the following period).
# Half-open intervals [first day, next first day) select exactly the same calendar days
# as the former inclusive day ranges, but leave no gap for timestamps that carry a time
# of day on the last day of a period (e.g. 2020-06-30 14:00 now falls in p1).
period_bounds = [
    ('p1', '2020-03-02', '2020-07-01'),
    ('p2', '2020-07-01', '2020-12-16'),
    ('p3', '2020-12-16', '2021-05-08'),
    ('p4', '2021-05-08', '2021-11-29'),
    ('p5', '2021-11-29', '2022-04-16'),
]


def _period_of(dates):
    """Return the period label of each datetime in `dates` (NaN outside all periods)."""
    labels = pd.Series(np.nan, index=dates.index, dtype=object)
    for label, first_day, next_first_day in period_bounds:
        in_period = (dates >= first_day) & (dates < next_first_day)
        labels.loc[in_period] = label
    return labels


# Period in which each cluster starts and ends. The columns are rebuilt from scratch,
# so re-running the cell cannot leave stale labels behind.
grouped_clusters_dates['periodStart'] = _period_of(grouped_clusters_dates['min'])
grouped_clusters_dates['periodEnd'] = _period_of(grouped_clusters_dates['max'])

# 'pX-pY' tag; it is NaN when either end falls outside the study periods.
grouped_clusters_dates['period'] = grouped_clusters_dates['periodStart']+'-'+grouped_clusters_dates['periodEnd']

In [None]:
# Median and standard deviation of the duration, restricted to clusters that start and
# end within the same period.
within_one_period = grouped_clusters_dates['period'].isin(['p1-p1', 'p2-p2', 'p3-p3', 'p4-p4', 'p5-p5'])
(
    grouped_clusters_dates[within_one_period]
    .groupby('period')['duration']
    .agg(['median', 'std'])
    .round(2)
)

In [None]:
# Add the per-cluster attributes of df_combined (left join on clusterID keeps every cluster).
# NOTE(review): not idempotent - re-running the cell merges the same columns again and
# pandas then suffixes them with _x / _y.
grouped_clusters_dates = grouped_clusters_dates.merge(
    df_combined,
    on='clusterID',
    how='left',
)

In [None]:
# Median and standard deviation of the number of positive tests, restricted to clusters
# that start and end within the same period.
within_one_period = grouped_clusters_dates['period'].isin(['p1-p1', 'p2-p2', 'p3-p3', 'p4-p4', 'p5-p5'])
(
    grouped_clusters_dates[within_one_period]
    .groupby('period')['Number of positive tests']
    .agg(['median', 'std'])
    .round(2)
)

## Figure S1

In [None]:
# highlight_communes = ['Lausanne','Montreux','Nyon','Yverdon-les-Bains']
# communes_four_main = gdf_communes_vd[(gdf_communes_vd.NAME.isin(highlight_communes))&(gdf_communes_vd['Population size'].isnull() == False)]
# communes_four_main['geometry'] = gdf_communes_vd[(gdf_communes_vd.NAME.isin(highlight_communes))&(gdf_communes_vd['Population size'].isnull() == False)].geometry.centroid

# communes_four_main['x'], communes_four_main['y'] = communes_four_main.geometry.x, communes_four_main.geometry.y

# communes_four_main.loc[9, 'x'] = 2539101
# communes_four_main.loc[9, 'y'] = 1154000

# communes_four_main['geometry'] = gpd.points_from_xy(communes_four_main.x, communes_four_main.y)

In [None]:
# Load the geo-parquet layer mapped in Figure S1 (the plot below reads its `Persistence_survival` and `geometry` columns).
df = gpd.read_parquet(data_folder/'df_survival_covariates.geoparquet')

In [None]:
from matplotlib.patheffects import withStroke

# Figure S1: cluster persistence drawn over a light-grey background of the communes.
fig, ax = plt.subplots(figsize=(8, 8))

# Coloured persistence layer (zorder 2, i.e. above the background).
persistence_layer = df[['Persistence_survival', 'geometry']]
persistence_layer.plot(
    'Persistence_survival',
    cmap='RdYlGn_r',
    legend=True,
    legend_kwds={'shrink': 0.4, 'label': 'Cluster persistence (days)'},
    zorder=2,
    ax=ax,
)
ax.set_axis_off()

# Commune background (zorder 1) and black markers for the reference communes.
gdf_communes_vd.plot(ax=ax, color='lightgrey', facecolor=None, zorder=1)
communes_four_main.geometry.plot(markersize=4, color='black', ax=ax)

# Commune names, outlined in white so they stay readable on top of the coloured layer.
label_points = communes_four_main.geometry.centroid
for x, y, label in zip(label_points.x, label_points.y, communes_four_main['NAME']):
    ax.text(
        x, y, label,
        fontsize=6, ha='right', va='bottom',
        path_effects=[withStroke(linewidth=2, foreground='white')],
    )

plt.savefig('../figures/Figure S1.png', dpi=300, transparent=True, bbox_inches='tight')