In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import joblib
import matplotlib.pyplot as plt
%matplotlib inline

## Load and prepare datasets

In [2]:
def get_metadata(df):
    """Return only the identifying columns of ``df``: city, latitude and longitude."""
    metadata_columns = ['city', 'latitude', 'longitude']
    return df[metadata_columns]

def get_climate_features(df):
    """Return ``df`` without its city, latitude and longitude columns."""
    metadata_columns = ['city', 'latitude', 'longitude']
    return df.drop(columns=metadata_columns)

In [3]:
# Read one climate-feature CSV per period into `datasets`, keyed by period name.

period_names = [
    '1970-1979',
    '2010-2020',
    '2041-2050_future_ssp585',
    '2041-2050_future_ssp370',
    '2041-2050_future_ssp126',
]
datasets = {}

for period_name in period_names:
    period_frame = pd.read_csv(f"datasets/climate_features_{period_name}.csv")
    datasets[period_name] = period_frame
    # Print the shape right after loading so an unexpected file is easy to spot.
    print(f"Loaded climate_features_{period_name}.csv with shape {period_frame.shape}")

Loaded climate_features_1970-1979.csv with shape (292, 27)
Loaded climate_features_2010-2020.csv with shape (292, 27)
Loaded climate_features_2041-2050_future_ssp585.csv with shape (292, 27)
Loaded climate_features_2041-2050_future_ssp370.csv with shape (292, 27)
Loaded climate_features_2041-2050_future_ssp126.csv with shape (292, 27)


In [4]:
# Strip the metadata columns so that only the climate variables remain per period.

climate_ds = {
    period_name: get_climate_features(datasets[period_name])
    for period_name in period_names
}

## Load PCA and scaler models

In [5]:
# Load the persisted PCA, its scaler, and the list of feature columns that is
# later used to select the PCA inputs from each dataset.
# NOTE(review): joblib files are pickle-based; only load artifacts from a trusted source.
pca, scaler, feature_columns = (
    joblib.load(artifact_path)
    for artifact_path in (
        'models/pca_historical.joblib',
        'models/scaler_pca_historical.joblib',
        'models/feature_columns.joblib',
    )
)

## Apply PCA and scaler to datasets

In [6]:
def find_nb_min_components_for_variance(pca, target_explained_variance=0.9):
    """Find the minimum number of PCA components needed to explain the target variance.

        Arguments:
            pca: Trained PCA object exposing ``explained_variance_ratio_``
            target_explained_variance: Target cumulative explained variance ratio (default is 0.9)
        Returns: Smallest number of leading components whose cumulative explained
            variance ratio reaches the target, as a plain ``int``. If the target is
            never reached (for example a PCA fitted with fewer components than
            features, or float round-off with a target of 1.0), the total number of
            available components is returned instead of an out-of-range count.
    """
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
    # Components strictly below the target are not enough on their own; the next one is.
    n_below_target = int(np.sum(cumulative_variance < target_explained_variance))

    # Clamp so the result can always be used to slice the PCA output.
    return min(n_below_target + 1, len(cumulative_variance))

def project_data_pca(df, scaler, pca, n_components):
    """Scale ``df`` and project it onto the first ``n_components`` principal components.
        Arguments:
            df: Input data to project; passed unchanged to ``scaler.transform``
            scaler: Fitted scaler object exposing ``transform``
            pca: Fitted PCA object exposing ``transform``
            n_components: Number of leading PCA components to keep
        Returns: The PCA-transformed data restricted to its first ``n_components`` columns
    """
    full_projection = pca.transform(scaler.transform(df))
    return full_projection[:, :n_components]

def add_metadata_to_df(df, metadata):
    """Return a frame with the ``metadata`` columns placed to the left of ``df``'s columns.

    The two inputs are joined column-wise with ``pd.concat``, which aligns rows on
    their index.
    """
    ordered_frames = [metadata, df]
    return pd.concat(ordered_frames, axis=1)

In [7]:
# Share of the total variance that the retained principal components must explain.
TARGET_EXPLAINED_VARIANCE = 0.9

min_components = find_nb_min_components_for_variance(
    pca,
    target_explained_variance=TARGET_EXPLAINED_VARIANCE,
)
print(f"Minimum number of PCA components to explain {TARGET_EXPLAINED_VARIANCE*100}% variance: {min_components}")

Minimum number of PCA components to explain 90.0% variance: 4


In [8]:
# Project every period with the same scaler and PCA, then re-attach the city metadata.
pca_df = {}
pc_column_names = [f'PC{i+1}' for i in range(min_components)]

for period_name in period_names:
    period_features = climate_ds[period_name][feature_columns]
    projected_data = project_data_pca(period_features, scaler, pca, min_components)
    projected_frame = pd.DataFrame(projected_data, columns=pc_column_names)
    metadata = get_metadata(datasets[period_name])
    pca_df[period_name] = add_metadata_to_df(projected_frame, metadata)
    print(f"PCA DataFrame for {period_name} shape: {pca_df[period_name].shape}")

PCA DataFrame for 1970-1979 shape: (292, 7)
PCA DataFrame for 2010-2020 shape: (292, 7)
PCA DataFrame for 2041-2050_future_ssp585 shape: (292, 7)
PCA DataFrame for 2041-2050_future_ssp370 shape: (292, 7)
PCA DataFrame for 2041-2050_future_ssp126 shape: (292, 7)


In [9]:
# Preview the first rows of the projected 2010-2020 frame (metadata columns followed by PCs).
display(pca_df['2010-2020'].head())

Unnamed: 0,city,latitude,longitude,PC1,PC2,PC3,PC4
0,Aachen,50.776642,6.08342,-0.071734,0.130183,0.616279,-0.310959
1,Aberdeen,57.143688,-2.09814,-1.083642,0.243241,2.213594,0.215005
2,Aix-en-Provence,43.528301,5.44973,-1.97283,0.050951,-0.400779,0.555935
3,Alcalá de Henares,40.482052,-3.35996,-1.807056,-0.593693,-1.693132,0.629092
4,Alicante,38.345169,-0.48149,-2.96156,-0.450912,-1.191013,1.166077


## Load encoder embeddings

Load the embeddings generated by the autoencoder

In [10]:
# Read the persisted autoencoder embeddings (one entry per period / scenario key).
encoder_historical_embeddings = joblib.load('models/encoder_historical_embeddings.joblib')
encoder_future_embeddings = joblib.load('models/encoder_future_embeddings.joblib')

# Print the shape stored under every key of both collections.
embedding_groups = (
    ("Historical embeddings loaded:", encoder_historical_embeddings),
    ("\nFuture embeddings loaded:", encoder_future_embeddings),
)
for heading, embeddings_by_key in embedding_groups:
    print(heading)
    for key in embeddings_by_key.keys():
        print(f"  {key}: shape {embeddings_by_key[key].shape}")

Historical embeddings loaded:
  1970-1979: shape (292, 6)
  1980-1989: shape (292, 6)
  1990-1999: shape (292, 6)
  2000-2009: shape (292, 6)
  2010-2020: shape (292, 6)

Future embeddings loaded:
  2041-2050_ssp126: shape (292, 6)
  2041-2050_ssp370: shape (292, 6)
  2041-2050_ssp585: shape (292, 6)


## Find closest city

In [11]:
from sklearn.metrics.pairwise import euclidean_distances
from scipy.spatial.distance import mahalanobis

In [12]:
def create_distance_matrix(ds_current, ds_target, distance_metric='euclidean', n_metadata_columns=3):
    """Compute all pairwise distances between the rows of two datasets.

    Arguments:
        ds_current: DataFrame whose rows become the rows of the result.
        ds_target: DataFrame whose rows become the columns of the result.
        distance_metric: 'euclidean' or 'mahalanobis'. For 'mahalanobis' the
            inverse covariance matrix is estimated from the feature columns of
            ``ds_target``.
        n_metadata_columns: Number of leading columns skipped in both datasets
            before computing distances. Defaults to 3, the number of columns
            this function has always skipped.
            NOTE(review): the same default is applied to every input, including
            the encoder embeddings; confirm those frames really start with three
            non-feature columns, otherwise pass ``n_metadata_columns=0``.

    Returns:
        List of lists where ``result[i][j]`` is the distance between row ``i`` of
        ``ds_current`` and row ``j`` of ``ds_target``.

    Raises:
        ValueError: If ``distance_metric`` is not supported. The check happens up
            front, so it also triggers for empty inputs.
    """
    # Local import: only `mahalanobis` is imported from scipy at notebook level.
    from scipy.spatial.distance import cdist

    if distance_metric not in ('euclidean', 'mahalanobis'):
        raise ValueError("Unsupported distance metric. Use 'euclidean' or 'mahalanobis'.")

    current_features = ds_current.iloc[:, n_metadata_columns:].to_numpy(dtype=float)
    target_features = ds_target.iloc[:, n_metadata_columns:].to_numpy(dtype=float)

    if distance_metric == 'euclidean':
        distances = cdist(current_features, target_features, metric='euclidean')
    else:
        # The inverse covariance depends only on the target set, so it is computed
        # once here rather than once per pair of rows.
        covariance = np.atleast_2d(np.cov(target_features.T))
        VI = np.linalg.inv(covariance)
        distances = cdist(current_features, target_features, metric='mahalanobis', VI=VI)

    # Keep the original list-of-lists return type for existing callers.
    return distances.tolist()

In [13]:
def find_closest_city(
    target_ds,
    current_ds,
    current_city,
    distance_matrix
):
    """
    Find the city of `target_ds` that is closest to `current_city` according to
    a precomputed distance matrix.

    Arguments:
        target_ds: DataFrame with a 'city' column; its row order must match the
            columns of `distance_matrix`.
        current_ds: DataFrame with a 'city' column; its row order must match the
            rows of `distance_matrix`.
        current_city: Name of the city to look up in `current_ds`. If the name
            occurs more than once, the first occurrence is used.
        distance_matrix: 2-D array-like where element [i][j] is the distance
            between row i of `current_ds` and row j of `target_ds`.

    Returns:
        Tuple `(closest_city, closest_distance)`.

    Raises:
        ValueError: If `current_city` is not present in `current_ds`.
    """

    distance_matrix = np.asarray(distance_matrix)

    # Use the row *position* (not the index label) of the city: the distance
    # matrix is positional, so labels would be wrong for any non-default index.
    matching_positions = np.flatnonzero((current_ds['city'] == current_city).to_numpy())

    if matching_positions.size == 0:
        raise ValueError(
            f"City '{current_city}' not found in the current dataset."
        )

    current_position = matching_positions[0]

    # Closest city
    distances = distance_matrix[current_position]
    closest_index = int(np.argmin(distances))
    closest_distance = distances[closest_index]

    closest_city = target_ds['city'].iloc[closest_index]
    return closest_city, closest_distance


In [14]:
# Distance from every other period to the 2010-2020 projection, for each metric.
# Results are stored under keys of the form '<metric>_<period>'.
distance_kinds = ['euclidean', 'mahalanobis']
present = pca_df['2010-2020']
distance_matrices = {}

for period in period_names:
    if period != '2010-2020':  # the present period is the reference, not a source
        for distance_kind in distance_kinds:
            print(f"Computing {distance_kind} distance matrix for period {period}...")
            matrix_key = f'{distance_kind}_{period}'
            distance_matrices[matrix_key] = create_distance_matrix(
                pca_df[period], present, distance_metric=distance_kind
            )
            print('Done')
        

Computing euclidean distance matrix for period 1970-1979...


Done
Computing mahalanobis distance matrix for period 1970-1979...
Done
Computing euclidean distance matrix for period 2041-2050_future_ssp585...
Done
Computing euclidean distance matrix for period 2041-2050_future_ssp585...
Done
Computing mahalanobis distance matrix for period 2041-2050_future_ssp585...
Done
Computing mahalanobis distance matrix for period 2041-2050_future_ssp585...
Done
Computing euclidean distance matrix for period 2041-2050_future_ssp370...
Done
Computing euclidean distance matrix for period 2041-2050_future_ssp370...
Done
Computing mahalanobis distance matrix for period 2041-2050_future_ssp370...
Done
Computing mahalanobis distance matrix for period 2041-2050_future_ssp370...
Done
Computing euclidean distance matrix for period 2041-2050_future_ssp126...
Done
Computing euclidean distance matrix for period 2041-2050_future_ssp126...
Done
Computing mahalanobis distance matrix for period 2041-2050_future_ssp126...
Done
Computing mahalanobis distance matrix for period 

In [15]:
# For each city, find the city whose 2010-2020 climate is closest to the climate
# that city had in 1970-1979, using the Euclidean distances in PCA space.
euclidean_distances_matrix = distance_matrices['euclidean_1970-1979']

metadata = get_metadata(datasets['2010-2020'])
# `names` and `distances` are not used by this cell; they are kept only so that
# any other cell referencing these globals keeps working.
names = datasets['2010-2020'][['city']]
distances = []

analog_records = []
for city in metadata['city']:
    # find_closest_city returns (city name, distance); only the name is kept here.
    analog_city, _analog_distance = find_closest_city(
        pca_df['2010-2020'],   # target: 2010-2020 climates to search in
        pca_df['1970-1979'],   # current: the city's 1970-1979 climate
        city,
        euclidean_distances_matrix
    )
    analog_records.append({'city_1970': city, 'city_analog_today': analog_city})

# Build the frame once instead of concatenating row by row inside the loop.
cities = pd.DataFrame(analog_records, columns=['city_1970', 'city_analog_today'])

In [16]:
# Show each city next to the city found as its 2010-2020 climate analog.
display(cities)

Unnamed: 0,city_1970,city_analog_today
0,Aachen,Aachen
1,Aberdeen,Aberdeen
2,Aix-en-Provence,Montpellier
3,Alcalá de Henares,Alcalá de Henares
4,Alicante,Valencia
...,...,...
287,Zagreb,Zagreb
288,Zaragoza,Valladolid
289,Zürich,Bern
290,Århus,Gdynia


In [17]:
# calculate the percentage of cities that remained the same
same_city_count = np.sum(cities['city_1970'] == cities['city_analog_today'])
total_cities = len(cities)

f"Percentage of cities that remained the same: {same_city_count / total_cities * 100:.2f}%"

'Percentage of cities that remained the same: 25.68%'

## Save distance matrices to CSV

In [18]:
def matrix_to_csv(matrix, filename):
    """Save distance matrix to CSV with columns starting at 1.

    Arguments:
        matrix: 2-D array-like accepted by ``pd.DataFrame``.
        filename: Destination of the CSV. When it is a path (``str`` or
            ``os.PathLike``), missing parent directories are created first so
            the notebook also runs on a fresh checkout.

    The header row is '1', '2', ... and the row index is not written.
    """
    import os

    # Only path-like destinations have a directory to create; anything else
    # (e.g. an open buffer) is handed to pandas untouched.
    if isinstance(filename, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(filename))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    df = pd.DataFrame(matrix)
    df.columns = [str(position) for position in range(1, df.shape[1] + 1)]
    df.to_csv(filename, index=False)

In [19]:
# Write every PCA-based distance matrix to its own CSV file.
for key in distance_matrices:
    matrix = distance_matrices[key]
    matrix_to_csv(matrix, f'distance_matrices/pca_{key}.csv')

## Compute and save distance matrices for encoder embeddings

In [20]:
# Distance matrices between encoder embeddings, keyed as '<metric>_<period>'.
# NOTE(review): create_distance_matrix skips the first three columns of its inputs;
# confirm the embedding frames really start with three non-feature columns.
encoder_distance_matrices = {}
distance_kinds = ['euclidean', 'mahalanobis']

# Every matrix is measured against the 2010-2020 embeddings.
reference_period = '2010-2020'
encoder_reference = encoder_historical_embeddings[reference_period]

# Historical side: 1970-1979 embeddings versus the reference.
historical_period = '1970-1979'
historical_embeddings = encoder_historical_embeddings[historical_period]
for distance_kind in distance_kinds:
    print(f"Computing {distance_kind} distance matrix for encoder embeddings {historical_period}...")
    matrix_key = f'{distance_kind}_{historical_period}'
    encoder_distance_matrices[matrix_key] = create_distance_matrix(
        historical_embeddings,
        encoder_reference,
        distance_metric=distance_kind
    )
    print('Done')

Computing euclidean distance matrix for encoder embeddings 1970-1979...
Done
Computing mahalanobis distance matrix for encoder embeddings 1970-1979...
Done
Computing mahalanobis distance matrix for encoder embeddings 1970-1979...
Done
Done


In [21]:
# Compute for future scenarios (2041-2050 with ssp126, ssp370, ssp585)
for key in encoder_future_embeddings.keys():
    for distance_kind in distance_kinds:
        print(f"Computing {distance_kind} distance matrix for encoder embeddings {key}...")
        encoder_distance_matrices[f'{distance_kind}_{key}'] = create_distance_matrix(
            encoder_future_embeddings[key], 
            encoder_reference, 
            distance_metric=distance_kind
        )
        print('Done')

Computing euclidean distance matrix for encoder embeddings 2041-2050_ssp126...
Done
Computing mahalanobis distance matrix for encoder embeddings 2041-2050_ssp126...
Done
Computing mahalanobis distance matrix for encoder embeddings 2041-2050_ssp126...
Done
Computing euclidean distance matrix for encoder embeddings 2041-2050_ssp370...
Done
Computing euclidean distance matrix for encoder embeddings 2041-2050_ssp370...
Done
Computing mahalanobis distance matrix for encoder embeddings 2041-2050_ssp370...
Done
Computing mahalanobis distance matrix for encoder embeddings 2041-2050_ssp370...
Done
Computing euclidean distance matrix for encoder embeddings 2041-2050_ssp585...
Done
Computing euclidean distance matrix for encoder embeddings 2041-2050_ssp585...
Done
Computing mahalanobis distance matrix for encoder embeddings 2041-2050_ssp585...
Done
Computing mahalanobis distance matrix for encoder embeddings 2041-2050_ssp585...
Done
Done


In [22]:
# Save encoder distance matrices to CSV files
for key, matrix in encoder_distance_matrices.items():
    filename = f'distance_matrices/embedding_{key}.csv'
    matrix_to_csv(matrix, filename)
    print(f"Saved {filename}")

Saved distance_matrices/embedding_euclidean_1970-1979.csv
Saved distance_matrices/embedding_mahalanobis_1970-1979.csv
Saved distance_matrices/embedding_euclidean_2041-2050_ssp126.csv
Saved distance_matrices/embedding_mahalanobis_2041-2050_ssp126.csv
Saved distance_matrices/embedding_euclidean_2041-2050_ssp126.csv
Saved distance_matrices/embedding_mahalanobis_2041-2050_ssp126.csv
Saved distance_matrices/embedding_euclidean_2041-2050_ssp370.csv
Saved distance_matrices/embedding_mahalanobis_2041-2050_ssp370.csv
Saved distance_matrices/embedding_euclidean_2041-2050_ssp370.csv
Saved distance_matrices/embedding_mahalanobis_2041-2050_ssp370.csv
Saved distance_matrices/embedding_euclidean_2041-2050_ssp585.csv
Saved distance_matrices/embedding_mahalanobis_2041-2050_ssp585.csv
Saved distance_matrices/embedding_euclidean_2041-2050_ssp585.csv
Saved distance_matrices/embedding_mahalanobis_2041-2050_ssp585.csv
