In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import joblib
import matplotlib.pyplot as plt
%matplotlib inline

## Load and prepare datasets

In [2]:
def get_metadata(df):
    """Return only the identifying columns of `df`: city, latitude and longitude."""
    metadata_columns = ['city', 'latitude', 'longitude']
    return df[metadata_columns]

def get_climate_features(df):
    """Return a copy of `df` without the city, latitude and longitude columns."""
    metadata_columns = ['city', 'latitude', 'longitude']
    return df.drop(columns=metadata_columns)

In [3]:
# Read one climate-feature CSV per period, keyed by the period label

period_names = ['1970-1979', '1980-1989', '1990-1999', '2000-2009', '2010-2020']
datasets = {}

for period_name in period_names:
    period_df = pd.read_csv(f"datasets/climate_features_{period_name}.csv")
    datasets[period_name] = period_df
    print(f"Loaded climate_features_{period_name}.csv with shape {period_df.shape}")

Loaded climate_features_1970-1979.csv with shape (292, 27)
Loaded climate_features_1980-1989.csv with shape (292, 27)
Loaded climate_features_1990-1999.csv with shape (292, 27)
Loaded climate_features_2000-2009.csv with shape (292, 27)
Loaded climate_features_2010-2020.csv with shape (292, 27)


In [4]:
# Strip the metadata columns so that only climate features remain for analysis

climate_ds = {
    period: get_climate_features(datasets[period])
    for period in period_names
}

## Load PCA and scaler models

In [5]:
# Load the persisted artifacts used by the projection cells below:
#   - pca: object providing `explained_variance_ratio_` and `transform`
#   - scaler: object providing `transform`, applied before the PCA
#   - feature_columns: column selection applied to each dataset before scaling
# NOTE: joblib.load relies on pickle and can execute arbitrary code while
# loading; only load model files that come from a trusted source.
pca = joblib.load('models/pca_historical.joblib')
scaler = joblib.load('models/scaler_pca_historical.joblib')
feature_columns = joblib.load('models/feature_columns.joblib')

## Apply PCA and scaler to datasets

In [6]:
def find_nb_min_components_for_variance(pca, target_explained_variance=0.9):
    """Find the minimum number of PCA components needed to explain the target variance.
        Arguments:
            pca: Trained PCA object exposing `explained_variance_ratio_`
            target_explained_variance: Target explained variance (default is 0.9)
        Returns: Minimum number of leading components (as a Python int) whose
            cumulative explained variance reaches the target. If the target is
            never reached (for example because of floating-point rounding close
            to 1.0, or because the PCA kept only part of the variance), the
            total number of available components is returned instead of a
            count larger than what the PCA can provide.
    """
    cumulated_variance = np.cumsum(pca.explained_variance_ratio_)

    # Components whose running total is still below the target; the next
    # component is the first one that reaches it.
    nb_components = int(np.sum(cumulated_variance < target_explained_variance)) + 1

    # Never report more components than the PCA actually has.
    return min(nb_components, len(cumulated_variance))

def project_data_pca(df, scaler, pca, n_components):
    """Scale the data, project it with the PCA and keep the leading components.
        Arguments:
            df: Input dataframe to project
            scaler: Trained scaler object providing `transform`
            pca: Trained PCA object providing `transform`
            n_components: Number of leading PCA components to keep
        Returns: Projected data restricted to its first `n_components` columns
    """
    all_components = pca.transform(scaler.transform(df))
    return all_components[:, :n_components]

def add_metadata_to_df(df, metadata):
    """Return a frame with the `metadata` columns placed before the columns of `df`.

    The two frames are joined side by side by pandas, i.e. rows are matched on
    their index.
    """
    combined = pd.concat([metadata, df], axis='columns')
    return combined

In [7]:
# Share of the total variance that the retained principal components must explain.
TARGET_EXPLAINED_VARIANCE = 0.9

# Number of leading components kept for every projection in the cells below.
min_components = find_nb_min_components_for_variance(pca, TARGET_EXPLAINED_VARIANCE)
print(f"Minimum number of PCA components to explain {TARGET_EXPLAINED_VARIANCE*100}% variance: {min_components}")

Minimum number of PCA components to explain 90.0% variance: 3


In [8]:
# Project every period onto the retained principal components and re-attach
# the city metadata so that each row stays identifiable.
pc_labels = [f'PC{i+1}' for i in range(min_components)]
pca_df = {}

for period_name in period_names:
    projected_data = project_data_pca(
        climate_ds[period_name][feature_columns], scaler, pca, min_components
    )
    components_df = pd.DataFrame(projected_data, columns=pc_labels)
    metadata = get_metadata(datasets[period_name])
    pca_df[period_name] = add_metadata_to_df(components_df, metadata)
    print(f"PCA DataFrame for {period_name} shape: {pca_df[period_name].shape}")

PCA DataFrame for 1970-1979 shape: (292, 6)
PCA DataFrame for 1980-1989 shape: (292, 6)
PCA DataFrame for 1990-1999 shape: (292, 6)
PCA DataFrame for 2000-2009 shape: (292, 6)
PCA DataFrame for 2010-2020 shape: (292, 6)


In [9]:
# Preview the projected 2010-2020 frame: metadata columns followed by the PC columns.
display(pca_df['2010-2020'].head())

Unnamed: 0,city,latitude,longitude,PC1,PC2,PC3
0,Aachen,50.776642,6.08342,-0.126268,0.146205,0.496767
1,Aberdeen,57.143688,-2.09814,-0.639263,-0.533712,1.603871
2,Aix-en-Provence,43.528301,5.44973,-2.208006,0.39659,-0.366904
3,Alcalá de Henares,40.482052,-3.35996,-2.097754,0.11721,-1.199937
4,Alicante,38.345169,-0.48149,-3.306525,0.221981,-1.022172


## Find closest city

In [10]:
from sklearn.metrics.pairwise import euclidean_distances
from scipy.spatial.distance import mahalanobis

In [11]:
def create_distance_matrix(ds_current, ds_target, distance_metric='euclidean', n_metadata_columns=3):
    """Compute the pairwise distances between the rows of two datasets.

    The first `n_metadata_columns` columns of each dataframe are skipped; the
    remaining columns are used as numeric feature vectors.

        Arguments:
            ds_current: Dataframe whose rows become the rows of the matrix
            ds_target: Dataframe whose rows become the columns of the matrix
            distance_metric: 'euclidean' or 'mahalanobis'. For 'mahalanobis'
                the inverse covariance matrix is estimated from the feature
                columns of `ds_target` only.
            n_metadata_columns: Number of leading non-feature columns to skip
                (default is 3)
        Returns: List of lists where element [i][j] is the distance between
            row i of `ds_current` and row j of `ds_target`
        Raises: ValueError if `distance_metric` is not supported
    """
    # Function-scope import so the block does not depend on other cells.
    from scipy.spatial.distance import cdist

    # Validate up front so that a wrong metric is reported even for empty input.
    if distance_metric not in ('euclidean', 'mahalanobis'):
        raise ValueError("Unsupported distance metric. Use 'euclidean' or 'mahalanobis'.")

    current_features = ds_current.iloc[:, n_metadata_columns:].to_numpy(dtype=float)
    target_features = ds_target.iloc[:, n_metadata_columns:].to_numpy(dtype=float)

    # Nothing to compare: one (empty) row per current city.
    if current_features.shape[0] == 0 or target_features.shape[0] == 0:
        return [[] for _ in range(current_features.shape[0])]

    if distance_metric == 'euclidean':
        distances = cdist(current_features, target_features, metric='euclidean')
    else:
        # The inverse covariance depends only on the target set, so compute it
        # once instead of once per pair. atleast_2d keeps the single-feature
        # case invertible (np.cov returns a 0-d array there).
        VI = np.linalg.inv(np.atleast_2d(np.cov(target_features, rowvar=False)))
        distances = cdist(current_features, target_features, metric='mahalanobis', VI=VI)

    return distances.tolist()

In [12]:
def find_closest_city(
    target_ds,
    current_ds,
    current_city,
    distance_matrix
):
    """
    Find the city of `target_ds` that is closest to `current_city`.

    `distance_matrix` is read positionally: row i corresponds to the i-th row
    of `current_ds` and column j to the j-th row of `target_ds`, whatever the
    index labels of the two dataframes are.

    Arguments:
        target_ds: Dataframe with a 'city' column, one row per matrix column
        current_ds: Dataframe with a 'city' column, one row per matrix row
        current_city: Name of the city to look up in `current_ds`
        distance_matrix: 2-D array-like of distances (current x target)
    Returns: Tuple (closest_city, closest_distance)
    Raises: ValueError if `current_city` is not present in `current_ds`
    """

    distance_matrix = np.asarray(distance_matrix)

    # Locate the city by name
    mask = (current_ds['city'] == current_city).to_numpy()

    if not mask.any():
        raise ValueError(
            f"City {current_city!r} not found in the current dataset."
        )

    # Use the row *position* (first match), not the index label: the distance
    # matrix is positional, so a non-default index must not shift the lookup.
    current_position = np.flatnonzero(mask)[0]

    # Closest city
    distances = distance_matrix[current_position]
    closest_position = np.argmin(distances)
    closest_distance = distances[closest_position]

    closest_city = target_ds['city'].iloc[closest_position]
    return closest_city, closest_distance


In [13]:
# Pair of coordinates, presumably (latitude, longitude) of Bern.
# NOTE(review): not referenced by any cell visible here — confirm it is still needed.
bern = (46.916666666666664, 7.466667)

In [14]:
# Pairwise Euclidean distances in PCA space: rows follow the 1970-1979 cities,
# columns follow the 2010-2020 cities.
euclidean_distances_matrix = create_distance_matrix(pca_df['1970-1979'], pca_df['2010-2020'], distance_metric='euclidean')

In [15]:
# Sanity check: which 2010-2020 city is closest to Bern's 1970-1979 climate?
find_closest_city(pca_df['2010-2020'], pca_df['1970-1979'], 'Bern', euclidean_distances_matrix)

('Bern', np.float64(0.7597246664849151))

In [16]:
metadata = get_metadata(datasets['2010-2020'])
# NOTE(review): `names` and `distances` are not used by the cells visible here;
# kept unchanged in case other cells rely on them.
names = datasets['2010-2020'][['city']]
distances = []

# For each city: find which city has TODAY (2010-2020) the climate closest to
# the one this city had in 1970-1979.
analog_records = []
for city in metadata['city']:
    analog_city, _ = find_closest_city(
        pca_df['2010-2020'],   # target: search among the 2010-2020 climates
        pca_df['1970-1979'],   # current: the city's 1970-1979 climate
        city,
        euclidean_distances_matrix
    )
    analog_records.append({'city_1970': city, 'city_analog_today': analog_city})

# Build the frame once instead of concatenating one row per iteration.
cities = pd.DataFrame(analog_records, columns=['city_1970', 'city_analog_today'])

In [17]:
# Each 1970-1979 city next to its closest 2010-2020 climate analog.
display(cities)

Unnamed: 0,city_1970,city_analog_today
0,Aachen,Aachen
1,Aberdeen,Aberdeen
2,Aix-en-Provence,Montpellier
3,Alcalá de Henares,Krasnodar
4,Alicante,Valencia
...,...,...
287,Zagreb,Zagreb
288,Zaragoza,Salamanca
289,Zürich,Bern
290,Århus,Gdynia


In [18]:
# calculate the percentage of cities that remained the same
same_city_count = np.sum(cities['city_1970'] == cities['city_analog_today'])
total_cities = len(cities)

f"Percentage of cities that remained the same: {same_city_count / total_cities * 100:.2f}%"

'Percentage of cities that remained the same: 13.70%'

## Save distance matrices to CSV

In [35]:
def matrix_to_csv(matrix, filename):
    """Write a distance matrix to `filename` as CSV, with header labels "1", "2", ... and no index column."""
    frame = pd.DataFrame(matrix)
    n_columns = len(frame.columns)
    # Column labels are 1-based positions.
    frame.columns = list(map(str, range(1, n_columns + 1)))
    frame.to_csv(filename, index=False)

In [36]:
# Persist the 1970-1979 vs 2010-2020 Euclidean distance matrix.
# NOTE(review): the "ssp580" suffix in the file name does not obviously relate
# to the two periods compared here — confirm the intended file name.
matrix_to_csv(euclidean_distances_matrix, 'distances/euclidean_1970-1979_ssp580.csv')