# Station sampling

I want to sample the stations in a relatively uniform manner so that there are no big spatial biases in the dataset.
This sampling will be used to create different sets of stations (train, val, etc)

In [1]:
# Environment check: list the GPUs visible on this machine.
!nvidia-smi

Tue Oct 25 14:41:28 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 520.61.05    Driver Version: 520.61.05    CUDA Version: 11.8     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A5000    On   | 00000000:67:00.0 Off |                  Off |
| 30%   34C    P8    14W / 230W |      1MiB / 24564MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Reload imported modules automatically before each cell runs, so edits to
# local source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import dask
import dask.dataframe as dd
import dask_jobqueue
import dask.distributed
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pathlib
import plotly.express as px
import random

## Boot dask

In [None]:
# Shell commands handed to dask-jobqueue through `env_extra`: load the user's
# bash profile, then activate the `smc01` conda environment.
job_setup_commands = ['source ~/.bash_profile', 'conda activate smc01']
cluster = dask_jobqueue.SLURMCluster(
    name='smc01-dask',
    env_extra=job_setup_commands,
)

In [None]:
# Request 4 SLURM jobs' worth of workers.
cluster.scale(jobs=4)

In [None]:
# Connect a distributed client to the cluster created above.
client = dask.distributed.Client(cluster)

In [None]:
# Display the client summary.
client

In [None]:
# Root of the data tree, taken from the DATA_DIR environment variable.
# `os.environ[...]` raises a KeyError naming the variable when it is unset,
# instead of the opaque TypeError produced by `pathlib.Path(None)`.
DATA_DIR = pathlib.Path(os.environ["DATA_DIR"])
input_path = DATA_DIR / 'interpolated/2021-12-20-gdps-metar/'

# Sorted so that the file order (and therefore the sample file) is deterministic.
parquet_files = sorted(input_path.glob('*.parquet'))
if not parquet_files:
    # Fail here with a clear message rather than later with a bare StopIteration.
    raise FileNotFoundError(f'No parquet files found in {input_path}')

In [None]:
# Load the first parquet file eagerly with pandas; it serves as a reference
# sample (its column list is reused when opening the full dataset).
first_parquet_file = next(iter(parquet_files))
sample = pd.read_parquet(first_parquet_file)

In [None]:
# Path of the first parquet file (the same file `sample` is loaded from).
sample_path = next(iter(parquet_files))

In [None]:
# Lazily open all parquet files as a single dask dataframe, restricted to the
# columns present in the sample file. The keyword is `columns`.
df = dd.read_parquet(parquet_files, columns=list(sample.columns))

## Create station index w/ metadata

In [None]:
stations = df.groupby('station').agg({'latitude': 'first', 'longitude': 'first', 'elevation': 'first'}).compute()

In [None]:
stations

In [None]:
stations_path = DATA_DIR / 'interpolated/2021-12-20-gdps-metar/stations_w_metadata.csv'
stations.to_csv(stations_path)

In [None]:
stations = pd.read_csv(stations_path)

## Try and compute distance

Si les $\phi_i$ et $\lambda_i$ sont respectivement les latitudes et longitudes (en radians) des deux points, et $\Delta\lambda = \lambda_2 - \lambda_1$, l'angle central est $\Delta\sigma = \arccos\bigl(\sin\phi_1\sin\phi_2 + \cos\phi_1\cos\phi_2\cos(\Delta\lambda)\bigr).$


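La distance orthodromique (great-circle distance) serait $d = r \, \Delta\sigma$, mais on n'a pas vraiment besoin du rayon $r$ : seul l'ordre relatif des distances compte pour l'échantillonnage.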

Source: https://en.wikipedia.org/wiki/Great-circle_distance#Formulae


In [None]:
lat = np.deg2rad(stations['latitude'].to_numpy()).reshape(-1, 1)
lon = np.deg2rad(stations['longitude'].to_numpy()).reshape(-1, 1)

lon[lon < 0.0] += 2 * np.pi

In [None]:
lat

In [None]:
lon

In [None]:
cos_delta_lambda = np.cos(lon - lon.T)

In [None]:
cos_delta_lambda

In [None]:
cos_cos_cos = (np.cos(lat) * np.cos(lat.T)) * cos_delta_lambda

In [None]:
sin_sin = np.sin(lat) * np.sin(lat.T)

In [None]:
cos_cos_cos.shape

In [None]:
sin_sin.shape

In [None]:
distances = np.arccos(sin_sin + cos_cos_cos)
np.fill_diagonal(distances, 0.0)

In [None]:
distances

## Furthest point sampling

In [None]:
# Seed station for the furthest point sampling, drawn uniformly among the rows
# of `distances`. `randrange(n)` yields 0..n-1; `randint(0, n)` also yields n,
# which is one past the last valid row. A dedicated seeded generator makes the
# resulting station sets reproducible across kernel restarts.
SEED = 0
selected_station_ids = [random.Random(SEED).randrange(distances.shape[0])]
selected_station_ids

In [None]:
N_STATIONS = 200

# Greedy furthest point sampling: repeatedly add the station whose distance to
# the closest already-selected station is the largest.
#
# `distance_to_closest[i]` is the distance from station i to its nearest
# selected station. It is updated incrementally and, once the loop ends,
# describes the complete selected set.
distance_to_closest = distances[selected_station_ids].min(axis=0)

# Loop on the size of the selection (seed station included) so that exactly
# N_STATIONS stations are kept and re-running the cell does not grow the list.
while len(selected_station_ids) < N_STATIONS:
    furthest = int(np.argmax(distance_to_closest))
    if distance_to_closest[furthest] <= 0.0:
        # Every remaining station coincides with an already selected one.
        break
    selected_station_ids.append(furthest)
    # Only the newly added station can lower a station's distance to the set.
    distance_to_closest = np.minimum(distance_to_closest, distances[furthest])

# Largest remaining distance from any station to the selected set.
distance_to_closest.max()

In [None]:
selected_station_ids

In [None]:
# Positional lookup: the ids are row positions in `distances`, which was built
# from the latitude/longitude columns of `stations` in row order.
selected_stations = stations.iloc[selected_station_ids]
selected_stations.head()

In [None]:
# Write the sampled stations as the bootstrap set. `to_cyul` and `selected` are
# helper columns that only exist once later cells have run; `errors='ignore'`
# lets this cell work on a fresh top-to-bottom run as well.
selected_stations.drop(columns=['to_cyul', 'selected'], errors='ignore').to_csv(DATA_DIR / 'bootstrap_set.csv')

In [None]:
mask = np.zeros(len(stations), dtype=bool)

In [None]:
mask[selected_station_ids] = True

In [None]:
not_selected_stations = stations[~mask]

In [None]:
# Write the remaining stations as the reference set. `to_cyul` and `selected`
# are helper columns that only exist once later cells have run;
# `errors='ignore'` lets this cell work on a fresh top-to-bottom run as well.
not_selected_stations.drop(columns=['to_cyul', 'selected'], errors='ignore').to_csv(DATA_DIR / 'reference_set.csv')

In [None]:
# Metadata of the sampled stations without the helper columns. Rows of
# `selected_stations` are the sampled stations by construction, so no filter on
# the `selected` flag is needed; `errors='ignore'` keeps the cell runnable
# before the helper columns have been created.
selected_stations.drop(columns=['to_cyul', 'selected'], errors='ignore')

In [None]:
# Flag column for membership in the sample; every station starts unselected.
stations['selected'] = False

In [None]:
stations

In [None]:
# Flag the sampled rows. The column position is looked up by name rather than
# hard-coded, so the write cannot land in another column if the layout of
# `stations` differs (e.g. when the station id is a regular column).
stations.iloc[selected_station_ids, stations.columns.get_loc('selected')] = True

In [None]:
# Map of all stations, coloured by whether they belong to the sample.
px.scatter_geo(
    stations,
    lat='latitude',
    lon='longitude',
    color='selected',
    scope='north america',
    width=1000,
    height=1000,
)

In [None]:
# Row of station CYUL, looked up by label (requires the station id to be the index).
stations.loc['CYUL']

In [None]:
# Integer row position of CYUL within `stations`.
stations.index.get_loc('CYUL')

In [None]:
distances[437]

In [None]:
stations['to_cyul'] = distances[437]

In [None]:
# Map of all stations, coloured by their distance to CYUL.
px.scatter_geo(
    stations,
    lat='latitude',
    lon='longitude',
    color='to_cyul',
    scope='north america',
    width=1000,
    height=1000,
)

In [None]:
# Distance from every station to its nearest selected station, as left behind
# by the furthest point sampling loop.
distance_to_closest

In [None]:
# Store it as a column so it can be used to colour the map.
stations['to_closest'] = distance_to_closest

In [None]:
# Map of all stations, coloured by their distance to the nearest selected station.
px.scatter_geo(
    stations,
    lat='latitude',
    lon='longitude',
    color='to_closest',
    scope='north america',
    width=1000,
    height=1000,
)