# Sample design: D-optimal

In [182]:
from pathlib import Path
import os
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib as matplotlib
import matplotlib.pyplot as plt
import datetime as dt
import shapely
from shapely.geometry import Point, Polygon, MultiPoint, MultiPolygon
import geopandas as gpd
import xarray as xr

from sklearn import cluster
from sklearn import neighbors
from sklearn.model_selection import train_test_split, ShuffleSplit, cross_validate
from sklearn import metrics

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV

from sklearn.dummy import DummyRegressor
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression

import cvxpy as cp

In [20]:
# Repository root. Defaults to the original author's checkout; set the
# OCEAN_CARBON_REPO environment variable to run the notebook from another
# location without editing this cell.
repo_path = Path(os.environ.get('OCEAN_CARBON_REPO', '/Users/etriesch/dev/ocean-carbon-sampling/'))
# Data directories, relative to the repository root.
data_clean_path = repo_path / 'data/clean/'
data_raw_path = repo_path / 'data/raw/'
# Coordinate reference systems: EPSG:4326 (geographic lon/lat) and a PROJ
# string selecting the cylindrical equal-area projection.
geo_crs = 'epsg:4326'
proj_crs = '+proj=cea'

## Read in and merge data

In [44]:
# Read the monthly chlorophyll-a chunks and stack them into a single frame.
c_path = data_clean_path / 'chlor_a_monthly'
c_files = sorted(f for f in os.listdir(c_path) if f.endswith('.csv.gz'))
print('files to read:', len(c_files))
# Read every chunk first and concatenate once: calling pd.concat inside the
# loop re-copies all previously accumulated rows on every iteration.
c_chunks = [pd.read_csv(c_path / f) for f in tqdm(c_files)]
# Chunks are stacked in reverse file order to reproduce the row order of the
# earlier loop, which prepended each newly read chunk. An empty directory
# still yields an empty DataFrame instead of raising.
c_raw = pd.concat(c_chunks[::-1]) if c_chunks else pd.DataFrame()

files to read: 9


100%|█████████████████████████████████████████████| 9/9 [00:36<00:00,  4.11s/it]


In [45]:
# Read the monthly sea-surface-temperature chunks and stack them into a single frame.
t_path = data_clean_path / 'sst_monthly'
t_files = sorted(f for f in os.listdir(t_path) if f.endswith('.csv.gz'))
print('files to read:', len(t_files))
# Read every chunk first and concatenate once: calling pd.concat inside the
# loop re-copies all previously accumulated rows on every iteration.
t_chunks = [pd.read_csv(t_path / f) for f in tqdm(t_files)]
# Chunks are stacked in reverse file order to reproduce the row order of the
# earlier loop, which prepended each newly read chunk. An empty directory
# still yields an empty DataFrame instead of raising.
t_raw = pd.concat(t_chunks[::-1]) if t_chunks else pd.DataFrame()

files to read: 9


100%|█████████████████████████████████████████████| 9/9 [00:34<00:00,  3.89s/it]


In [101]:
# load the monthly ship pCO2 table
pco2_file = data_clean_path / 'ship_pc02_monthly.csv.gz'
p_raw = pd.read_csv(pco2_file)

In [106]:
# Left-join the chlorophyll and SST tables onto p_raw, matching on
# x/y position plus year and month.
merge_keys = ['x', 'y', 'year', 'month']
m = (
    p_raw
    .merge(c_raw, how='left', on=merge_keys, suffixes=('_p', '_c'))
    .merge(t_raw, how='left', on=merge_keys, suffixes=('', '_t'))
)
print('Merged data shape:', m.shape)
# keep only the rows where both cleaned covariates are present
m_sub = m.dropna(subset=['chlor_a_cln', 'sst_cln'])
print('Merged data shape after dropping missing:', m_sub.shape)

Merged data shape: (155726, 29)
Merged data shape after dropping missing: (154701, 29)


In [132]:
# load the annual SST and chlorophyll-a tables
t_annual_raw, c_annual_raw = (
    pd.read_csv(data_clean_path / fname)
    for fname in ('sst_annual.csv', 'chlor_a_annual.csv')
)

In [133]:
# Join the annual SST and chlorophyll tables onto the monthly frame by
# x/y position and year. The default inner join drops rows that have no
# annual match.
# NOTE: this cell reassigns m_sub from itself, so re-running it merges the
# annual columns onto an already merged frame; re-run the cell that first
# builds m_sub beforehand.
annual_keys = ['x', 'y', 'year']
m_sub = (
    m_sub
    .merge(t_annual_raw, on=annual_keys, suffixes=('', '_ta'))
    .merge(c_annual_raw, on=annual_keys, suffixes=('', '_ca'))
)

## Make X, y input datasets

In [134]:
# split the merged frame into the response (y) and everything else (X)
response_col = 'pco2_teq'
X = m_sub.drop(columns=[response_col])
y = m_sub.loc[:, response_col]

## Cross validate each regression approach
See template [here](https://github.com/etrieschman/indigo-soil-imputation/blob/master/notebooks/cross_val_interpolators_v2.ipynb)
* Global mean
* Ocean mean
* KNN
* Linear regression
* Elastic net regression (Lasso vs. Ridge regression)
* AdaBoost regression
* Polynomial regression (see https://scikit-learn.org/stable/modules/linear_model.html#polynomial-regression-extending-linear-models-with-basis-functions)

In [135]:
def bias(y_true, y_pred):
    """Return the estimated bias of the predictions.

    The bias is the mean signed error, ``mean(y_pred - y_true)``: positive
    when the predictions are too high on average, negative when too low.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    float
        Mean of ``y_pred - y_true``.
    """
    # Convert to plain arrays so the subtraction is positional. Two pandas
    # objects with different indexes would otherwise be aligned by label and
    # produce NaN, and plain Python lists do not support subtraction at all.
    errors = np.asarray(y_pred) - np.asarray(y_true)
    return errors.mean()

In [136]:
# Metrics evaluated on every cross-validation fold; cross_validate reports
# each one under the key below prefixed with "test_".
scoring = {
    'neg_mean_squared_error': 'neg_mean_squared_error',
    'bias': metrics.make_scorer(bias),
    'r_squared': 'r2',
}

In [238]:
# Predictors used for this run. A wider candidate set, kept for reference:
#   'pacific', 'month', 'year', 'chlor_a_cln', 'sst_cln',
#   'months_below_mean', 'max', 'std', 'max_ca', 'std_ca'
feature_cols = ['pacific', 'month', 'chlor_a_cln', 'sst_cln']
# a list (not a tuple) makes it unambiguous that several columns are selected
x_in = X.loc[:, feature_cols]

# 10-fold cross-validation of a decision tree. random_state is fixed because
# the tree permutes the features at every split, so an unseeded estimator
# gives slightly different scores on each run.
cv_mean = cross_validate(DecisionTreeRegressor(random_state=0), X=x_in, y=y,
    scoring=scoring, cv=10)
pd.DataFrame(cv_mean).describe()

Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_bias,test_r_squared
count,10.0,10.0,10.0,10.0,10.0
mean,0.523331,0.011266,-1158.623183,-0.291505,0.067389
std,0.016804,0.001059,1005.926409,5.418074,0.461708
min,0.496693,0.009504,-3474.884438,-11.722289,-0.872958
25%,0.515597,0.010598,-1221.999446,-3.024189,-0.22735
50%,0.519326,0.011301,-968.591583,0.101861,0.202001
75%,0.528154,0.011531,-520.815382,3.796112,0.424981
max,0.551513,0.013322,-154.207586,6.142043,0.520801


## Summarize bias, variance, MSE

# D-optimal design

In [239]:
from sklearn.preprocessing import KBinsDiscretizer

In [255]:
# Candidate discretizers; the unique binned rows form the full set of
# possible experiments. Only the equal-width one is applied below.
disc_eq_width = KBinsDiscretizer(strategy='uniform', encode='ordinal', n_bins=10)
disc_eq_freq = KBinsDiscretizer(strategy='quantile', encode='ordinal', n_bins=10)
disc_kmeans = KBinsDiscretizer(strategy='kmeans', encode='ordinal', n_bins=5)

# bin the input features and keep one row per distinct bin combination
binned = disc_eq_width.fit_transform(x_in)
x_disc = pd.DataFrame(binned, columns=x_in.columns).drop_duplicates()
x_disc_mat = x_disc.to_numpy()

In [256]:
# D-optimal design as a convex problem: choose weights l over the p candidate
# points (the rows x_i of x_disc_mat) that maximise the log-determinant of
#   M(l) = sum_i l_i * x_i x_i^T = X^T diag(l) X
x_disc_mat = x_disc.to_numpy()
p = x_disc_mat.shape[0]

# variable: one design weight per candidate point
l = cp.Variable(p)

# constraints: weights are non-negative and sum to one
constraints = [l >= 0, cp.sum(l) == 1]

# objective: the weighted sum of outer products is written in its matrix
# form, which gives the solver one affine expression instead of p separate
# rank-one terms
info_mat = x_disc_mat.T @ cp.diag(l) @ x_disc_mat
obj = cp.Minimize(-cp.log_det(info_mat))

# problem
prob = cp.Problem(obj, constraints)
prob.solve()
print('status:', prob.status)
print('optimal value:', prob.value)

status: optimal
optimal value: -13.620997508420519


In [257]:
# summary statistics of the solved design weights
design_weights = pd.Series(l.value)
design_weights.describe()

count    8.140000e+02
mean     1.228501e-03
std      1.384035e-02
min     -7.420027e-07
25%     -2.919113e-07
50%     -1.480389e-07
75%      4.428941e-08
max      2.022624e-01
dtype: float64

In [258]:
# Scale the weights by 200 (presumably the total number of samples to
# allocate -- confirm) and count how often each rounded allocation occurs.
n_samples = 200
allocation = np.round(n_samples * l.value)
pd.Series(allocation).value_counts()

0.0     806
40.0      2
20.0      1
26.0      1
23.0      1
5.0       1
9.0       1
37.0      1
dtype: int64