# Sample design: D-optimal

In [128]:
from pathlib import Path
import os
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib as matplotlib
import matplotlib.pyplot as plt
import datetime as dt
import shapely
from shapely.geometry import Point, Polygon, MultiPoint, MultiPolygon
import geopandas as gpd
import xarray as xr

from sklearn import cluster
from sklearn import neighbors
from sklearn.model_selection import train_test_split, ShuffleSplit, cross_validate
from sklearn import metrics

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV

from sklearn.dummy import DummyRegressor
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression

In [20]:
# Repository root. Defaults to the original author's checkout; set the
# OCEAN_CARBON_REPO environment variable to run this notebook from another
# machine without editing the cell.
repo_path = Path(os.environ.get('OCEAN_CARBON_REPO', '/Users/etriesch/dev/ocean-carbon-sampling/'))
# Cleaned and raw data directories, both relative to the repository root.
data_clean_path = repo_path / 'data/clean/'
data_raw_path = repo_path / 'data/raw/'
# CRS strings kept as notebook-wide constants (names suggest one geographic
# and one projected CRS; they are not used in the cells shown here).
geo_crs = 'epsg:4326'
proj_crs = '+proj=cea'

## Read in and merge data

In [44]:
# read in chlorophyll chunks
c_path = data_clean_path / 'chlor_a_monthly'
c_files = sorted(f for f in os.listdir(c_path) if f.endswith('.csv.gz'))
print('files to read:', len(c_files))
# Collect the chunks in a list and concatenate once at the end: calling
# pd.concat inside the loop re-copies everything read so far on every
# iteration, and seeding it with an empty DataFrame is unnecessary.
c_chunks = []
for f in tqdm(c_files):
    c = pd.read_csv(c_path / f)
    c_chunks.append(c)
# The earlier version prepended each chunk, so the last file's rows came
# first; reverse the list to keep that row order. Fall back to an empty
# frame when the directory holds no matching files.
c_raw = pd.concat(c_chunks[::-1]) if c_chunks else pd.DataFrame()

files to read: 9


100%|█████████████████████████████████████████████| 9/9 [00:36<00:00,  4.11s/it]


In [45]:
# read in temperature chunks
t_path = data_clean_path / 'sst_monthly'
t_files = sorted(f for f in os.listdir(t_path) if f.endswith('.csv.gz'))
print('files to read:', len(t_files))
# Collect the chunks in a list and concatenate once at the end: calling
# pd.concat inside the loop re-copies everything read so far on every
# iteration, and seeding it with an empty DataFrame is unnecessary.
t_chunks = []
for f in tqdm(t_files):
    t = pd.read_csv(t_path / f)
    t_chunks.append(t)
# The earlier version prepended each chunk, so the last file's rows came
# first; reverse the list to keep that row order. Fall back to an empty
# frame when the directory holds no matching files.
t_raw = pd.concat(t_chunks[::-1]) if t_chunks else pd.DataFrame()

files to read: 9


100%|█████████████████████████████████████████████| 9/9 [00:34<00:00,  3.89s/it]


In [101]:
# load the monthly ship pc02 table from the clean-data directory
p_file = data_clean_path / 'ship_pc02_monthly.csv.gz'
p_raw = pd.read_csv(p_file)

In [106]:
# left-join chlorophyll, then temperature, onto the pc02 rows using the
# shared x / y / year / month keys
merge_keys = ['x', 'y', 'year', 'month']
m = p_raw.merge(c_raw, how='left', on=merge_keys, suffixes=('_p', '_c'))
m = m.merge(t_raw, how='left', on=merge_keys, suffixes=('', '_t'))
print('Merged data shape:', m.shape)
# keep only the rows where both cleaned predictors are present
m_sub = m.dropna(subset=['chlor_a_cln', 'sst_cln'])
print('Merged data shape after dropping missing:', m_sub.shape)

Merged data shape: (155726, 29)
Merged data shape after dropping missing: (154701, 29)


In [132]:
# load the annual temperature and chlorophyll tables (in that order)
t_annual_raw, c_annual_raw = (
    pd.read_csv(data_clean_path / fname)
    for fname in ('sst_annual.csv', 'chlor_a_annual.csv')
)

In [133]:
# attach the annual temperature and chlorophyll columns by x / y / year
# (default inner join, so rows without an annual match are dropped).
# NOTE: m_sub is reassigned from itself, so re-running this cell without
# first re-running the monthly merge cell merges the annual columns again.
annual_keys = ['x', 'y', 'year']
m_sub = (
    m_sub
    .merge(t_annual_raw, on=annual_keys, suffixes=('', '_ta'))
    .merge(c_annual_raw, on=annual_keys, suffixes=('', '_ca'))
)

## Make X, y input datasets

In [134]:
# split the merged table into the response (y) and everything else (X)
response_col = 'pco2_teq'
y = m_sub.loc[:, response_col]
X = m_sub.drop(columns=[response_col])

## Cross validate each regression approach
See the template notebook [cross_val_interpolators_v2.ipynb](https://github.com/etrieschman/indigo-soil-imputation/blob/master/notebooks/cross_val_interpolators_v2.ipynb)
* Global mean
* Ocean mean
* KNN
* Linear regression
* Elastic net regression (Lasso vs. Ridge regression)
* AdaBoost regression

In [135]:
def bias(y_true, y_pred):
    """Mean signed error of the predictions.

    Subtracts ``y_true`` from ``y_pred`` element-wise and returns the
    ``.mean()`` of the result, so a positive value means the predictions
    are, on average, above the truth.
    """
    signed_error = y_pred - y_true
    return signed_error.mean()

In [136]:
# Scorers passed to cross_validate: two built-in scorer names plus the
# custom bias metric wrapped with make_scorer.
scoring = {
    'neg_mean_squared_error': 'neg_mean_squared_error',
    'bias': metrics.make_scorer(bias),
    'r_squared': 'r2',
}

In [140]:
# Predictor columns fed to the model; a list (not a tuple) is the idiomatic
# way to select several columns with .loc.
feature_cols = ['pacific', 'month', 'year', 'chlor_a_cln', 'sst_cln',
                'months_below_mean', 'max', 'std', 'max_ca', 'std_ca']
x_in = X.loc[:, feature_cols]

# 10-fold cross-validation of a decision-tree regressor. random_state is
# fixed so the scores are reproducible across kernel restarts.
# NOTE: despite its name, cv_mean holds the decision-tree results.
cv_mean = cross_validate(DecisionTreeRegressor(random_state=0), X=x_in, y=y,
    scoring=scoring, cv=10)
pd.DataFrame(cv_mean).describe()

Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_bias,test_r_squared
count,10.0,10.0,10.0,10.0,10.0
mean,1.579717,0.013918,-1174.792027,-0.728817,0.112394
std,0.038973,0.001046,1245.545706,4.685046,0.601356
min,1.53677,0.011653,-4163.264713,-12.47358,-1.057341
25%,1.550893,0.013641,-1197.613592,-0.677026,-0.211776
50%,1.569473,0.01394,-829.718957,0.47512,0.338143
75%,1.594398,0.014318,-371.074892,1.956869,0.513364
max,1.648699,0.0157,-109.308781,3.762958,0.693193


## Summarize bias, variance, MSE