# Evaluate sample designs

In [1]:
from pathlib import Path
import os
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib as matplotlib
import matplotlib.pyplot as plt
import datetime as dt
import shapely
from shapely.geometry import Point, Polygon, MultiPoint, MultiPolygon
import geopandas as gpd
import xarray as xr
import itertools
import scipy
import statsmodels.api as sm

from sklearn import cluster
from sklearn import neighbors
from sklearn.model_selection import train_test_split, ShuffleSplit, cross_validate
from sklearn import metrics, cluster

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

from sklearn.dummy import DummyRegressor
from sklearn.model_selection import ShuffleSplit, GroupKFold, cross_validate, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, lasso_path

from sklearn.preprocessing import KBinsDiscretizer, StandardScaler

import cvxpy as cp



# Simulation 1 - PCO2 unrelated to remote sensing features

In [35]:
MEAN_EFFECT = 15
SD_EFFECT = 60
sim_rs = np.random.normal(MEAN_EFFECT, SD_EFFECT, 200)
sim_rs = pd.Series(sim_rs).agg(['mean', 'std'])
sim_srs = np.random.normal(MEAN_EFFECT, SD_EFFECT, 200)
sim_srs = pd.Series(sim_srs).agg(['mean', 'std'])
sim_dop = np.random.normal(MEAN_EFFECT, SD_EFFECT, 200)
sim_dop = pd.Series(sim_dop).agg(['mean', 'std'])

In [36]:
sim = pd.concat((sim_rs, sim_srs, sim_dop), axis=1)
sim.columns = ['rs', 'srs', 'dop']
sim

Unnamed: 0,rs,srs,dop
mean,16.939501,10.634237,15.030179
std,59.51635,63.243873,64.011626


# Simulation 2 - pCO2 as a function of remote sensing features

## Read in temp and color data

In [45]:
repo_path = Path('/Users/etriesch/dev/ocean-carbon-sampling/')
data_clean_path = repo_path / 'data/clean/'
data_raw_path = repo_path / 'data/raw/'
geo_crs = 'epsg:4326'
proj_crs = '+proj=cea'

In [62]:
# read in pc02
p_raw = pd.read_csv(data_clean_path / 'ship_pc02_monthly.csv.gz')

In [63]:
# read annual data
ta_raw = pd.read_csv(data_clean_path / 'sst_annual.csv')
ca_raw = pd.read_csv(data_clean_path / 'chlor_a_annual.csv')

# drop and rename columns
ca_cln = (ca_raw
          .rename(columns={'below_mean':'below_mean_ca', 
                           'months_below_mean':'months_below_mean_ca',
                          'bimodal_chl':'bimodal_ca'})
          .drop(columns=['drop_below_mean', 'pacific', 'lat', 'lon']))
ta_cln = (ta_raw.drop(columns=['pacific']))

In [109]:
# merge on x/y values
a_merge = pd.merge(left=ca_cln, right=ta_cln, on=['x', 'y', 'year'], suffixes=('_ca', '_ta'))
ma = pd.merge(left=p_raw, right=a_merge, on=['x', 'y', 'year'], suffixes=(None, None))

In [110]:
# scalarize annual variables
scale_cols = ['std_ca', 'max_ca', 'mean_ca', 'below_mean_ca', 'months_below_mean_ca', 'bimodal_ca', 'std_ta', 'max_ta', 'mean_ta']
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
scaler.fit(ma[scale_cols])
ma_scaled = pd.DataFrame(scaler.transform(ma[scale_cols]), columns=scale_cols)
ma_scaled = pd.merge(left=ma[['lat', 'lon', 'x', 'y', 'pco2_teq']], right=ma_scaled, left_index=True, right_index=True)

In [87]:
# read data
t_raw = pd.read_csv(data_clean_path / 'sst.csv')
c_raw = pd.read_csv(data_clean_path / 'chlor_a.csv')
# c_ann_raw = pd.read_csv(data_clean_path / 'chlor_a_annual.csv')

In [88]:
# merge on x/y values
m = pd.merge(left=c_raw, right=t_raw, how='inner', on=['x', 'y'], suffixes=('_c', '_t'))

In [111]:
# scalarize overall variables
scale_cols = ['std_c', 'max_c', 'mean_c', 'below_mean', 'months_below_mean', 'bimodal_chl', 'std_t', 'max_t', 'mean_t']
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
scaler.fit(m[scale_cols])
m_scaled = pd.DataFrame(scaler.transform(m[scale_cols]), columns=scale_cols)

m_scaled = pd.merge(left=m_sub[['lat', 'lon', 'x', 'y', 'pco2_teq']], right=m_scaled, left_index=True, right_index=True)

In [112]:
# make geodataframe
geo = [Point(lon, lat) for lat, lon in zip(m_scaled.lat, m_scaled.lon)]
geo_m = gpd.GeoDataFrame(m_scaled, geometry=geo, crs=geo_crs)

## Subset to sample zones

In [115]:
# load coastlines (saved locally)
boundary_fp = data_raw_path / 'stanford-vg541kt0643-shapefile.zip'
boundary = gpd.read_file(data_raw_path / boundary_fp).to_crs(geo_crs)

In [116]:
# Monterrey desal mask
ca_cent = [-121.788649, 36.802834]
ca_lats = [33.48, 39.48]
ca_lons = [-125.48, -119.48]
# Texas desal mask
tx_cent = [-95.311296, 28.927239]
tx_lats = [25.57, 31.57]
tx_lons = [-98.21, -92.21]
# NH desal mask
nh_cent = [-70.799678, 42.563588]
nh_lats = [39.38, 45.38]
nh_lons = [-73.50, -67.50]

In [117]:
# make disks
BUFFER = 1.5
ca_disc = gpd.GeoSeries(Point(ca_cent), crs=proj_crs).buffer(BUFFER).set_crs(geo_crs, allow_override=True)
ca_disc = gpd.GeoDataFrame(geometry=ca_disc)
tx_disc = gpd.GeoSeries(Point(tx_cent), crs=proj_crs).buffer(BUFFER).set_crs(geo_crs, allow_override=True)
tx_disc = gpd.GeoDataFrame(geometry=tx_disc)
nh_disc = gpd.GeoSeries(Point(nh_cent), crs=proj_crs).buffer(BUFFER).set_crs(geo_crs, allow_override=True)
nh_disc = gpd.GeoDataFrame(geometry=nh_disc)
# cut discs at coastal boundary
ca = ca_disc.overlay(boundary, how='difference')
tx = tx_disc.overlay(boundary, how='difference')
nh = nh_disc.overlay(boundary, how='difference')

In [118]:
# make sample zones
pac_sample = geo_m.overlay(ca, how='intersection')
atl_sample = geo_m.overlay(nh, how='intersection')
gul_sample = geo_m.overlay(tx, how='intersection')

## Simulate data

In [106]:
# run regression
response = 'pco2_teq'
fit_features = ['max_ta', 'std_ca', 'months_below_mean_ca', 'below_mean_ca']
X = sm.add_constant(geo_m[features], prepend=False)
lm = sm.OLS(endog=geo_m[response], exog=X, hasconst=True)
res = lm.fit()
res.params['const'] = MEAN_EFFECT
betas = res.params

In [145]:
# get population value
ocean = gul_sample
pred_features = ['max_t', 'std_c', 'months_below_mean', 'below_mean']
X_pred = sm.add_constant(ocean[pred_features], prepend=False)
yhat = X_pred @ betas.values + np.random.normal(0, SD_EFFECT, X_pred.shape[0])

In [146]:
sim_true = pd.Series((yhat.mean(), yhat.std()))
sim_true

0    20.605812
1    60.921297
dtype: float64

# Simulation 3 - pCO2 as a function of remote sensing features and other