# Covalidation and predictability for selected models
# plots for manuscript

In [None]:
import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stat
from scipy.stats import pearsonr
from sklearn.preprocessing import PolynomialFeatures
from statsmodels.sandbox.regression.predstd import wls_prediction_std
import statsmodels.api as sm
    
import warnings
warnings.simplefilter('ignore') #filter some warning messages

In [None]:
# data
# Training window (inclusive): models are fit on years iy..fy only.
iy = 1991
fy = 2013

# --- kelp ---------------------------------------------------------------
kelp = pd.read_csv('../data/BullKelp_summer_July2021.csv', index_col=0)
# Set the 2014-2020 values aside before the table is trimmed to the
# training window; they are the observations the predictions are scored on.
nkplus = kelp.loc[2014:2020, 'North Kelp']
skplus = kelp.loc[2014:2020, 'South Kelp']
kelp = kelp[(kelp.index >= iy) & (kelp.index <= fy)]

# --- environmental predictors -------------------------------------------
env = pd.read_csv('../data/environmentaldata_seasonal_July2021.csv', index_col=0)
env_month = pd.DatetimeIndex(env.index).month

# winter: rows whose index date falls in month 1
wenv = env[env_month == 1]
winter_year = pd.DatetimeIndex(wenv.index).year
winter_from_2014 = winter_year >= 2014
# winter predictors from 2014 onward, used as inputs for the predictions
wenvplus_N14 = wenv.loc[winter_from_2014, 'SSTN14']
wenvplus_sMOCI = wenv.loc[winter_from_2014, 'SoCal MOCI']
wenvplus_nMOCI = wenv.loc[winter_from_2014, 'NorCal MOCI']
wenvplus_cMOCI = wenv.loc[winter_from_2014, 'CenCal MOCI']
wenv = wenv[(winter_year >= iy) & (winter_year <= fy)]

# spring: rows whose index date falls in month 4
senv = env[env_month == 4]
spring_year = pd.DatetimeIndex(senv.index).year
senvplus_BEUTI37N = senv.loc[spring_year >= 2014, 'BEUTI 37N']
senv = senv[(spring_year >= iy) & (spring_year <= fy)]

seas = ['Winter', 'Spring', 'Summer']

***
# Northern California
## winter all MOCIs
## winter SSTN14

In [None]:
# covalidation (leave-one-out) and predictive-model helpers, used for both the North and South kelp models
def linreg_fcov(x, y, yr, labx, laby):
    """Leave-one-out validation of the model ``log(y) = const + slope * x``.

    Rows whose log-kelp value is NaN are discarded. Each remaining row is
    then held out in turn, an OLS model is fit to the other rows, and the
    held-out value is predicted from that fit. Nothing is returned; the
    function prints the mean R^2 of the fits and the mean and standard
    deviation of the absolute percentage error of the held-out predictions
    (computed on the log scale).

    Parameters
    ----------
    x : array-like
        Predictor values, one per entry of ``yr``.
    y : array-like
        Kelp values, one per entry of ``yr``; the natural log is taken
        before fitting.
    yr : array-like
        Row labels (years) for ``x`` and ``y``.
    labx : str
        Column name given to the predictor.
    laby : str
        Name given to the dependent variable passed to OLS.
    """
    data = pd.DataFrame(data=x, index=yr, columns=[labx])
    data['Kelp'] = np.log(np.asarray(y))
    data = data[~np.isnan(data['Kelp'])]
    data = sm.add_constant(data)
    exog_cols = [col for col in data.columns if col != 'Kelp']

    n_obs = len(data)
    r2 = np.full(n_obs, np.nan)
    pred = np.full(n_obs, np.nan)

    for i, held_out in enumerate(data.index):
        # fit on every row except the held-out one
        train = data[data.index != held_out]
        fit = sm.OLS(train['Kelp'].rename(laby), train[exog_cols]).fit()

        # explained variance of this fit
        r2[i] = fit.rsquared

        # params is label-indexed ('const', labx); use .iloc for positional
        # access because plain integer keys on such a Series are deprecated
        intercept = fit.params.iloc[0]
        slope = fit.params.iloc[1]
        predicted = intercept + slope * data.at[held_out, labx]
        observed = data.at[held_out, 'Kelp']
        # divide by the magnitude: log(kelp) is negative for kelp < 1, and a
        # signed denominator would produce negative "absolute" errors
        pred[i] = np.abs(predicted - observed) * 100 / np.abs(observed)

    print('mean r2 = ', np.round(np.mean(r2), 2))
    if len(yr):
        period = '{}-{}'.format(np.min(yr), np.max(yr))
    else:
        period = 'no data'
    print('prediction for ' + period)
    print('mean pred error % = ', np.round(np.mean(pred), 2))
    print('std pred error % = ', np.round(np.std(pred), 2))
    
def linreg_simp(x, y, yr, nxs, labx, reg):
    """Fit ``log(y) = const + slope * x``, plot the fit, and predict new years.

    The model is fit by OLS on the rows whose log-kelp value is not NaN and
    its summary is printed. Two figures are drawn, saved and shown:

    * ``../figures/<reg>_<labx>_1.png`` -- scatter of ``y`` against ``x``
      coloured by year, the fitted curve, and the interval bounds returned
      by ``wls_prediction_std`` (at its default settings).
    * ``../figures/<reg>_<labx>_2.png`` -- time series of modeled,
      predicted and measured kelp.

    The function reads two module-level names: ``laby`` (kelp column name,
    used for axis labels and to select the measured series) and ``iy``
    (first year of the measured series to plot). Both must be defined
    before the call.

    Parameters
    ----------
    x : array-like
        Predictor values for the training years, one per entry of ``yr``.
    y : array-like
        Kelp values for the training years; the natural log is fitted.
    yr : array-like
        Training years (integer labels).
    nxs : array-like
        Predictor values for the years to predict; they are plotted against
        consecutive years starting at 2014.
    labx : str
        Predictor name, used for the x-axis label and the file names.
    reg : str
        Region tag used as the file-name prefix.

    Returns
    -------
    numpy.ndarray
        Predicted log-kelp values, one per entry of ``nxs``.
    """
    ocean = pd.DataFrame(data=x, index=yr, columns=[labx])
    ocean['Kelp'] = np.log(y)
    ocean = ocean[~np.isnan(ocean['Kelp'])]
    klp = pd.DataFrame(data=ocean['Kelp'].values, index=ocean.index, columns=[laby])
    ocean = ocean.drop(columns=['Kelp'])
    ocean = sm.add_constant(ocean)

    mods = sm.OLS(klp, ocean).fit()
    # params is label-indexed ('const', labx); use .iloc for positional
    # access because plain integer keys on such a Series are deprecated
    intercept = mods.params.iloc[0]
    slope = mods.params.iloc[1]
    prdv = intercept + slope * np.asarray(nxs, dtype=float)

    # dense grid over the observed predictor range, with a leading column
    # of ones so it matches the [const, x] layout of the fitted design
    nx = np.arange(ocean[labx].min(), ocean[labx].max()+0.01, 0.01).reshape(-1,1)
    xp = PolynomialFeatures(1).fit_transform(nx)
    lab = 'R$^2$='+str(np.round(mods.rsquared,2))
    print(mods.summary())

    # interval bounds for the fitted points; wls_prediction_std returns
    # (std, lower, upper) in that order
    _, lower, upper = wls_prediction_std(mods)

    # scatter plot
    plt.figure(dpi=150)
    plt.scatter(x,y,60,c=yr, edgecolor='grey', cmap='plasma') # original data
    plt.colorbar(ticks=range(ocean.index[0],ocean.index[-1]+1,2), label='Year')
    plt.plot(nx,np.exp(mods.predict(xp)),'b-', label=lab) # fitted curve
    # the bounds are per observation, so sort by predictor before drawing
    sorx = np.argsort(ocean[labx].values)
    xs_sorted = ocean[labx].values[sorx]
    plt.plot(xs_sorted,np.exp(upper.values[sorx]),':',c='r')
    plt.plot(xs_sorted,np.exp(lower.values[sorx]),':',c='r')
    plt.ylabel(laby+' (km$^2$)',fontsize=14)
    plt.yscale('log')
    plt.xlabel(labx,fontsize=14)
    plt.grid(True, alpha=0.3)
    plt.legend(loc=0,fontsize=14)
    plt.savefig('../figures/'+reg+'_'+labx+'_1.png')
    plt.show()

    # time series: modeled (training years), predicted (2014 onward), measured
    pred_years = np.arange(2014, 2014 + len(prdv))
    plt.figure(dpi=150)
    plt.plot(ocean.index,np.exp(mods.predict(ocean)),'o-',c='k', label='Modeled', alpha=0.9)
    plt.plot(pred_years,np.exp(prdv),'*-', c='grey', alpha=0.8, label='Predicted')
    kelp2 = pd.read_csv('../data/BullKelp_summer_July2021.csv',index_col=0)
    kelp2 = kelp2[kelp2.index>=iy]
    # NOTE(review): the measured series is drawn with a +1 offset, while the
    # model above is fit to log(y) without one -- confirm this is intended.
    plt.plot(kelp2[laby]+1,'d-', c='tab:red', label="Measured")
    plt.yscale('log')
    # label with the kelp series actually plotted, not a fixed region name
    plt.ylabel(laby+' (km$^2$)')
    plt.xlabel('Year')
    plt.legend(loc=3, fontsize='small')
    plt.grid(True, alpha=0.3)
    plt.savefig('../figures/'+reg+'_'+labx+'_2.png')
    plt.show()

    return prdv

In [None]:
# Northern Kelp
#
# For each candidate predictor:
#   1. error in predictability (covalidation) over the training years
#   2. fit the predictive model and predict 2014-2020

# The lists below are parallel, one entry per model, in this order:
# winter NorCal MOCI, winter CenCal MOCI, winter SoCal MOCI, winter SSTN14
labxs = ['winter_NorCalMOCI','winter_CenCalMOCI','winter_SoCalMOCI','winter_SSTN14']
laby = 'North Kelp'
wenvs = ['NorCal MOCI','CenCal MOCI','SoCal MOCI','SSTN14']
# predictor values from 2014 onward that feed each model's predictions
future_predictors = [wenvplus_nMOCI, wenvplus_cMOCI, wenvplus_sMOCI, wenvplus_N14]

# observed values the predictions are compared against, on the log(kelp + 1) scale
observed_log = np.log(nkplus.values + 1)

for labx, env_column, future in zip(labxs, wenvs, future_predictors):
    print('\n\n',labx, laby)
    x = wenv[env_column].values
    y = kelp[laby].values
    yr = kelp.index.values
    linreg_fcov(x, y, yr,labx,laby)
    print('\nPredictive model')
    ppr = linreg_simp(x, y, yr, future.values, labx,'NKelp')
    # absolute percentage error of each predicted year
    for i, predicted_log in enumerate(ppr):
        pper = np.round(np.abs(predicted_log - observed_log[i])*100/observed_log[i],1)
        print('\nPrediction error for '+str(2014+i)+': '+str(pper))



***
# Southern California Kelp
## winter CenCal MOCI, NorCal MOCI
## winter SSTN14, spring BEUTI 37N

In [None]:
# Southern Kelp
#
# For each candidate predictor:
#   1. error in predictability (covalidation) over the training years
#   2. fit the predictive model and predict 2014-2020

# The lists below are parallel, one entry per model, in this order:
# winter NorCal MOCI, winter CenCal MOCI, winter SSTN14, spring BEUTI 37N
labxs = ['winter_NorCalMOCI','winter_CenCalMOCI','winter_SSTN14','spring_BEUTI37N']
laby = 'South Kelp'
wenvs = ['NorCal MOCI','CenCal MOCI','SSTN14','BEUTI 37N']
# table holding the training-period values of each predictor
# (winter table for the first three models, spring table for BEUTI)
train_tables = [wenv, wenv, wenv, senv]
# predictor values from 2014 onward that feed each model's predictions
future_predictors = [wenvplus_nMOCI, wenvplus_cMOCI, wenvplus_N14, senvplus_BEUTI37N]

# observed values the predictions are compared against, on the log(kelp + 1) scale
observed_log = np.log(skplus.values + 1)

for labx, env_column, table, future in zip(labxs, wenvs, train_tables, future_predictors):
    print('\n\n',labx, laby)
    print(env_column)
    x = table[env_column].values
    y = kelp[laby].values
    yr = kelp.index.values
    # Print the cross-validation statistics first, so that the
    # 'Predictive model' header sits directly above the regression
    # summary it labels.
    linreg_fcov(x, y, yr,labx,laby)
    print('\nPredictive model')
    ppr = linreg_simp(x, y, yr, future.values, labx,'SKelp')
    # absolute percentage error of each predicted year
    for i, predicted_log in enumerate(ppr):
        pper = np.round(np.abs(predicted_log - observed_log[i])*100/observed_log[i],1)
        print('\nPrediction error for '+str(2014+i)+': '+str(pper))

