"I would just set up a loop around your original process and use indexing to eliminate one year from the data set at a time, find the coefficients based on the rest of the data set, apply the results to predict the signal in that year you eliminated from the training, store the predicted values, then move to the next year and repeat the process. Each year will have its own model, but the same algorithm, so in the end the statistics you get for skill will relate to how the algorithm Works in general. Each set of forecasts made for each year held back is an independent blind set of forecasts because it was unaware of what actually happened that year." 


... So! This, I guess, is my attempt at doing just that. 

In [2]:
import numpy as np
import math
import xarray as xr 
import pickle 
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_validate

import scipy.stats

In [3]:
##this is a wrapper function I got from 
#https://stackoverflow.com/questions/41045752/using-statsmodel-estimations-with-scikit-learn-cross-validation-is-it-possible
##the purpose of including this is to cross validate models 

from sklearn.base import BaseEstimator, RegressorMixin

class SMWrapper(BaseEstimator, RegressorMixin):
    """ A universal sklearn-style wrapper for statsmodels regressors """
    def __init__(self, model_class, fit_intercept=True):
        self.model_class = model_class
        self.fit_intercept = fit_intercept
    def fit(self, X, y):
        if self.fit_intercept:
            X = sm.add_constant(X)
        self.model_ = self.model_class(y, X)
        self.results_ = self.model_.fit()
        return self
    def predict(self, X):
        if self.fit_intercept:
            X = sm.add_constant(X)
        return self.results_.predict(X)

In [4]:
infile = open("./1979ERAdata/gph_97.p", 'rb')
gph = pickle.load(infile)
infile.close()

infile = open("./1979ERAdata/gph_time_97.p", 'rb')
gph_time = pickle.load(infile)
infile.close()

infile = open("./1979ERAdata/temp850_97.p", 'rb')
temp = pickle.load(infile)
infile.close()

infile = open("./1979ERAdata/temp850_time_97.p", 'rb')
temp_time = pickle.load(infile)
infile.close()

infile = open("./1979ERAdata/gph_time_97.p", 'rb')
gph_time = pickle.load(infile)
infile.close()

infile = open("./1979ERAdata/gph_time_97.p", 'rb')
gph_time = pickle.load(infile)
infile.close()

infile = open("./1979ERAdata/gph_lat_97.p", 'rb')
gph_lat = pickle.load(infile)
infile.close()

infile = open("./1979ERAdata/gph_lon_97.p", 'rb')
gph_lon = pickle.load(infile)
infile.close()

In [4]:
infile = open("../Clusters/Fall2023Clustering/UW_lat.p", 'rb')
unweighted_lat = pickle.load(infile)
infile.close()

infile = open("../Clusters/Fall2023Clustering/UW_lon.p", 'rb') 
unweighted_lon = pickle.load(infile)
infile.close()

In [5]:
gph = xr.DataArray(data= gph, 
                       dims = ["time","lat","lon"],
                       coords = dict(
                           time = gph_time,
                           lat = unweighted_lat,
                           lon = unweighted_lon)
                      )
gph_D = gph.groupby('time.date').mean()

gph

In [6]:
#calculate daily mean values
daily_mean_gph = gph.groupby('time.dayofyear').mean()
#test gph anomaly
daily_anom_gph = gph.groupby('time.dayofyear') - daily_mean_gph

##straight gph cap values
g_cap = gph.sel(lat = slice(60,90))

#anomaly cap values
cap_mean_gph = g_cap.groupby('time.dayofyear').mean()
#test gph anomaly
cap_anom_gph = g_cap.groupby('time.dayofyear') - cap_mean_gph
cap_anom_gph

In [7]:
temp = xr.DataArray(data= temp, 
                       dims = ["time","lat","lon"],
                       coords = dict(
                           time = temp_time,
                           lat = unweighted_lat,
                           lon = unweighted_lon)
                      )
temp_D = temp.groupby('time.date').mean()

temp

In [8]:
#calculate daily mean values
daily_mean_temp = temp.groupby('time.dayofyear').mean()
#test gph anomaly
daily_anom_temp = temp.groupby('time.dayofyear') - daily_mean_temp

##straight gph cap values
t_cap = temp.sel(lat = slice(60,90))
##great lakes
t_gl = temp.sel(lat = slice(40,50), lon = slice(270,290))

#anomaly cap values
cap_mean_t= t_cap.groupby('time.dayofyear').mean()
#test gph anomaly
cap_anom_t = t_cap.groupby('time.dayofyear') - cap_mean_t

#anomaly cap values
gl_mean_t= t_gl.groupby('time.dayofyear').mean()
#test gph anomaly
gl_anom_t = t_gl.groupby('time.dayofyear') - gl_mean_t
gl_anom_t

In [9]:
infile = open("../New_EllipseVals/ephi10_79.p", 'rb')
ephi10 = pickle.load(infile)
infile.close()

infile = open("../New_EllipseVals/ephi_ratio10_79.p", 'rb')
rat10 = pickle.load(infile)
infile.close()

infile = open("../New_EllipseVals/ephi_size10_79.p", 'rb')
size10 = pickle.load(infile)
infile.close()

infile = open("../New_EllipseVals/ephi_cenlat10_79.p", 'rb')
cenlat10 = pickle.load(infile)
infile.close()

infile = open("../New_EllipseVals/ephi_cenlon10_79.p", 'rb')
cenlon10 = pickle.load(infile)
infile.close()

infile = open("../New_EllipseVals/ephi_wind10_79.p", 'rb')
wind10 = pickle.load(infile)
infile.close()

In [10]:
## reduce temperature 
tem = np.nanmean(gl_anom_t.values, axis = 1)
tem = np.nanmean(tem, axis = 1)

##reduce gph
gp = np.nanmean(cap_anom_gph.values, axis = 1)
gp = np.nanmean(gp, axis = 1)

In [11]:
tmp = tem.reshape((40,608))
gp = gp.reshape((40,608))

### Okay ... Done with standard data imports. Next few cells are related to reshaping the SSW data cells.


In [12]:
##SSW Averaging Timelines
ssw_wind = np.empty((21,141))
ssw_wind[:] = np.nan

ssw_rat = np.empty((21,141))
ssw_rat[:] = np.nan

ssw_size = np.empty((21,141))
ssw_size[:] = np.nan

ssw_cenlt = np.empty((21,141))
ssw_cenlt[:] = np.nan

ssw_cenln = np.empty((21,141))
ssw_cenln[:] = np.nan

ssw_ep = np.empty((21,141))
ssw_ep[:] = np.nan

ssw_t = np.empty((21,141))
ssw_t[:] = np.nan

ssw_g = np.empty((21,141))
ssw_g[:] = np.nan


year2 = [x for x in range(0,21,1)]
#sswcl3 = [0,1,2,4,5,7,8,9,19,19,21,22,23,24,26,27,28,29,30,33,38,39]
sswcl2 = [0,2,4,5,7,8,9,19,19,21,22,23,24,26,27,28,29,30,33,38,39]
#dates3 = [(423,564),(440,581),(76,217),(404,545),(188,329),(276,417),(92,233),(392,533),(120,261),(412,553),(385,526),(176,317),(256,397),(208,349),(268,409),(404,545),(396,537),(280,421),(344,485),(212,353),(356,497),(192,333)]
dates2 = [(423,564),(76,217),(404,545),(188,329),(276,417),(92,233),(392,533),(120,261),(412,553),(385,526),(176,317),(256,397),(208,349),(268,409),(404,545),(396,537),(280,421),(344,485),(212,353),(356,497),(192,333)]

for i in range(len(year2)):
    print(i)
    d1 = int(dates2[i][0])
    d2 = int(dates2[i][1])
    
    ##diagnostics
    
    w = wind10[sswcl2[i],d1:d2] #wind
    ssw_wind[i] = w[:]
    
    rat = rat10[sswcl2[i],d1:d2]
    ssw_rat[i] = rat[:]
    
    ct = cenlat10[sswcl2[i],d1:d2]
    ssw_cenlt[i] = ct[:]
    
    cn = cenlon10[sswcl2[i],d1:d2]
    ssw_cenln[i] = cn[:]
    
    sz = size10[sswcl2[i],d1:d2]
    ssw_size[i] = sz[:]
    
    ep = ephi10[sswcl2[i],d1:d2]
    ssw_ep[i] = ep[:]
    
    tm = tmp[sswcl2[i],d1+40:d2+40]
    ssw_t[i] = tm[:]
    
    g = gp[sswcl2[i],d1:d2]
    ssw_g[i] = g[:]

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20


In [13]:
wind = np.reshape(ssw_wind, (2961))
rat = np.reshape(ssw_rat, (2961))
size = np.reshape(ssw_size, (2961))
cenlt = np.reshape(ssw_cenlt, (2961))
cenln = np.reshape(ssw_cenln, (2961))
ephi = np.reshape(ssw_ep, (2961))
t_gr = np.reshape(ssw_t, (2961))
ga = np.reshape(ssw_g, (2961))

#### Alright ... so I have both the 2D and 1D modes? In full. I can try to loop through the years for a full model in the first and then just wind in the second?

In [14]:
import pandas as pd

data = {'wind': np.ndarray.tolist(wind),
        'ephi': np.ndarray.tolist(ephi),
        'rat': np.ndarray.tolist(rat),
        'size': np.ndarray.tolist(size),
        'cenlat': np.ndarray.tolist(cenlt),
        'cenlon': np.ndarray.tolist(cenln),
        'temp': np.ndarray.tolist(t_gr),
        'gph': np.ndarray.tolist(ga)
        }
#data 
df = pd.DataFrame(data)
df = df.dropna()
#df

In [15]:
df_norm = (df - df.mean())
df_norm

Unnamed: 0,wind,ephi,rat,size,cenlat,cenlon,temp,gph
0,19.063270,36.756502,1.028187,1.799900e+07,20.243832,-1.212374,-0.650433,-877.418181
1,19.274396,36.714047,1.122493,1.873192e+07,20.033953,-1.983156,-0.505426,-852.381651
2,19.424054,36.529468,1.136784,1.827774e+07,20.004745,-4.911689,-3.083539,-814.243590
3,19.605842,36.147582,1.146798,1.753461e+07,20.594206,-10.198843,-4.734416,-798.659132
4,19.955375,36.209820,1.210954,1.730227e+07,20.589557,-6.273338,-6.105084,-775.780089
...,...,...,...,...,...,...,...,...
2956,-9.300428,-37.041005,0.559747,-7.810315e+06,2.503620,-11.163396,-12.923319,209.639574
2957,-9.378691,-35.600581,0.619454,-7.611807e+06,2.550111,-13.138527,-11.260098,201.292955
2958,-8.505407,-34.810745,0.627233,-7.905518e+06,2.764038,-12.276707,-8.857008,195.907396
2959,-9.427237,-32.198373,0.603631,-7.676142e+06,3.040407,-11.022220,-6.042963,175.529344


In [16]:
x = df_norm[['ephi','rat','cenlat','cenlon','size','wind','gph']]
y = df_norm['temp']

In [17]:
from statsmodels.stats.outliers_influence import variance_inflation_factor 

#VIF dataframe 
vif_data = pd.DataFrame() 
vif_data["feature"] = x.columns 
  
# calculating VIF for each feature 
vif_data["VIF"] = [variance_inflation_factor(x.values, i) 
                          for i in range(len(x.columns))] 
  
print(vif_data)

  feature       VIF
0    ephi  1.182062
1     rat  2.026139
2  cenlat  3.204290
3  cenlon  1.160654
4    size  3.624683
5    wind  8.354392
6     gph  5.586190


#### Remove size and gph. import statsmodels.api as sm

In [18]:
import statsmodels.api as sm

In [19]:
x = df_norm[['ephi','rat','cenlat','cenlon','wind']]
y = df_norm['temp']

x2 = df_norm[['wind']]

In [66]:
##first do the full model
x1_temps = np.empty((21,2776))
x1_temps[:] = np.nan

for i in range(0,21,1):
    test_wind = ssw_wind[i,:] 
    test_ephi = ssw_ep[i,:] 
    test_rat = ssw_rat[i,:] 
    test_cenlt = ssw_cenlt[i,:] 
    test_cenln = ssw_cenln[i,:] 
    test_temp = ssw_t[i,:] 
    
    test_data = {'wind': np.ndarray.tolist(test_wind),
                'ephi': np.ndarray.tolist(test_ephi),
                'rat': np.ndarray.tolist(test_rat),
                'cenlat': np.ndarray.tolist(test_cenlt),
                'cenlon': np.ndarray.tolist(test_cenln),
                'temp': np.ndarray.tolist(test_temp)
                }
    df_test = pd.DataFrame(data)
    df_test = df.dropna()
    

    xx = df_test[['ephi','rat','cenlat','cenlon','wind']]
    yy = df_test['temp']
    
    train_wind = np.reshape(np.delete(ssw_wind,i,0),(2820,)) 
    train_ephi = np.reshape(np.delete(ssw_ep,i,0),(2820,)) 
    train_rat = np.reshape(np.delete(ssw_rat,i,0),(2820,))
    train_cenlt = np.reshape(np.delete(ssw_cenlt,i,0),(2820,)) 
    train_cenln = np.reshape(np.delete(ssw_cenln,i,0),(2820,))
    train_temp = np.reshape(np.delete(ssw_t,i,0),(2820,))
    
    train_data = {'wind': np.ndarray.tolist(train_wind),
                'ephi': np.ndarray.tolist(train_ephi),
                'rat': np.ndarray.tolist(train_rat),
                'cenlat': np.ndarray.tolist(train_cenlt),
                'cenlon': np.ndarray.tolist(train_cenln),
                'temp': np.ndarray.tolist(train_temp)
                }
    #data 
    df_train = pd.DataFrame(data)
    df_train = df.dropna()

    x1 = df_train[['ephi','rat','cenlat','cenlon','wind']]
    y1 = df_train['temp']
    

    # with statsmodels
    x1 = sm.add_constant(x1)
    xx = sm.add_constant(xx)# adding a constant
 
    model = sm.GLM(y1, x1).fit()
    prediction = model.predict(xx)
    pred = prediction.reset_index(drop=True)
    x1_temps[i,:] = pred.values[:]
    #print_model = model.summary()

In [67]:
##first do the full model
x2_temps = np.empty((21,2776))
x2_temps[:] = np.nan

for i in range(0,21,1):
    test_wind = ssw_wind[i,:]  
    test_temp = ssw_t[i,:] 
    
    test_data = {'wind': np.ndarray.tolist(test_wind),
                'temp': np.ndarray.tolist(test_temp)
                }
    df_test = pd.DataFrame(data)
    df_test = df.dropna()
    

    xx = df_test[['wind']]
    yy = df_test['temp']
    
    train_wind = np.reshape(np.delete(ssw_wind,i,0),(2820,)) 
    train_temp = np.reshape(np.delete(ssw_t,i,0),(2820,))
    
    train_data = {'wind': np.ndarray.tolist(train_wind),
                'temp': np.ndarray.tolist(train_temp)
                }
    #data 
    df_train = pd.DataFrame(data)
    df_train = df.dropna()
    

    x1 = df_train[['wind']]
    y1 = df_train['temp']
    

    # with statsmodels
    x1 = sm.add_constant(x1)
    xx = sm.add_constant(xx)# adding a constant
 
    model = sm.GLM(y1, x1).fit()
    prediction = model.predict(xx)
    pred = prediction.reset_index(drop=True)
    x2_temps[i,:] = pred.values[:]
    #print_model = model.summary()

In [68]:
y = df['temp']

In [69]:
##calculate R2 for modified model
r2 = []
for i in range(0,21,1):
    rss= []
    tss= []
    #print(i)
    for j in range(0,2776,1):
        #print(j)
        #print(x1_temps[i,j])
        rs= (y.values[j]-x1_temps[i,j])**2
        ts= (y.values[j]-np.mean(y.values[:]))**2
        rss.append(rs)
        tss.append(ts)
    r = 1 - (np.sum(rs)/np.sum(ts))
    r2.append(r)
        

In [70]:
##calculate R2 for Wind-Only Model
r2_wind = []
for i in range(0,21,1):
    rss= []
    tss= []
    #print(i)
    for j in range(0,2776,1):
        #print(j)
        #print(x1_temps[i,j])
        rs= (y.values[j]-x2_temps[i,j])**2
        ts= (y.values[j]-np.mean(y.values[:]))**2
        rss.append(rs)
        tss.append(ts)
    r = 1 - (np.sum(rs)/np.sum(ts))
    r2_wind.append(r)
        

In [71]:
print("The R2 of my modified model with cross validation is ", np.mean(r2))
print("The R2 of my wind-only model with cross validation is ", np.mean(r2_wind))

The R2 of my modified model with cross validation is  0.1890976987535767
The R2 of my wind-only model with cross validation is  -0.06941630080722927


In [72]:
##maybe let's try calculating mean absolute percentage error?
mape = []
mape_wind = []
for i in range(0,21,1):
    ind1 = []
    ind2 = []
    #print(i)
    for j in range(0,2776,1):
        #print(j)
        #print(x1_temps[i,j])
        inside1 = (y.values[j]-x1_temps[i,j])/y.values[j]
        ind1.append(inside1)
        inside2 = (y.values[j]-x2_temps[i,j])/y.values[j]
        ind2.append(inside2)
    m1 = (1/2776) * (np.sum(ind1))
    m2 = (1/2776) * (np.sum(ind2))
    mape.append(m1)
    mape_wind.append(m2)

In [73]:
ma = np.mean(mape)*100
ma

111.72995585295702

In [74]:
ma = np.mean(mape_wind)*100
ma

103.7427881688896