In [1]:
##general import statements
import numpy as np
import math
import xarray as xr 
import pickle 
import matplotlib.pyplot as plt
import scipy.stats

#import statements for GAM
import statsmodels.api as sm
from statsmodels.gam.api import GLMGam, BSplines

In [2]:
## Scikit-learn style wrapper class for statsmodels estimators, adapted from
#https://stackoverflow.com/questions/41045752/using-statsmodel-estimations-with-scikit-learn-cross-validation-is-it-possible
## It is included so that statsmodels models can be cross-validated with scikit-learn tools.

from sklearn.base import BaseEstimator, RegressorMixin

class SMWrapper(BaseEstimator, RegressorMixin):
    """Scikit-learn style adapter around a statsmodels regression model class.

    Parameters
    ----------
    model_class : callable
        Called as ``model_class(y, X)``; the returned object must provide
        ``fit()``, whose result must provide ``predict(X)``.
    fit_intercept : bool, default True
        When True, a constant column is added to ``X`` with
        ``sm.add_constant`` before fitting and before predicting.

    Attributes
    ----------
    model_ : object
        The instantiated model, set by ``fit``.
    results_ : object
        The value returned by ``model_.fit()``, set by ``fit``.
    const_added_ : bool
        Whether ``fit`` actually added a constant column to the design matrix.
    """

    def __init__(self, model_class, fit_intercept=True):
        self.model_class = model_class
        self.fit_intercept = fit_intercept

    def fit(self, X, y):
        """Fit ``model_class(y, X)`` and return ``self``."""
        self.const_added_ = False
        if self.fit_intercept:
            n_features_in = 1 if np.ndim(X) == 1 else np.shape(X)[1]
            X = sm.add_constant(X)
            # add_constant silently skips the column when X already holds a
            # constant; remember what happened so predict can mirror it.
            self.const_added_ = np.shape(X)[1] > n_features_in
        self.model_ = self.model_class(y, X)
        self.results_ = self.model_.fit()
        return self

    def predict(self, X):
        """Return predictions of the fitted results object for ``X``."""
        if self.fit_intercept and getattr(self, "const_added_", True):
            # Force the constant column. The default has_constant='skip' omits
            # it whenever some feature is constant within this batch (always
            # the case for a single row), which would leave the design matrix
            # one column short of the one used in fit.
            X = sm.add_constant(X, has_constant='add')
        return self.results_.predict(X)

Next, load the pickled data files used in this analysis.

In [3]:
# Load the pickled input arrays. Each file is opened in a `with` block so the
# handle is closed even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open("./1979ERAdata/gph_97.p", 'rb') as infile:
    gph = pickle.load(infile)

# gph_time only needs to be read once.
with open("./1979ERAdata/gph_time_97.p", 'rb') as infile:
    gph_time = pickle.load(infile)

with open("./1979ERAdata/temp850_97.p", 'rb') as infile:
    temp = pickle.load(infile)

with open("./1979ERAdata/temp850_time_97.p", 'rb') as infile:
    temp_time = pickle.load(infile)

with open("./1979ERAdata/gph_lat_97.p", 'rb') as infile:
    gph_lat = pickle.load(infile)

with open("./1979ERAdata/gph_lon_97.p", 'rb') as infile:
    gph_lon = pickle.load(infile)

with open("../EOFs/ehf100_latavg.p", 'rb') as infile:
    ehf = pickle.load(infile)

In [4]:
# Load the latitude / longitude vectors used as grid coordinates below.
# `with` guarantees the file handle is closed even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open("../Clusters/Fall2023Clustering/UW_lat.p", 'rb') as infile:
    unweighted_lat = pickle.load(infile)

with open("../Clusters/Fall2023Clustering/UW_lon.p", 'rb') as infile:
    unweighted_lon = pickle.load(infile)

### Begin modifying data files. 

I will use polar-cap (60–90°N) geopotential height (gph) anomalies and temperature anomalies over the Great Lakes region (an area of large significance).

In [8]:
# Attach (time, lat, lon) labels to the raw gph array.
# NOTE(review): the coordinates come from unweighted_lat / unweighted_lon, not
# from the gph_lat / gph_lon loaded earlier -- confirm the two grids match.
gph_coords = {"time": gph_time, "lat": unweighted_lat, "lon": unweighted_lon}
gph = xr.DataArray(gph, coords=gph_coords, dims=("time", "lat", "lon"))

# Mean over all samples that share the same calendar date.
gph_D = gph.groupby("time.date").mean()

gph

In [9]:
# Day-of-year mean of gph on the full grid, and departures from it.
gph_by_doy = gph.groupby("time.dayofyear")
daily_mean_gph = gph_by_doy.mean()
daily_anom_gph = gph_by_doy - daily_mean_gph

# Raw gph restricted to the latitude slice 60..90 (the "cap").
g_cap = gph.sel(lat=slice(60, 90))

# Same day-of-year mean / anomaly computation on the cap subset only.
cap_by_doy = g_cap.groupby("time.dayofyear")
cap_mean_gph = cap_by_doy.mean()
cap_anom_gph = cap_by_doy - cap_mean_gph

In [10]:
# Attach (time, lat, lon) labels to the raw temperature array.
temp_coords = {"time": temp_time, "lat": unweighted_lat, "lon": unweighted_lon}
temp = xr.DataArray(temp, coords=temp_coords, dims=("time", "lat", "lon"))

# Mean over all samples that share the same calendar date.
temp_D = temp.groupby("time.date").mean()

In [11]:
# Day-of-year mean of temperature on the full grid, and departures from it.
temp_by_doy = temp.groupby("time.dayofyear")
daily_mean_temp = temp_by_doy.mean()
daily_anom_temp = temp_by_doy - daily_mean_temp

# Regional subsets of the raw temperature field.
t_cap = temp.sel(lat=slice(60, 90))                       # latitude slice 60..90 (cap)
t_gl = temp.sel(lat=slice(40, 50), lon=slice(270, 290))   # Great Lakes box

# Cap: day-of-year mean and temperature anomaly.
t_cap_by_doy = t_cap.groupby("time.dayofyear")
cap_mean_t = t_cap_by_doy.mean()
cap_anom_t = t_cap_by_doy - cap_mean_t

# Great Lakes: day-of-year mean and temperature anomaly.
t_gl_by_doy = t_gl.groupby("time.dayofyear")
gl_mean_t = t_gl_by_doy.mean()
gl_anom_t = t_gl_by_doy - gl_mean_t

### Import and re-shape elliptical values. 

In [17]:
# Load the pickled ellipse diagnostics. Each file is opened in a `with` block
# so the handle is closed even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open("../New_EllipseVals/ephi10_79.p", 'rb') as infile:
    ephi10 = pickle.load(infile)

with open("../New_EllipseVals/ephi_ratio10_79.p", 'rb') as infile:
    rat10 = pickle.load(infile)

with open("../New_EllipseVals/ephi_size10_79.p", 'rb') as infile:
    size10 = pickle.load(infile)

with open("../New_EllipseVals/ephi_cenlat10_79.p", 'rb') as infile:
    cenlat10 = pickle.load(infile)

with open("../New_EllipseVals/ephi_cenlon10_79.p", 'rb') as infile:
    cenlon10 = pickle.load(infile)

with open("../New_EllipseVals/ephi_wind10_79.p", 'rb') as infile:
    wind10 = pickle.load(infile)

In [18]:
# Collapse each anomaly field to one value per time step by taking a plain
# (unweighted) NaN-ignoring mean over axis 1 twice.
# Assumes the arrays are ordered (time, lat, lon) -- TODO confirm.

# Great Lakes temperature anomaly
gl_anom_values = gl_anom_t.values
tem = np.nanmean(np.nanmean(gl_anom_values, axis=1), axis=1)

# cap gph anomaly
cap_anom_values = cap_anom_gph.values
gp = np.nanmean(np.nanmean(cap_anom_values, axis=1), axis=1)

In [19]:
# Fold the flat series into a 2-D (40, 608) layout.
# Presumably 40 groups (years?) of 608 time steps each -- TODO confirm.
shape_2d = (40, 608)
tmp = tem.reshape(shape_2d)
gp = gp.reshape(shape_2d)
ehf = ehf.reshape(shape_2d)

In [20]:
# Remove the same seven column indices from every array.
# CAUTION: the arrays that are overwritten in place (ephi10 ... wind10, gp)
# lose seven more columns each time this cell is re-run.
drop_cols = [480, 481, 482, 483, 605, 606, 607]

ephi10 = np.delete(ephi10, drop_cols, axis=1)
rat10 = np.delete(rat10, drop_cols, axis=1)
size10 = np.delete(size10, drop_cols, axis=1)
cenlat10 = np.delete(cenlat10, drop_cols, axis=1)
cenlon10 = np.delete(cenlon10, drop_cols, axis=1)
wind10 = np.delete(wind10, drop_cols, axis=1)
tp = np.delete(tmp, drop_cols, axis=1)
gp = np.delete(gp, drop_cols, axis=1)
eh = np.delete(ehf, drop_cols, axis=1)

In [21]:
# Flatten every 2-D array into a 1-D vector of exactly 24040 samples;
# reshape raises if an array does not hold that many elements.
n_flat = 24040

wind = wind10.reshape(n_flat)
rat = rat10.reshape(n_flat)
size = size10.reshape(n_flat)
cenlt = cenlat10.reshape(n_flat)
cenln = cenlon10.reshape(n_flat)
ephi = ephi10.reshape(n_flat)
t_gr = tp.reshape(n_flat)
ga = gp.reshape(n_flat)
ef = eh.reshape(n_flat)

In [23]:
import pandas as pd

# Pair each output column name with its flattened array; the list order
# fixes the column order of the resulting frame.
column_sources = [
    ('wind', wind),
    ('ephi', ephi),
    ('rat', rat),
    ('size', size),
    ('cenlat', cenlt),
    ('cenlon', cenln),
    ('temp', t_gr),
    ('gph', ga),
    ('ehf', ef),
]
data = {label: np.ndarray.tolist(values) for label, values in column_sources}

# Build the table and drop every row that contains a NaN in any column.
df = pd.DataFrame(data).dropna()

In [25]:
# Centre every column on its own mean. Despite the `_norm` name, the columns
# are only mean-centred here, not rescaled.
df_norm = df.sub(df.mean(), axis="columns")
df_norm

Unnamed: 0,wind,ephi,rat,size,cenlat,cenlon,temp,gph,ehf
0,-6.755238,65.831245,0.836379,-7.151521e+06,6.487432,36.438525,7.430293,212.363495,-2.444531
1,-5.870402,66.933537,0.856576,-6.924411e+06,6.274780,34.016915,7.735292,215.392914,-2.087977
2,-6.939473,68.504048,0.871121,-7.138010e+06,6.036575,34.632686,5.975726,228.373672,-1.853083
3,-6.355910,69.593414,0.834444,-6.888179e+06,5.571828,35.646373,4.503211,230.929001,0.096176
4,-6.579504,70.994034,0.825562,-6.984070e+06,4.792537,38.076526,3.258967,242.162445,-0.961175
...,...,...,...,...,...,...,...,...,...
24035,1.677782,14.108853,0.450217,-1.022376e+07,7.291250,-66.445667,-1.409524,-457.223709,-3.508984
24036,1.084232,13.284963,0.468212,-9.962268e+06,6.908584,-66.057719,-2.808830,-451.694840,-5.754546
24037,1.396219,13.453589,0.442685,-1.020009e+07,6.415998,-69.997842,-5.451125,-440.761337,-4.884084
24038,0.724142,13.244366,0.425600,-1.059983e+07,6.637029,-71.099052,-7.655380,-445.692185,-6.159323


In [26]:
# Predictors (x) and response (y), both taken from the mean-centred table.
predictor_cols = ['ephi', 'rat', 'cenlat', 'cenlon', 'size', 'wind', 'gph', 'ehf']
x = df_norm.loc[:, predictor_cols]
y = df_norm.loc[:, 'temp']

In [27]:
from statsmodels.stats.outliers_influence import variance_inflation_factor 

# Variance inflation factor of each predictor column, tabulated next to the
# column name.
design = x.values
vif_values = [
    variance_inflation_factor(design, col_idx)
    for col_idx in range(x.shape[1])
]
vif_data = pd.DataFrame({"feature": x.columns, "VIF": vif_values})

print(vif_data)

  feature       VIF
0    ephi  1.159381
1     rat  1.418931
2  cenlat  3.181562
3  cenlon  1.073641
4    size  3.580568
5    wind  6.592053
6     gph  3.896808
7     ehf  1.221893


In [38]:
# B-spline smoother over the two columns listed below; `df` and `degree`
# carry one entry per column, in the same order.
spline_cols = ['wind', 'size']
x_spline = df_norm[spline_cols]
bs = BSplines(x_spline, df=[8, 8], degree=[2, 2])

In [44]:
# Fit the GAM: y is the response, x the linear predictors, bs the spline terms.
# NOTE(review): 'wind' and 'size' are in x and are also the columns given to
# the spline smoother -- confirm this overlap is intended.
# NOTE(review): no `alpha` is passed, so the library's default penalty weight
# applies -- confirm that is the intended amount of smoothing.
gam_bs = GLMGam(endog=y, exog=x, smoother=bs)
res_bs = gam_bs.fit()

In [45]:
# Build the summary of the fitted GAM and print its text rendering.
print_model = res_bs.summary()
print(str(print_model))

                 Generalized Linear Model Regression Results                  
Dep. Variable:                   temp   No. Observations:                23340
Model:                         GLMGam   Df Residuals:                 23319.00
Model Family:                Gaussian   Df Model:                        20.00
Link Function:               Identity   Scale:                          37.069
Method:                         PIRLS   Log-Likelihood:                -75269.
Date:                Wed, 06 Dec 2023   Deviance:                   8.6441e+05
Time:                        17:56:13   Pearson chi2:                 8.64e+05
No. Iterations:                     3   Pseudo R-squ. (CS):            0.02532
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ephi          -0.0007      0.001     -0.835      0.4