In [1]:
import pandas as pd
from ast import literal_eval
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from pandas.core.common import flatten
import pickle 
import os
# Silence pandas' chained-assignment (SettingWithCopy) warning for this notebook.
pd.set_option('mode.chained_assignment', None)

In [2]:
# Load the original dataset used for the random forest (first CSV column is the index),
# drop every row that has a missing value in any column, keep only the date and the
# observed GPP, and exclude the rows whose index label is 'CN-Cng'.
df_obs = (
    pd.read_csv('../data/df_20210510.csv', index_col=0)
    .dropna()
    .loc[:, ['date', 'GPP_NT_VUT_REF']]
    .loc[lambda frame: frame.index != 'CN-Cng']
)

In [4]:
# Load the stored random-forest predictions.
# The context manager closes the file handle as soon as the object is read.
# NOTE(review): pickle.load can execute arbitrary code - only load files you produced or trust.
with open('RF_predictions_d10_n200.pkl', 'rb') as rf_file:
    RF = pickle.load(rf_file)

In [None]:
# Load the stored DNN predictions, LSTM predictions and the target table.
# Each file is opened in a context manager so its handle is closed right after loading.
# NOTE(review): pickle.load can execute arbitrary code - only load files you produced or trust.
with open('pred_dnn.pkl', 'rb') as dnn_file:
    DNN = pickle.load(dnn_file)
with open('pred_lstm.pkl', 'rb') as lstm_file:
    LSTM = pickle.load(lstm_file)
with open('target_gpp.pkl', 'rb') as target_file:
    target = pickle.load(target_file)

In [7]:
# Attach every model's predictions as a column next to the observations.
# RF and DNN predictions are flattened into a plain list first, so they are assigned
# positionally. NOTE(review): presumably they are stored in the same row order as
# df_obs / target - confirm.
df_obs['RF'] = [*flatten(RF['preds'])]
target['LSTM'] = LSTM['pred']
target['DNN'] = [*flatten(DNN['preds'])]

In [8]:
# Working aliases (no copy is made): df_ready -> LSTM/DNN table, df_ready_rf -> RF table.
df_ready, df_ready_rf = target, df_obs

In [9]:
# Unique index labels of df_ready; the later cells loop over them as sites.
sites = df_ready.index.unique()

# Parse the date column and expose month / day / year as separate columns.
df_ready['date'] = pd.to_datetime(df_ready['date'])
for part in ('month', 'day', 'year'):
    df_ready[part] = df_ready['date'].map(lambda stamp: getattr(stamp, part))

In [18]:
# Same date handling for the random-forest table: parse the date column and
# expose month / day / year as separate columns.
df_ready_rf['date'] = pd.to_datetime(df_ready_rf['date'])
for part in ('month', 'day', 'year'):
    df_ready_rf[part] = df_ready_rf['date'].map(lambda stamp: getattr(stamp, part))

In [19]:
# Standardise observed GPP per site: subtract the site mean and divide by the site
# sample standard deviation, stored in the new column 'GPP'.
# groupby(...).transform returns one value per original row, so every z-score lands on
# its own row even if the rows of one site are not stored contiguously. (Concatenating
# per-site blocks in `sites` order and writing them back positionally would silently
# misalign values in that case.)
gpp_by_site = df_ready.groupby(level=0, sort=False)['GPP_NT_VUT_REF']
site_mean = gpp_by_site.transform('mean').to_numpy()
site_std = gpp_by_site.transform('std').to_numpy()
df_ready['GPP'] = ((df_ready['GPP_NT_VUT_REF'] - site_mean) / site_std).to_numpy()

In [43]:
# Per-site R^2 of the random-forest prediction against the observed GPP.
# Rows with any missing value are dropped before scoring.
r2_rf = []
for site in sites:
    site_rows = df_ready_rf.loc[df_ready_rf.index == site].dropna()
    site_score = r2_score(site_rows['GPP_NT_VUT_REF'], site_rows['RF'])
    r2_rf.append(site_score)

In [49]:
# One row per site: the random-forest R^2 and the site label.
c = pd.DataFrame(data={
    "RF": r2_rf,
    "sites": sites,
})

In [50]:
# Write the per-site random-forest R^2 table to disk.
c.to_csv(path_or_buf="rf_R2_n200_d10.csv")

In [164]:
# LSTM under extreme conditions.
# For every site, the observed-GPP anomaly of each row is its value minus the mean of
# all rows sharing the same (month, day). Rows are then split into an upper tail
# (anomaly >= 95th percentile), a lower tail (anomaly <= 5th percentile) and the range
# strictly in between, and the mean bias (LSTM - 'GPP') is stored per class.
# NOTE: the r2_* lists hold mean bias, not R^2; the names are kept because the
# following cells read them.
r2_norm = []
r2_uq = []
r2_lq = []
month = []  # per-site RMSE between the monthly mean 'GPP' and the monthly mean prediction
for site in sites:
    data = df_ready[df_ready.index == site]
    by_calendar_day = data.groupby(['month', 'day'])['GPP_NT_VUT_REF']
    # Skip sites whose (month, day) climatology has fewer than 366 entries.
    if by_calendar_day.ngroups < 366:
        continue
    # transform('mean') pairs every row with the mean of its own (month, day).
    # Looking the climatology up by day-of-year position is off by one day after
    # Feb 28 in non-leap years, because the 366-entry table contains Feb 29.
    anomalies = data['GPP_NT_VUT_REF'] - by_calendar_day.transform('mean').to_numpy()
    uq = anomalies.quantile(q=0.95)
    lq = anomalies.quantile(q=0.05)
    # Plain boolean arrays avoid any index alignment on the repeated site label.
    upper = (anomalies >= uq).to_numpy()
    lower = (anomalies <= lq).to_numpy()
    norm = ((anomalies > lq) & (anomalies < uq)).to_numpy()
    bias = data['LSTM'] - data['GPP']
    monthly_means = data.groupby('month')[['GPP', 'LSTM']].mean()
    # np.sqrt(MSE) instead of the deprecated/removed `squared=False` argument.
    month.append(np.sqrt(mean_squared_error(monthly_means['GPP'], monthly_means['LSTM'])))
    r2_norm.append(np.mean(bias[norm]))
    r2_uq.append(np.mean(bias[upper]))
    r2_lq.append(np.mean(bias[lower]))
    

In [165]:
# Collect the three per-site lists from the extreme-condition loop into one table.
df = pd.DataFrame(data={
    "normal": r2_norm,
    "upper_quantile": r2_uq,
    "lower_quantile": r2_lq,
})

In [169]:
# Write the LSTM extreme-condition table to disk.
df.to_csv(path_or_buf="lstm_extreme_condition.csv")

In [170]:
# LSTM skill at different temporal aggregations: per-site R^2 on the row-level values,
# on (month, year) means and on yearly means of 'GPP' versus the LSTM prediction.
monthly = []
yearly = []
r2_daily = []
r2_monthly = []
r2_yearly = []
for site in sites:
    # Rows with any missing value are removed before aggregating.
    data = df_ready[df_ready.index == site].dropna()
    month_year_means = data.groupby(['month', 'year']).mean()
    year_means = data.groupby(['year']).mean()
    m = month_year_means['GPP'].values
    m_pred = month_year_means['LSTM'].values
    y = year_means['GPP'].values
    y_pred = year_means['LSTM'].values
    r2_daily.append(r2_score(data['GPP'], data['LSTM']))
    r2_monthly.append(r2_score(m, m_pred))
    # The yearly score is only recorded when more than 8 yearly means exist.
    if len(y) > 8:
        r2_yearly.append(r2_score(y, y_pred))

In [171]:
# Average the per-site LSTM scores for each aggregation level.
daily_lstm, yearly_lstm, monthly_lstm = (
    np.mean(scores) for scores in (r2_daily, r2_yearly, r2_monthly)
)

In [175]:
# DNN under extreme conditions.
# For every site, the observed-GPP anomaly of each row is its value minus the mean of
# all rows sharing the same (month, day). Rows are then split into an upper tail
# (anomaly >= 95th percentile), a lower tail (anomaly <= 5th percentile) and the range
# strictly in between, and the mean bias (DNN - 'GPP') is stored per class.
# NOTE: the r2_* lists hold mean bias, not R^2; the names are kept because the
# following cells read them.
r2_norm = []
r2_uq = []
r2_lq = []
month = []  # per-site RMSE between the monthly mean 'GPP' and the monthly mean prediction
for site in sites:
    data = df_ready[df_ready.index == site]
    by_calendar_day = data.groupby(['month', 'day'])['GPP_NT_VUT_REF']
    # Skip sites whose (month, day) climatology has fewer than 366 entries.
    if by_calendar_day.ngroups < 366:
        continue
    # transform('mean') pairs every row with the mean of its own (month, day).
    # Looking the climatology up by day-of-year position is off by one day after
    # Feb 28 in non-leap years, because the 366-entry table contains Feb 29.
    anomalies = data['GPP_NT_VUT_REF'] - by_calendar_day.transform('mean').to_numpy()
    uq = anomalies.quantile(q=0.95)
    lq = anomalies.quantile(q=0.05)
    # Plain boolean arrays avoid any index alignment on the repeated site label.
    upper = (anomalies >= uq).to_numpy()
    lower = (anomalies <= lq).to_numpy()
    norm = ((anomalies > lq) & (anomalies < uq)).to_numpy()
    bias = data['DNN'] - data['GPP']
    monthly_means = data.groupby('month')[['GPP', 'DNN']].mean()
    # np.sqrt(MSE) instead of the deprecated/removed `squared=False` argument.
    month.append(np.sqrt(mean_squared_error(monthly_means['GPP'], monthly_means['DNN'])))
    r2_norm.append(np.mean(bias[norm]))
    r2_uq.append(np.mean(bias[upper]))
    r2_lq.append(np.mean(bias[lower]))

In [176]:
# Collect the three per-site lists from the DNN extreme-condition loop into one table.
df = pd.DataFrame(data={
    "normal": r2_norm,
    "upper_quantile": r2_uq,
    "lower_quantile": r2_lq,
})

In [177]:
# Write the DNN extreme-condition table to disk.
df.to_csv(path_or_buf="dnn_extreme_condition.csv")

In [182]:
# DNN skill at different temporal aggregations: per-site R^2 on the row-level values,
# on (month, year) means and on yearly means of 'GPP' versus the DNN prediction.
monthly = []
yearly = []
r2_daily = []
r2_monthly = []
r2_yearly = []
for site in sites:
    # Rows with any missing value are removed before aggregating.
    data = df_ready[df_ready.index == site].dropna()
    month_year_means = data.groupby(['month', 'year']).mean()
    year_means = data.groupby(['year']).mean()
    m = month_year_means['GPP'].values
    m_pred = month_year_means['DNN'].values
    y = year_means['GPP'].values
    y_pred = year_means['DNN'].values
    r2_daily.append(r2_score(data['GPP'], data['DNN']))
    r2_monthly.append(r2_score(m, m_pred))
    # The yearly score is only recorded when more than 8 yearly means exist.
    if len(y) > 8:
        r2_yearly.append(r2_score(y, y_pred))

In [183]:
# Average the per-site DNN scores for each aggregation level.
daily_DNN, monthly_DNN, yearly_DNN = (
    np.mean(scores) for scores in (r2_daily, r2_monthly, r2_yearly)
)

In [186]:
# Random forest under extreme conditions.
# For every site, the observed-GPP anomaly of each row is its value minus the mean of
# all rows sharing the same (month, day). Rows are then split into an upper tail
# (anomaly >= 95th percentile), a lower tail (anomaly <= 5th percentile) and the range
# strictly in between, and the mean bias (RF - GPP_NT_VUT_REF) is stored per class.
# Unlike the LSTM/DNN cells, this one compares against the raw GPP_NT_VUT_REF column.
# NOTE: the r2_* lists hold mean bias, not R^2; the names are kept because the
# following cells read them.
r2_norm = []
r2_uq = []
r2_lq = []
month = []  # per-site RMSE between the monthly mean GPP and the monthly mean prediction
for site in sites:
    data = df_ready_rf[df_ready_rf.index == site]
    by_calendar_day = data.groupby(['month', 'day'])['GPP_NT_VUT_REF']
    # Skip sites whose (month, day) climatology has fewer than 366 entries.
    if by_calendar_day.ngroups < 366:
        continue
    # transform('mean') pairs every row with the mean of its own (month, day).
    # Looking the climatology up by day-of-year position is off by one day after
    # Feb 28 in non-leap years, because the 366-entry table contains Feb 29.
    anomalies = data['GPP_NT_VUT_REF'] - by_calendar_day.transform('mean').to_numpy()
    uq = anomalies.quantile(q=0.95)
    lq = anomalies.quantile(q=0.05)
    # Plain boolean arrays avoid any index alignment on the repeated site label.
    upper = (anomalies >= uq).to_numpy()
    lower = (anomalies <= lq).to_numpy()
    norm = ((anomalies > lq) & (anomalies < uq)).to_numpy()
    bias = data['RF'] - data['GPP_NT_VUT_REF']
    monthly_means = data.groupby('month')[['GPP_NT_VUT_REF', 'RF']].mean()
    # np.sqrt(MSE) instead of the deprecated/removed `squared=False` argument.
    month.append(np.sqrt(mean_squared_error(monthly_means['GPP_NT_VUT_REF'], monthly_means['RF'])))
    r2_norm.append(np.mean(bias[norm]))
    r2_uq.append(np.mean(bias[upper]))
    r2_lq.append(np.mean(bias[lower]))

In [187]:
# Collect the three per-site lists from the RF extreme-condition loop into one table.
df = pd.DataFrame(data={
    "normal": r2_norm,
    "upper_quantile": r2_uq,
    "lower_quantile": r2_lq,
})

In [188]:
# Write the random-forest extreme-condition table to disk.
df.to_csv(path_or_buf="rf_extreme_condition_n200_d10.csv")

In [38]:
# Random-forest skill at different temporal aggregations: per-site R^2 on the row-level
# values, on (month, year) means and on yearly means of GPP_NT_VUT_REF versus RF.
monthly = []
yearly = []
r2_daily = []
r2_monthly = []
r2_yearly = []
for site in sites:
    # Rows with any missing value are removed before aggregating.
    data = df_ready_rf[df_ready_rf.index == site].dropna()
    month_year_means = data.groupby(['month', 'year']).mean()
    year_means = data.groupby(['year']).mean()
    m = month_year_means['GPP_NT_VUT_REF'].values
    m_pred = month_year_means['RF'].values
    y = year_means['GPP_NT_VUT_REF'].values
    y_pred = year_means['RF'].values
    r2_daily.append(r2_score(data['GPP_NT_VUT_REF'], data['RF']))
    r2_monthly.append(r2_score(m, m_pred))
    # The yearly score is only recorded when more than 8 yearly means exist.
    if len(y) > 8:
        r2_yearly.append(r2_score(y, y_pred))

In [39]:
# Average the per-site random-forest scores for each aggregation level.
daily_rf, monthly_rf, yearly_rf = (
    np.mean(scores) for scores in (r2_daily, r2_monthly, r2_yearly)
)

In [200]:
# Do not run this aggregation cell on its own: each model's time-scale cell and its
# averaging cell must be run first, because all three models reuse the same r2_* list
# names and only the daily_*/monthly_*/yearly_* scalars keep the per-model results.
# This builds the single-row table of mean R^2 per model and aggregation level.
d_out = pd.DataFrame(
    {
        "daily_rf": daily_rf,
        "monthly_rf": monthly_rf,
        "yearly_rf": yearly_rf,
        "daily_DNN": daily_DNN,
        "monthly_DNN": monthly_DNN,
        "yearly_DNN": yearly_DNN,
        "daily_lstm": daily_lstm,
        "monthly_lstm": monthly_lstm,
        "yearly_lstm": yearly_lstm,
    },
    index=[0],
)

In [201]:
# Write the time-scale aggregation summary to disk.
d_out.to_csv(path_or_buf='aggregating to different time scales.csv')