In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectPercentile, mutual_info_regression
from xgboost import XGBRegressor
from sklearn.impute import KNNImputer
from sklearn.model_selection import RandomizedSearchCV
from scipy import stats
from statsmodels.tsa.stattools import acf
from sklearn.model_selection import learning_curve
from sklearn.model_selection import cross_val_score

import optuna
import xgboost as xgb
from sklearn.metrics import mean_absolute_error

from darts import TimeSeries

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
gdpnow = pd.read_csv('train_set.csv', index_col='Dates', parse_dates=True) #date_parser=dateparse)
# Ensure that load_df index is in the same date format
gdpnow.index = pd.to_datetime(gdpnow.index)

In [9]:
#Target GDP
Target = 'Final_GDP_Interp'
Drop = ['GDP Nowcast', 'Advance Estimate From BEA', 'Publication Date of Advance Estimate',
      'Days until advance estimate', 'Forecast Error', 'Data releases', 'Quarter being forecasted']
#'Quarter being forecasted'

In [10]:
gdpnow['Quarter being forecasted'] = gdpnow['Quarter being forecasted'].fillna(method='ffill')
gdpnow = gdpnow.dropna(axis=0, thresh=len(gdpnow.columns)-2)

In [11]:
gdpnow['quar_fore_int'] = 0
quarter = 0
for i in range(len(gdpnow)-1):
    if gdpnow['Quarter being forecasted'].iloc[i+1] != \
        gdpnow['Quarter being forecasted'].iloc[i]:
            quarter = quarter + 1
    gdpnow['quar_fore_int'].iloc[i+1] = quarter

gdpnow

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gdpnow['quar_fore_int'].iloc[i+1] = quarter


Unnamed: 0_level_0,value_CCLACBW027SBOG,value_WTISPLC,value_EXPINF1YR,value_STLPPM,value_M2REAL,value_UNRATE,value_PPIACO,value_PCUOMFGOMFG,value_PCUATRANSATRANS,value_PCUATRADEATRADE,...,PCE Services,Previous change in private inventories ($Bil 2009),Publication Date of Advance Estimate,Quarter being forecasted,Residential,S&L,Services exports,Services imports,Structures,quar_fore_int
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-05-01,608.7653,102.18,1.768390,0.227347,4778.4,6.3,208.000,197.200,125.500,124.600,...,3.9,70.0,30/07/2014,30/06/2014,4.6,1.4,2.5,4.0,1.8,0
2014-05-02,608.7653,102.18,1.768390,0.227347,4778.4,6.3,208.000,197.200,125.500,124.600,...,3.9,58.0,30/07/2014,30/06/2014,4.6,1.3,2.5,4.0,1.8,0
2014-05-06,608.7653,102.18,1.768390,0.227347,4778.4,6.3,208.000,197.200,125.500,124.600,...,3.9,58.0,30/07/2014,30/06/2014,4.5,1.3,2.7,3.3,1.8,0
2014-05-09,608.7653,102.18,1.768390,0.227347,4778.4,6.3,208.000,197.200,125.500,124.600,...,3.9,72.0,30/07/2014,30/06/2014,4.5,1.3,2.7,3.3,1.8,0
2014-05-12,608.7653,102.18,1.768390,0.227347,4778.4,6.3,208.000,197.200,125.500,124.600,...,3.9,72.0,30/07/2014,30/06/2014,4.5,1.3,2.7,3.3,1.8,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-01-11,1033.2235,74.15,2.418633,0.940888,6710.6,3.7,251.328,244.078,158.986,173.411,...,2.6,77.8,25/01/2024,31/12/2023,0.1,3.6,5.6,5.7,2.1,38
2024-01-12,1033.2235,74.15,2.418633,0.940888,6710.6,3.7,251.328,244.078,158.986,173.411,...,2.6,77.8,25/01/2024,31/12/2023,0.1,3.6,5.6,5.7,2.0,38
2024-01-17,1035.5004,74.15,2.418633,0.940888,6710.6,3.7,251.328,244.078,158.986,173.411,...,2.4,77.8,25/01/2024,31/12/2023,0.2,3.6,5.6,5.7,2.0,38
2024-01-18,1035.5004,74.15,2.418633,0.940888,6710.6,3.7,251.328,244.078,158.986,173.411,...,2.4,77.8,25/01/2024,31/12/2023,-0.4,3.6,5.6,5.7,2.0,38


In [12]:
gdpnow = gdpnow.drop(columns=Drop)
df_target = gdpnow[[Target,'quar_fore_int']]

In [13]:
from sklearn.preprocessing import StandardScaler

preproc = Pipeline([
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', StandardScaler())
])

In [14]:
gdpnow = pd.DataFrame(preproc.fit_transform(gdpnow), columns=gdpnow.columns)

In [15]:
grouped = gdpnow.groupby('quar_fore_int')

In [26]:
# Initialize list to store matrices
matrices = []

# Iterate over groups
for name, group in grouped:
    # Drop the 'Quarter being forecasted' column
    group = group.drop(columns='quar_fore_int')
    # Convert group to matrix
    matrix = group.to_numpy()
    # Append matrix to list
    matrices.append(matrix)

# Let us see what is the minimum days per quarter

days_per_quarter = [matrix.shape[0] for matrix in matrices]
days_per_quarter_min = min(days_per_quarter)
days_per_quarter_max = max(days_per_quarter)

# there are two weird quarters (one with 59 days and ine with 130 days)
# let us get rid of them

In [27]:
days_per_quarter_min, days_per_quarter_max

(31, 44)

In [13]:
# matrices =  [matrix for matrix in matrices \
#     if matrix.shape[0]>80 and matrix.shape[0]<110]

# days_per_quarter = [matrix.shape[0] for matrix in matrices]
# days_per_quarter_min = min(days_per_quarter)
# days_per_quarter_max = max(days_per_quarter)

# days_per_quarter_min, days_per_quarter_max

(85, 98)

In [25]:
fg = [0,1,2,3,4,5]
fg[-2:]

[4, 5]

In [28]:
matrices = [matrix[-days_per_quarter_min:][:] for matrix in matrices]

In [29]:
X = np.stack(matrices, axis=0)

In [30]:
X.shape

(39, 31, 67)

In [31]:
# For the last sequence I have no target
X = X[:-1,:,:]

In [32]:
# Ok we have our X. What about our y?
# Final_GDP_Interp quar_fore_int

# keep the row?
df_target['keep'] = 0

for i in range(len(df_target)-1):
    if df_target.quar_fore_int.iloc[i+1] != df_target.quar_fore_int.iloc[i]:
        df_target['keep'].iloc[i+1] = 1

y = df_target[df_target['keep'] == 1]
y.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target['keep'].iloc[i+1] = 1


(38, 3)

In [19]:
# # we have to get rid of the index 19 and 20 because they were excluded from X too

# y = pd.concat([y.iloc[0:19], y.iloc[21:]], axis='index')
# y = y['Final_GDP_Interp']
# y

Dates
2014-08-04     3.928261
2014-10-31     3.540109
2015-02-02     2.559438
2015-04-30     0.272500
2015-08-06     2.256154
2015-10-30     1.481304
2016-02-01     0.685000
2016-04-29     0.547473
2016-08-03     1.329565
2016-10-31     2.866044
2017-01-30     1.831099
2017-05-01     0.751978
2017-08-03     2.597692
2017-10-30     2.975824
2018-01-29     2.552088
2018-04-30     2.377363
2018-07-31     4.035385
2018-10-29     3.478160
2019-03-01     2.600175
2019-10-31     1.921739
2020-01-31     2.003778
2020-04-30    -5.085652
2020-07-31   -32.182826
2020-10-30    33.080000
2021-01-29     4.036154
2021-04-30     6.391209
2021-07-30     6.450769
2021-10-29     2.073516
2022-01-28     6.798791
2022-04-29    -1.404725
2022-07-29    -0.891538
2022-10-28     2.573516
2023-01-27     2.869890
2023-04-28     1.074835
2023-07-28     2.437143
2023-10-27     4.861412
Name: Final_GDP_Interp, dtype: float64

In [36]:
y=y.Final_GDP_Interp

In [37]:
y.shape, X.shape

((38,), (38, 31, 67))

In [38]:
y_e = np.expand_dims(np.expand_dims(y, axis=-1), axis=-1)

In [51]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping

# 1- RNN Architecture
model = Sequential()

model.add(layers.LSTM(units=20, activation='tanh', input_shape=(31,67), return_sequences=True))
model.add(layers.LSTM(units=20, activation='tanh', return_sequences=True))
# model.add(layers.LSTM(units=100, activation='tanh', input_shape=(31,67), return_sequences=True))
model.add(layers.LSTM(units=20, activation='tanh', return_sequences=False))


model.add(layers.Dense(50, activation='relu'))
model.add(layers.Dropout(0.2))
# model.add(layers.Dense(40, activation='relu'))
# model.add(layers.Dropout(0.2))
model.add(layers.Dense(1, activation='linear'))

# 2- Compilation

es = EarlyStopping(patience=15, restore_best_weights=True, monitor='val_mae')

model.compile(loss='mae',
              optimizer='rmsprop',
              metrics=['mae']) # very high lr so we can converge with such a small dataset

# 3- Fit
model.fit(X, y_e,
          epochs=75,
          batch_size=32,
          verbose=1,
          callbacks = [es],
          validation_split=0.2)

# 4- Predict
model.predict(X) # One prediction per city

Epoch 1/75
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
Epoch 8/75
Epoch 9/75
Epoch 10/75
Epoch 11/75
Epoch 12/75
Epoch 13/75
Epoch 14/75
Epoch 15/75
Epoch 16/75
Epoch 17/75
Epoch 18/75
Epoch 19/75
Epoch 20/75
Epoch 21/75
Epoch 22/75
Epoch 23/75
Epoch 24/75
Epoch 25/75
Epoch 26/75
Epoch 27/75
Epoch 28/75
Epoch 29/75
Epoch 30/75
Epoch 31/75
Epoch 32/75
Epoch 33/75
Epoch 34/75
Epoch 35/75
Epoch 36/75
Epoch 37/75


array([[ 2.9313703 ],
       [ 2.9116774 ],
       [ 2.9294798 ],
       [ 0.94980216],
       [ 2.1450026 ],
       [ 2.3456867 ],
       [ 1.237651  ],
       [ 0.9629355 ],
       [ 1.6957996 ],
       [ 2.9013183 ],
       [ 2.6272075 ],
       [ 2.870388  ],
       [ 2.8552046 ],
       [ 2.867117  ],
       [ 2.90395   ],
       [ 2.9036434 ],
       [ 2.8783178 ],
       [ 2.7907362 ],
       [ 2.8408737 ],
       [ 2.798944  ],
       [ 2.3285632 ],
       [ 2.710021  ],
       [ 2.053184  ],
       [-0.31156138],
       [-0.8344521 ],
       [ 2.8762841 ],
       [ 2.8678355 ],
       [ 2.8243427 ],
       [ 2.8239539 ],
       [ 2.8397057 ],
       [ 2.5594158 ],
       [ 0.21580511],
       [-0.28797236],
       [-0.20344622],
       [-0.23784336],
       [-0.17058924],
       [-0.29139325],
       [-0.23590145]], dtype=float32)