<a href="https://colab.research.google.com/github/doogzultz/time_series_project/blob/main/preprocessing_and_model_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [822]:
import pandas as pd
import numpy as np
import tensorflow as tf
from pandas.plotting import scatter_matrix
from tensorflow.keras.layers import Layer, Dense, Dropout, Input, Flatten, Concatenate, Conv1D, LSTM, Bidirectional, BatchNormalization, Activation, TimeDistributed, Lambda
from tensorflow.keras.models import Model
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.utils import shuffle
from sklearn.compose import ColumnTransformer
from tensorflow.keras.utils import plot_model
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [823]:
class g:
  # Global hyperparameters used as defaults by the windowing/batching code in this notebook.
  window_size = 16  # number of consecutive rows used as model input for one sample
  batch_size = 12   # number of windowed samples per batch

In [824]:
# Read the raw CSV straight from the project's GitHub repository.
url = "https://raw.githubusercontent.com/doogzultz/time_series_project/main/data.csv"
data = pd.read_csv(url)

In [825]:
data.columns

Index(['date', 'identifier', 'market_cap', 'sector', 'index_membership',
       'factor_1', 'factor_2', 'factor_3', 'factor_4', 'factor_5', 'factor_6',
       'factor_7', 'factor_8', 'factor_9', 'factor_10', 'target'],
      dtype='object')

In [826]:
# Drop the `index_membership` column. Reassigning with `errors='ignore'` (instead of an
# in-place drop) keeps this cell safe to re-run: a second execution is a no-op rather
# than a KeyError.
data = data.drop(columns = 'index_membership', errors = 'ignore')

In [827]:
# Order rows by identifier, then by date within each identifier.
data.sort_values(by = ['identifier', 'date'], inplace = True)
data

Unnamed: 0,date,identifier,market_cap,sector,factor_1,factor_2,factor_3,factor_4,factor_5,factor_6,factor_7,factor_8,factor_9,factor_10,target
82,2010-01-06,AD41WBQFVG43,5.843968e+10,35,0.766639,-0.000032,0.832677,0.500608,0.193489,0.246089,0.753702,0.792736,-0.000005,0.109029,-0.000006
400,2010-01-13,AD41WBQFVG43,4.758865e+10,35,0.907349,-0.022933,0.346457,0.581694,0.648483,0.872304,0.723412,0.333230,-0.009504,0.121348,-0.008721
718,2010-01-20,AD41WBQFVG43,3.374757e+10,35,0.476465,-0.000114,0.592328,0.379747,0.810961,0.017737,0.619208,0.058918,-0.000099,0.890898,-0.000081
1036,2010-01-27,AD41WBQFVG43,4.908395e+10,35,0.265141,-0.000034,0.521405,0.682492,0.547349,0.155915,0.068966,0.699718,-0.000037,0.355059,-0.000018
1354,2010-02-03,AD41WBQFVG43,2.729189e+10,35,0.423987,0.000197,0.950500,0.628533,0.956287,0.426634,0.752799,0.961276,0.000163,0.196711,0.000318
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12552,2010-10-06,ZMYXRS4KVOG2,6.504496e+08,25,0.069622,0.009936,0.588354,0.447469,0.353549,0.767779,0.790632,0.308049,0.012255,0.281193,0.005117
12862,2010-10-13,ZMYXRS4KVOG2,1.403710e+09,25,0.343444,0.002566,0.394802,0.143162,0.116762,0.744089,0.825323,0.559633,0.001058,0.755124,0.000446
13172,2010-10-20,ZMYXRS4KVOG2,7.284687e+08,25,0.857557,0.000146,0.892729,0.113346,0.574870,0.990100,0.319424,0.591153,0.000054,0.933679,0.000035
13483,2010-10-27,ZMYXRS4KVOG2,7.766592e+08,25,0.093894,-0.000885,0.311392,0.768537,0.589851,0.289180,0.431484,0.373209,-0.000411,0.225972,-0.000242


In [828]:
# Count how many entries in each column are exactly zero.
(data== 0).sum(axis=0)

date           0
identifier     0
market_cap     0
sector         0
factor_1       0
factor_2      68
factor_3       0
factor_4       0
factor_5       0
factor_6       0
factor_7       0
factor_8       0
factor_9      68
factor_10      0
target        68
dtype: int64

In [829]:
# Number of rows available per identifier.
ident_counts= data.value_counts('identifier')
ident_counts

identifier
AD41WBQFVG43    104
UZ4DWDZ8ALZ4    104
KR47536Y10D4    104
KPJ8FTV9ESJ3    104
KMHQ727PU8E2    104
               ... 
PNUM57CYEB27      5
KHKNKSMXBWP3      5
LYCPQJ19AOY0      5
RENR64FT3I74      5
LALJLSN9UMP2      2
Length: 356, dtype: int64

In [830]:
# Share of rows per sector across the whole dataset, ordered by sector code.
data.value_counts('sector', normalize = True).sort_index()

sector
10    0.049221
15    0.093162
20    0.208527
25    0.132849
30    0.072759
35    0.049003
40    0.232812
45    0.051674
50    0.038352
55    0.071642
dtype: float64

In [831]:
# Sector shares restricted to identifiers that have more than 80 rows.
data[data['identifier'].isin(list(ident_counts[ident_counts.values > 80].index))].value_counts('sector', normalize = True).sort_index()

sector
10    0.052423
15    0.093655
20    0.212044
25    0.126521
30    0.076618
35    0.045433
40    0.228443
45    0.052423
50    0.039048
55    0.073392
dtype: float64

In [832]:
# Sector shares restricted to identifiers that have at most 80 rows.
data[data['identifier'].isin(list(ident_counts[ident_counts.values <= 80].index))].value_counts('sector', normalize = True).sort_index()

sector
10    0.010229
15    0.087152
20    0.165712
25    0.209902
30    0.025777
35    0.092471
40    0.286007
45    0.042553
50    0.029869
55    0.050327
dtype: float64

In [833]:
# One-hot encode the sector code. The dtype is pinned explicitly because the default
# dtype of `pd.get_dummies` changed from uint8 to bool in pandas 2.0; pinning keeps the
# indicator columns numeric 0/1 regardless of the installed pandas version.
sector_onehot = pd.get_dummies(data['sector'], prefix='sector', dtype='uint8')
sector_onehot

Unnamed: 0,sector_10,sector_15,sector_20,sector_25,sector_30,sector_35,sector_40,sector_45,sector_50,sector_55
82,0,0,0,0,0,1,0,0,0,0
400,0,0,0,0,0,1,0,0,0,0
718,0,0,0,0,0,1,0,0,0,0
1036,0,0,0,0,0,1,0,0,0,0
1354,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
12552,0,0,0,1,0,0,0,0,0,0
12862,0,0,0,1,0,0,0,0,0,0
13172,0,0,0,1,0,0,0,0,0,0
13483,0,0,0,1,0,0,0,0,0,0


In [834]:
# Rebuild the frame as: remaining original columns, then the sector indicator columns,
# with `target` moved to the last position.
target = data['target']
data = pd.concat(
    [data.drop(columns = ['sector', 'target']), sector_onehot, target],
    axis = 1,
)
data

Unnamed: 0,date,identifier,market_cap,factor_1,factor_2,factor_3,factor_4,factor_5,factor_6,factor_7,...,sector_15,sector_20,sector_25,sector_30,sector_35,sector_40,sector_45,sector_50,sector_55,target
82,2010-01-06,AD41WBQFVG43,5.843968e+10,0.766639,-0.000032,0.832677,0.500608,0.193489,0.246089,0.753702,...,0,0,0,0,1,0,0,0,0,-0.000006
400,2010-01-13,AD41WBQFVG43,4.758865e+10,0.907349,-0.022933,0.346457,0.581694,0.648483,0.872304,0.723412,...,0,0,0,0,1,0,0,0,0,-0.008721
718,2010-01-20,AD41WBQFVG43,3.374757e+10,0.476465,-0.000114,0.592328,0.379747,0.810961,0.017737,0.619208,...,0,0,0,0,1,0,0,0,0,-0.000081
1036,2010-01-27,AD41WBQFVG43,4.908395e+10,0.265141,-0.000034,0.521405,0.682492,0.547349,0.155915,0.068966,...,0,0,0,0,1,0,0,0,0,-0.000018
1354,2010-02-03,AD41WBQFVG43,2.729189e+10,0.423987,0.000197,0.950500,0.628533,0.956287,0.426634,0.752799,...,0,0,0,0,1,0,0,0,0,0.000318
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12552,2010-10-06,ZMYXRS4KVOG2,6.504496e+08,0.069622,0.009936,0.588354,0.447469,0.353549,0.767779,0.790632,...,0,0,1,0,0,0,0,0,0,0.005117
12862,2010-10-13,ZMYXRS4KVOG2,1.403710e+09,0.343444,0.002566,0.394802,0.143162,0.116762,0.744089,0.825323,...,0,0,1,0,0,0,0,0,0,0.000446
13172,2010-10-20,ZMYXRS4KVOG2,7.284687e+08,0.857557,0.000146,0.892729,0.113346,0.574870,0.990100,0.319424,...,0,0,1,0,0,0,0,0,0,0.000035
13483,2010-10-27,ZMYXRS4KVOG2,7.766592e+08,0.093894,-0.000885,0.311392,0.768537,0.589851,0.289180,0.431484,...,0,0,1,0,0,0,0,0,0,-0.000242


In [835]:
import math
from math import floor as fl
ident_counts= data.value_counts('identifier')
def split_to_series(df, valid_size = .1, test_size = .1,length_cutoff = 4*g.window_size):
  """Split the panel into per-identifier train / validation / test frames.

  Identifiers with more than `length_cutoff` rows ("long" series) are sorted by date and
  cut chronologically into three consecutive segments. Identifiers with between
  `2*g.window_size` and `length_cutoff` rows ("short" series) are used for training
  only. Identifiers with fewer rows are left out entirely.

  The split points are shifted towards the start of the series (by two windows for the
  train/valid boundary and one window for the valid/test boundary), so the validation
  and test segments are longer than their nominal fractions.

  Args:
    df: DataFrame containing at least the columns 'identifier' and 'date'.
    valid_size: nominal fraction of each long series used for validation.
    test_size: nominal fraction of each long series used for testing.
    length_cutoff: row count above which a series is split three ways.

  Returns:
    (train_dict, valid_dict, test_dict): dicts of DataFrames keyed by consecutive
    integers. A long series uses the same key in all three dicts; short series only
    appear in `train_dict`, under the keys that follow the long ones.
  """
  ident_counts = df.value_counts('identifier')
  is_long = ident_counts.values > length_cutoff
  is_short = (ident_counts.values <= length_cutoff) & (ident_counts.values >= 2*g.window_size)
  df_long = df[df['identifier'].isin(ident_counts.index[is_long])]
  df_short = df[df['identifier'].isin(ident_counts.index[is_short])]

  train_dict = {}
  valid_dict = {}
  test_dict = {}
  n_long = 0
  for i, ident in enumerate(df_long.value_counts('identifier').index):
    series = df_long[df_long['identifier'] == ident].sort_values('date').reset_index(drop = True)
    n_rows = len(series)
    # Clamp the split points: a negative bound would be interpreted by iloc as
    # "counted from the end" and silently make the train segment overlap the test segment.
    train_end = max(fl((1 - valid_size - test_size)*n_rows) - 2*g.window_size, 0)
    valid_end = max(fl((1 - test_size)*n_rows) - g.window_size, train_end)
    # Return independent copies so that later column assignments on these frames do not
    # act on slices of a shared parent frame (SettingWithCopy warnings / ambiguity).
    train_dict[i] = series.iloc[:train_end].copy()
    valid_dict[i] = series.iloc[train_end:valid_end].copy()
    test_dict[i] = series.iloc[valid_end:].copy()
    n_long += 1
  for j, ident in enumerate(df_short.value_counts('identifier').index):
    train_dict[n_long + j] = df_short[df_short['identifier'] == ident].sort_values('date').reset_index(drop = True)
  print(f"Number of time series to be trained and tested: {n_long}")
  print(f"Number of time series to be trained: {len(train_dict)}")
  return train_dict, valid_dict, test_dict

In [836]:
# Build the per-identifier train / validation / test dictionaries from the encoded frame.
train_dict, valid_dict, test_dict = split_to_series(data)

Number of time series to be trained and tested: 295
Number of time series to be trained: 323


In [837]:
train_dict[294]

Unnamed: 0,date,identifier,market_cap,factor_1,factor_2,factor_3,factor_4,factor_5,factor_6,factor_7,...,sector_15,sector_20,sector_25,sector_30,sector_35,sector_40,sector_45,sector_50,sector_55,target
0,2010-08-11,FMOHK7CPCLT3,2848643000.0,0.798835,-0.000106,0.166318,0.408212,0.077306,0.540514,0.036312,...,0,0,0,0,0,1,0,0,0,-8.3e-05
1,2010-08-18,FMOHK7CPCLT3,896859900.0,0.214806,-0.00077,0.660014,0.90478,0.788543,0.824618,0.383134,...,0,0,0,0,0,1,0,0,0,-0.001152
2,2010-08-25,FMOHK7CPCLT3,1622673000.0,0.38102,0.036322,0.361238,0.509406,0.513737,0.088438,0.597968,...,0,0,0,0,0,1,0,0,0,0.021944
3,2010-09-01,FMOHK7CPCLT3,3533210000.0,0.164807,0.000647,0.011318,0.502661,0.320518,0.654605,0.161958,...,0,0,0,0,0,1,0,0,0,0.00225
4,2010-09-08,FMOHK7CPCLT3,1563805000.0,0.274727,0.010258,0.78438,0.291993,0.903812,0.167277,0.316741,...,0,0,0,0,0,1,0,0,0,0.006687
5,2010-09-15,FMOHK7CPCLT3,1312669000.0,0.03401,0.000218,0.287906,0.068714,0.363024,0.549292,0.910582,...,0,0,0,0,0,1,0,0,0,0.000112
6,2010-09-22,FMOHK7CPCLT3,2780770000.0,0.989591,-0.003323,0.205294,0.527446,0.137298,0.175086,0.913233,...,0,0,0,0,0,1,0,0,0,-0.003449
7,2010-09-29,FMOHK7CPCLT3,3154707000.0,0.257527,-0.002062,0.500003,0.290278,0.700611,0.599164,0.148275,...,0,0,0,0,0,1,0,0,0,-0.001729
8,2010-10-06,FMOHK7CPCLT3,3255331000.0,0.253269,-0.000536,0.612562,0.862434,0.692755,0.495828,0.787133,...,0,0,0,0,0,1,0,0,0,-0.000663
9,2010-10-13,FMOHK7CPCLT3,2660440000.0,0.959284,-8.8e-05,0.198665,0.430492,0.704453,0.384669,0.352189,...,0,0,0,0,0,1,0,0,0,-0.000101


In [838]:
valid_dict[294]

Unnamed: 0,date,identifier,market_cap,factor_1,factor_2,factor_3,factor_4,factor_5,factor_6,factor_7,...,sector_15,sector_20,sector_25,sector_30,sector_35,sector_40,sector_45,sector_50,sector_55,target
20,2010-12-29,FMOHK7CPCLT3,923573200.0,0.687551,1.5e-05,0.915746,0.132671,0.329319,0.908277,0.427619,...,0,0,0,0,0,1,0,0,0,1.5e-05
21,2011-01-05,FMOHK7CPCLT3,1561372000.0,0.916147,5.2e-05,0.118025,0.381868,0.900137,0.645844,0.600291,...,0,0,0,0,0,1,0,0,0,5.6e-05
22,2011-01-12,FMOHK7CPCLT3,1710674000.0,0.428958,0.000821,0.409198,0.85798,0.369043,0.614363,0.889826,...,0,0,0,0,0,1,0,0,0,0.000234
23,2011-01-19,FMOHK7CPCLT3,1396937000.0,0.251359,0.009951,0.154886,0.133334,0.849951,0.300134,0.923037,...,0,0,0,0,0,1,0,0,0,0.043727
24,2011-01-26,FMOHK7CPCLT3,608230100.0,0.858637,0.00249,0.366274,0.880441,0.409873,0.166955,0.809363,...,0,0,0,0,0,1,0,0,0,0.002568
25,2011-02-02,FMOHK7CPCLT3,1533689000.0,0.569468,-3.8e-05,0.431303,0.976537,0.344242,0.109536,0.531197,...,0,0,0,0,0,1,0,0,0,-5.7e-05
26,2011-02-09,FMOHK7CPCLT3,2587540000.0,0.460394,-0.00127,0.093151,0.529173,0.804614,0.756791,0.592963,...,0,0,0,0,0,1,0,0,0,-0.001977
27,2011-02-16,FMOHK7CPCLT3,1121523000.0,0.201169,-7.4e-05,0.630244,0.872175,0.856211,0.034487,0.583241,...,0,0,0,0,0,1,0,0,0,-6e-05
28,2011-02-23,FMOHK7CPCLT3,2346798000.0,0.240702,-0.000813,0.169645,0.442821,0.470634,0.385678,0.124181,...,0,0,0,0,0,1,0,0,0,-0.000388
29,2011-03-02,FMOHK7CPCLT3,1125383000.0,0.782713,0.000146,0.217026,0.272463,0.217718,0.585434,0.653876,...,0,0,0,0,0,1,0,0,0,5.8e-05


In [839]:
test_dict[294]

Unnamed: 0,date,identifier,market_cap,factor_1,factor_2,factor_3,factor_4,factor_5,factor_6,factor_7,...,sector_15,sector_20,sector_25,sector_30,sector_35,sector_40,sector_45,sector_50,sector_55,target
42,2011-06-01,FMOHK7CPCLT3,1295900000.0,0.772408,-0.000154,0.09425,0.631536,0.174941,0.384043,0.092641,...,0,0,0,0,0,1,0,0,0,-0.00017
43,2011-06-08,FMOHK7CPCLT3,1969514000.0,0.597463,-0.016755,0.252485,0.078943,0.68197,0.144649,0.611581,...,0,0,0,0,0,1,0,0,0,-0.007673
44,2011-06-15,FMOHK7CPCLT3,285124700.0,0.474382,-0.005034,0.482968,0.752567,0.966634,0.362801,0.141626,...,0,0,0,0,0,1,0,0,0,-0.005364
45,2011-06-22,FMOHK7CPCLT3,952517700.0,0.013793,-4.5e-05,0.924826,0.014213,0.27724,0.289064,0.15089,...,0,0,0,0,0,1,0,0,0,-5.2e-05
46,2011-06-29,FMOHK7CPCLT3,489468700.0,0.011589,-0.001354,0.836333,0.019921,0.929111,0.161085,0.195178,...,0,0,0,0,0,1,0,0,0,-0.003211
47,2011-07-06,FMOHK7CPCLT3,822810700.0,0.324533,-0.000682,0.055669,0.096428,0.675421,0.000487,0.20167,...,0,0,0,0,0,1,0,0,0,-0.000496
48,2011-07-13,FMOHK7CPCLT3,960608800.0,0.575096,-0.00036,0.854678,0.558345,0.335224,0.728194,0.658082,...,0,0,0,0,0,1,0,0,0,-0.000174
49,2011-07-20,FMOHK7CPCLT3,1379855000.0,0.823855,-0.002023,0.138596,0.523724,0.5389,0.997724,0.872698,...,0,0,0,0,0,1,0,0,0,-0.004088
50,2011-07-27,FMOHK7CPCLT3,1372419000.0,0.192977,-0.026612,0.519379,0.286826,0.490035,0.319139,0.610412,...,0,0,0,0,0,1,0,0,0,-0.001
51,2011-08-03,FMOHK7CPCLT3,1353034000.0,0.592175,-0.01215,0.752065,0.954898,0.514077,0.952007,0.966796,...,0,0,0,0,0,1,0,0,0,-0.004335


In [840]:
# Columns to be standardised: market cap plus every column from factor_1 through factor_10.
numerical_features = ['market_cap', *data.loc[:, 'factor_1':'factor_10'].columns]
numerical_features

['market_cap',
 'factor_1',
 'factor_2',
 'factor_3',
 'factor_4',
 'factor_5',
 'factor_6',
 'factor_7',
 'factor_8',
 'factor_9',
 'factor_10']

In [841]:
def preprocess(train_dict, valid_dict, test_dict, numerical_features):
  """Standardise the numerical features using statistics of the training data only.

  The per-column mean and (population, ddof=0) standard deviation are computed over
  all training frames pooled together and then applied to every frame of the three
  splits, so no validation/test information enters the scaling.

  Args:
    train_dict: dict of training DataFrames.
    valid_dict: dict of validation DataFrames.
    test_dict: dict of test DataFrames.
    numerical_features: list of column names to standardise; other columns are kept as is.

  Returns:
    (train_scaled, valid_scaled, test_scaled): new dicts with the same keys holding
    scaled copies. The input frames are not modified, so calling this twice on the
    same inputs gives the same result.

  Raises:
    ValueError: if `train_dict` is empty (raised by `pd.concat`).
  """
  train_df = pd.concat(list(train_dict.values()), ignore_index = True)
  # Reduce explicitly along axis 0. `np.mean(DataFrame)` forwards axis=None to pandas,
  # which pandas >= 2.0 reduces over both axes, yielding one scalar for all columns.
  t_mean = train_df[numerical_features].mean(axis = 0)
  t_std = train_df[numerical_features].std(axis = 0, ddof = 0)
  # A constant column has zero spread; divide by 1 instead so it maps to 0, not NaN/inf.
  t_std = t_std.replace(0, 1.0)

  def _scale(frames):
    # Scale copies so the caller's frames stay untouched.
    scaled = {}
    for key, frame in frames.items():
      frame = frame.copy()
      frame[numerical_features] = (frame[numerical_features] - t_mean)/t_std
      scaled[key] = frame
    return scaled

  return _scale(train_dict), _scale(valid_dict), _scale(test_dict)


In [842]:
# Standardise the numerical columns of every split using training-set statistics.
train_scaled, valid_scaled, test_scaled = preprocess(train_dict, valid_dict, test_dict, numerical_features)

In [843]:
train_scaled[0]

Unnamed: 0,date,identifier,market_cap,factor_1,factor_2,factor_3,factor_4,factor_5,factor_6,factor_7,...,sector_15,sector_20,sector_25,sector_30,sector_35,sector_40,sector_45,sector_50,sector_55,target
0,2010-01-06,AD41WBQFVG43,3.007576,0.925858,-0.050894,1.16042,0.006546,-1.059081,-0.878123,0.882487,...,0,0,0,0,1,0,0,0,0,-6e-06
1,2010-01-13,AD41WBQFVG43,2.321416,1.411373,-4.690735,-0.520816,0.287853,0.509281,1.29656,0.77676,...,0,0,0,0,1,0,0,0,0,-0.008721
2,2010-01-20,AD41WBQFVG43,1.44618,-0.075375,-0.067435,0.32935,-0.412746,1.069342,-1.671132,0.413049,...,0,0,0,0,1,0,0,0,0,-8.1e-05
3,2010-01-27,AD41WBQFVG43,2.41597,-0.804543,-0.051247,0.084116,0.637542,0.160672,-1.191276,-1.507523,...,0,0,0,0,1,0,0,0,0,-1.8e-05
4,2010-02-03,AD41WBQFVG43,1.037958,-0.256449,-0.004505,1.567826,0.450347,1.570282,-0.251137,0.879335,...,0,0,0,0,1,0,0,0,0,0.000318
5,2010-02-10,AD41WBQFVG43,1.386004,0.729715,2.574254,-0.496107,0.045796,-0.543827,-0.881531,0.879288,...,0,0,0,0,1,0,0,0,0,0.003916
6,2010-02-17,AD41WBQFVG43,2.149381,0.322911,-0.042323,1.499064,0.479223,1.292799,0.466489,-1.744327,...,0,0,0,0,1,0,0,0,0,1.1e-05
7,2010-02-24,AD41WBQFVG43,2.304377,1.520095,-0.044594,1.25364,-1.548394,-0.365141,-1.301593,-0.188008,...,0,0,0,0,1,0,0,0,0,-2e-06
8,2010-03-03,AD41WBQFVG43,1.946646,1.074697,-0.090617,-0.15708,-1.620671,0.636171,-0.332881,-0.290542,...,0,0,0,0,1,0,0,0,0,-0.000101
9,2010-03-10,AD41WBQFVG43,1.928016,1.56204,-0.074536,-0.851678,-0.263748,-0.590885,0.073757,-0.529902,...,0,0,0,0,1,0,0,0,0,-0.001119


In [844]:
valid_scaled[0]

Unnamed: 0,date,identifier,market_cap,factor_1,factor_2,factor_3,factor_4,factor_5,factor_6,factor_7,...,sector_15,sector_20,sector_25,sector_30,sector_35,sector_40,sector_45,sector_50,sector_55,target
51,2010-12-29,AD41WBQFVG43,2.310431,-1.303987,-0.079472,0.588921,-0.448953,-0.366864,0.509942,-1.3009,...,0,0,0,0,1,0,0,0,0,-0.0006639602
52,2011-01-05,AD41WBQFVG43,1.892309,-0.677529,-0.043933,0.044272,1.480448,1.649117,0.024248,0.770314,...,0,0,0,0,1,0,0,0,0,1.317891e-06
53,2011-01-12,AD41WBQFVG43,1.005541,0.839178,-0.046598,-1.268706,-1.089714,-0.305079,1.599391,0.881859,...,0,0,0,0,1,0,0,0,0,-3.249597e-05
54,2011-01-19,AD41WBQFVG43,2.237948,-0.631777,0.216065,0.014168,0.618526,-0.15869,-1.333573,-0.623537,...,0,0,0,0,1,0,0,0,0,3.036537e-07
55,2011-01-26,AD41WBQFVG43,2.271114,1.655925,-0.044978,-1.554503,-0.404005,0.271607,-0.285569,0.9449,...,0,0,0,0,1,0,0,0,0,-3.491265e-06
56,2011-02-02,AD41WBQFVG43,1.991037,0.506954,0.703828,-0.575364,-1.631761,0.960918,0.005907,-0.568116,...,0,0,0,0,1,0,0,0,0,0.002191665
57,2011-02-09,AD41WBQFVG43,1.648867,0.216122,-0.040549,1.581574,1.603893,0.325802,1.43205,-0.78506,...,0,0,0,0,1,0,0,0,0,0.0001092242
58,2011-02-16,AD41WBQFVG43,2.418853,-0.350197,-0.256248,-1.048351,1.659639,-1.681391,0.132817,1.546457,...,0,0,0,0,1,0,0,0,0,-0.0007258031
59,2011-02-23,AD41WBQFVG43,2.230132,-0.896564,-0.057832,-1.481678,1.379868,0.597791,1.656875,0.129176,...,0,0,0,0,1,0,0,0,0,-4.351525e-05
60,2011-03-02,AD41WBQFVG43,1.735189,1.61598,-0.22623,-0.529646,1.354722,-0.460746,-0.303834,-1.043137,...,0,0,0,0,1,0,0,0,0,-0.0009517499


In [845]:
test_scaled[0]

Unnamed: 0,date,identifier,market_cap,factor_1,factor_2,factor_3,factor_4,factor_5,factor_6,factor_7,...,sector_15,sector_20,sector_25,sector_30,sector_35,sector_40,sector_45,sector_50,sector_55,target
77,2011-06-29,AD41WBQFVG43,3.098548,-0.181993,0.216015,-1.384515,0.117635,0.315612,1.416154,-1.44645,...,0,0,0,0,1,0,0,0,0,0.000338
78,2011-07-06,AD41WBQFVG43,3.577558,-0.990169,-0.062662,0.55713,0.344843,1.016627,1.338619,-0.336271,...,0,0,0,0,1,0,0,0,0,-0.003415
79,2011-07-13,AD41WBQFVG43,1.447854,1.689364,-0.057127,-1.71433,1.608545,0.694836,-1.247388,1.652065,...,0,0,0,0,1,0,0,0,0,-0.000197
80,2011-07-20,AD41WBQFVG43,2.210881,-0.966797,-1.095929,-0.034127,-0.179827,-0.709755,1.524395,0.042027,...,0,0,0,0,1,0,0,0,0,-0.007665
81,2011-07-27,AD41WBQFVG43,2.185459,0.992205,-0.578275,-1.555547,0.335122,1.083791,-0.222758,0.716265,...,0,0,0,0,1,0,0,0,0,-0.00287
82,2011-08-03,AD41WBQFVG43,2.376408,-0.142333,-0.092574,-0.039059,0.44884,0.969196,0.732377,0.897976,...,0,0,0,0,1,0,0,0,0,-0.000501
83,2011-08-10,AD41WBQFVG43,1.354858,0.353962,-1.267636,0.72335,0.317036,1.275531,0.992831,1.431564,...,0,0,0,0,1,0,0,0,0,-0.011965
84,2011-08-17,AD41WBQFVG43,2.346807,0.418317,-1.404689,1.537948,-0.395326,-0.382871,0.228454,-1.191645,...,0,0,0,0,1,0,0,0,0,-0.007264
85,2011-08-24,AD41WBQFVG43,1.376782,1.611016,-0.398107,1.092565,1.346624,1.245592,1.522509,-1.252009,...,0,0,0,0,1,0,0,0,0,-0.001161
86,2011-08-31,AD41WBQFVG43,2.813165,0.771533,-0.164555,0.983778,-1.39204,-0.463302,0.818201,0.202185,...,0,0,0,0,1,0,0,0,0,-0.000519


In [846]:
train_scaled[0].columns

Index(['date', 'identifier', 'market_cap', 'factor_1', 'factor_2', 'factor_3',
       'factor_4', 'factor_5', 'factor_6', 'factor_7', 'factor_8', 'factor_9',
       'factor_10', 'sector_10', 'sector_15', 'sector_20', 'sector_25',
       'sector_30', 'sector_35', 'sector_40', 'sector_45', 'sector_50',
       'sector_55', 'target'],
      dtype='object')

In [847]:
train_scaled[322].columns

Index(['date', 'identifier', 'market_cap', 'factor_1', 'factor_2', 'factor_3',
       'factor_4', 'factor_5', 'factor_6', 'factor_7', 'factor_8', 'factor_9',
       'factor_10', 'sector_10', 'sector_15', 'sector_20', 'sector_25',
       'sector_30', 'sector_35', 'sector_40', 'sector_45', 'sector_50',
       'sector_55', 'target'],
      dtype='object')

In [848]:
def drop_final_columns(data_dict):
  """Return a copy of `data_dict` whose frames lack the 'date' and 'identifier' columns.

  The input frames are left untouched and columns that are already absent are ignored,
  so the function can be applied repeatedly (e.g. when a cell is re-run) without
  raising. Keys may be arbitrary; they are carried over unchanged.

  Args:
    data_dict: dict of DataFrames.

  Returns:
    A new dict with the same keys and the reduced DataFrames.
  """
  return {
    key: frame.drop(columns = ['date', 'identifier'], errors = 'ignore')
    for key, frame in data_dict.items()
  }

In [849]:
# Strip the non-numeric bookkeeping columns from every split.
train_scaled = drop_final_columns(train_scaled)
valid_scaled = drop_final_columns(valid_scaled)
test_scaled = drop_final_columns(test_scaled)

In [850]:
def plot_model_perf(history, metric = None):
  """Plot training and validation curves stored in a Keras `History` object.

  The loss and validation loss are always drawn. When `metric` is given, the curves
  stored under that name and under 'val_<metric>' are added to the same figure.

  Args:
    history: object whose `.history` dict maps curve names to per-epoch values.
    metric: optional name of an additional metric recorded in `history.history`.
  """
  plt.figure(figsize=(12,8))
  logs = history.history
  epochs = range(len(logs['loss']))
  # (key in the history dict, line colour); the key doubles as the legend label.
  curves = [('loss', 'blue'), ('val_loss', 'red')]
  if metric:
    curves.append((f'{metric}', 'green'))
    curves.append((f'val_{metric}', 'orange'))
  for key, colour in curves:
    plt.plot(epochs, logs[key], color = colour, label = key)
  plt.legend()
  plt.show()

In [860]:
def _series_to_windows(frame, window_size):
    """Turn one series into a dataset of (inputs, label) samples.

    Every sample is built from `window_size + 1` consecutive rows: the first
    `window_size` rows (all columns) are the inputs, and the value in the last column
    of the final row is the label. Series with fewer rows yield no samples.
    """
    rows = tf.data.Dataset.from_tensor_slices(np.asarray(frame, dtype = np.float32))
    windows = rows.window(window_size + 1, shift = 1, drop_remainder = True)
    windows = windows.flat_map(lambda window: window.batch(window_size + 1))
    return windows.map(lambda window: (window[:-1], window[-1, -1]))


def convert_to_tensor(data_dict, window_size = g.window_size, batch_size = g.batch_size, shuffle_buffer = 1000):
    """Build one batched `tf.data.Dataset` of sliding windows from a dict of series.

    Windows are generated per series (they never span two series), concatenated in the
    dict's iteration order, optionally shuffled and finally batched.

    Args:
        data_dict: dict of DataFrames that share the same all-numeric columns, with the
            label in the last column.
        window_size: number of input rows per sample.
        batch_size: number of samples per batch.
        shuffle_buffer: size of the shuffle buffer; pass None or 0 to keep the samples
            in their original order (e.g. for evaluation).

    Returns:
        A `tf.data.Dataset` yielding `(inputs, labels)` batches.

    Raises:
        ValueError: if `data_dict` is empty.
    """
    frames = list(data_dict.values())
    if not frames:
        raise ValueError("data_dict must contain at least one series")
    dataset = _series_to_windows(frames[0], window_size)
    for frame in frames[1:]:
        dataset = dataset.concatenate(_series_to_windows(frame, window_size))
    if shuffle_buffer:
        dataset = dataset.shuffle(shuffle_buffer)
    return dataset.batch(batch_size).prefetch(1)

In [861]:
# Build the windowed, batched datasets for each split with the default window/batch sizes.
# NOTE(review): with the defaults, the validation and test datasets are shuffled as well.
train = convert_to_tensor(train_scaled)
valid = convert_to_tensor(valid_scaled)
test = convert_to_tensor(test_scaled)

In [863]:
tf.keras.backend.clear_session()
def create_time_model(filter_units = 32, lstm_units = 128, kernel_size = 5, dr = .2 ):
    """Build the Conv1D + stacked-LSTM regressor (uncompiled).

    Args:
        filter_units: number of filters of the causal 1-D convolution.
        lstm_units: number of units in each of the two LSTM layers.
        kernel_size: width of the convolution kernel.
        dr: dropout rate used after the convolution block and inside the LSTMs.

    Returns:
        A Keras `Model` mapping inputs of shape (batch, timesteps, 22) to one
        prediction per window, shape (batch, 1).
    """
    input_ = Input(shape=(None,22))
    x = Conv1D(filters = filter_units, kernel_size = kernel_size, strides = 1,
               kernel_initializer='lecun_normal', padding = 'causal')(input_)
    x = BatchNormalization()(x)
    x = Activation('elu')(x)
    x = Dropout(dr)(x)
    x = LSTM(lstm_units, return_sequences = True, dropout = dr)(x)
    # The datasets provide a single scalar label per window, so the last recurrent layer
    # returns only its final state and the head emits one value per sample. Emitting a
    # whole (batch, timesteps, 1) sequence would make the loss broadcast every prediction
    # against every label in the batch instead of comparing them one-to-one.
    x = LSTM(lstm_units, dropout = dr)(x)
    out = Dense(1, kernel_initializer="he_normal")(x)
    model = Model(inputs = input_, outputs = out)

    return model
model = create_time_model()
# NOTE(review): MAPE divides by |y_true|. The target column contains exact zeros and many
# values very close to zero, which makes this loss explode; consider MAE/MSE/Huber
# unless a percentage error is really intended.
mape = tf.keras.losses.MeanAbsolutePercentageError()
# `metrics` is documented as a list; a bare string is rejected by newer Keras versions.
model.compile(loss = mape, optimizer = 'adam', metrics = ['mae'])
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None, 22)]        0         
                                                                 
 conv1d (Conv1D)             (None, None, 32)          3552      
                                                                 
 batch_normalization (BatchN  (None, None, 32)         128       
 ormalization)                                                   
                                                                 
 activation (Activation)     (None, None, 32)          0         
                                                                 
 dropout (Dropout)           (None, None, 32)          0         
                                                                 
 lstm (LSTM)                 (None, None, 128)         82432     
                                                             

In [864]:
# Stop once the validation loss has not improved for 5 epochs and roll back to the best
# weights. The callback has to be passed to `fit`; with `callbacks = None` it is never
# invoked and training always runs for the full 100 epochs.
early_stopping = EarlyStopping(patience = 5, restore_best_weights= True)
history = model.fit(train, epochs = 100, validation_data = valid, callbacks = [early_stopping])
plot_model_perf(history, 'mae')
model.evaluate(valid)

Epoch 1/100
Epoch 2/100
112/913 [==>...........................] - ETA: 1:07 - loss: 38033.8672 - mae: 0.0036

KeyboardInterrupt: ignored