# CNN Model

In [1]:
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.layers as layers
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tensorflow.keras.layers import Dense, Normalization
%matplotlib inline

In [2]:
UBS = pd.read_csv("UBS_Cleaned_Date.csv")
UBS.date = pd.to_datetime(UBS.date)
UBS = UBS.set_index("date")

In [3]:
UBS = UBS.iloc[14:, :]
UBS.shape

(797, 168)

In [4]:
pd.DataFrame(UBS.isna().sum()).T
# we see that the gain, average gain, average loss and loss columns are causing large numbers of NA's
# will need to come back and solve this issue.

Unnamed: 0,UBS_x,UBS Financial Services Inc.,UBS Investment Bank,UBS Global Wealth Management,UBS Asset Management,Open,High,Low,Close,Volume,...,Dow_Disparity_Move,Dow_Disparity_s_Move,Dow_RSI_Move,target_1,target_2,target_3,target_4,target_5,Stoch_Oscillator_3,Stoch_Oscillator_14
0,18,18,18,18,18,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,3


In [5]:
# Remove any infinitly large or small values
# UBS.replace([np.inf, -np.inf], np.nan, inplace=True)

# Drop remanining NaN or null values
#UBS = UBS.dropna()
#UBS.shape


In [6]:
UBS.head()

Unnamed: 0_level_0,UBS_x,UBS Financial Services Inc.,UBS Investment Bank,UBS Global Wealth Management,UBS Asset Management,Open,High,Low,Close,Volume,...,Dow_Disparity_Move,Dow_Disparity_s_Move,Dow_RSI_Move,target_1,target_2,target_3,target_4,target_5,Stoch_Oscillator_3,Stoch_Oscillator_14
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-01-23,7.916667,0.0,0.0,0.0,0.0,11.166141,11.170377,11.022116,11.056005,2790600.0,...,1,1,0,0,0,1,0,0,,
2019-01-24,11.791667,0.0,0.0,0.0,0.0,11.022117,11.123781,10.963659,11.072948,2265600.0,...,0,0,0,1,1,1,1,1,51.477476,
2019-01-25,5.416667,0.0,0.0,0.0,0.0,11.072949,11.161906,11.064478,11.098366,2884400.0,...,1,1,1,0,0,0,0,1,46.249037,
2019-01-28,12.333333,0.0,0.0,0.0,0.0,10.988229,11.089893,10.971285,11.047533,4429400.0,...,0,0,1,1,1,0,1,0,36.136846,44.62112
2019-01-29,1.541667,0.0,0.0,0.0,0.0,11.089893,11.140725,11.005172,11.022117,2517200.0,...,1,1,1,1,1,1,1,0,27.972038,36.785974


In [7]:
# UBS_1, Basic Varaibles,  
UBS_1 = UBS[UBS.columns.drop(list(UBS.filter(regex='Move')))]
UBS_1 = UBS_1[["target_1", "Volume", 
                "Wiki_total", "Google_total", 
                "nas_close", "dow_close",
                "Close", "nas_vol", 
                 "dow_vol", "Stock_diff",
                ]]
UBS_1.head()  

Unnamed: 0_level_0,target_1,Volume,Wiki_total,Google_total,nas_close,dow_close,Close,nas_vol,dow_vol,Stock_diff
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2019-01-23,0,2790600.0,591.916667,1810.0,7025.77002,24575.619141,11.056005,2274420000,318600000,0.093192
2019-01-24,1,2265600.0,514.791667,1667.0,7073.459961,24553.240234,11.072948,2400290000,320170000,0.016944
2019-01-25,0,2884400.0,493.416667,1440.0,7164.859863,24737.199219,11.098366,2440840000,376890000,0.025417
2019-01-28,1,4429400.0,493.333333,1548.0,7085.680176,24528.220703,11.047533,2435480000,347170000,-0.050833
2019-01-29,1,2517200.0,687.541667,1787.0,7028.290039,24579.960938,11.022117,2089690000,330870000,-0.025416


In [8]:
# for checking NaN values

# UBS_1[UBS_1['Volume'].isnull()]
# pd.DataFrame(Ford.loc["2019-12-31"]).T

In [9]:
pd.set_option('display.max_columns', None)

In [10]:
print(UBS_1.shape)
print(UBS_1.dropna().shape)

(797, 10)
(779, 10)


In [11]:
# split a multivariate sequence into samples
def split_sequences(sequences, n_steps): 
    X, y = list(), list()
    for i in range(len(sequences)):
    # find the end of this pattern
        end_ix = i + n_steps
    # check if we are beyond the dataset
        if end_ix > len(sequences): break
    # gather input and output parts of the pattern
        seq_x, seq_y = sequences[i:end_ix, :-1], sequences[end_ix-1, -1] 
        X.append(seq_x)
        y.append(seq_y)
    return np.array(X), np.array(y)

In [12]:
#splitting into training sets 
n = len(UBS_1)
train_f1 = UBS_1.loc["2019-1-1":'2021-6-30']
val_f1 = UBS_1.loc['2021-7-1':'2021-12-31']
test_f1 = UBS_1.loc['2022-1-1':]

num_features = UBS_1.shape[1]

In [13]:
#normalizing the data, may come back later to take normalization with moving averages to avoid allwoing
# the training set to have access to futre traing data
train_f1_mean = train_f1.mean()
train_f1_std = train_f1.std()

train_df = (train_f1 - train_f1_mean) / train_f1_std
val_df = (val_f1 - train_f1_mean) / train_f1_std
test_df = (test_f1 - train_f1_mean) / train_f1_std

In [14]:
T_input = 60  # see below -- just a number for now

In [15]:
model_f1 = keras.Sequential([
    layers.Conv1D(32, 7, activation="relu", input_shape = [32, 5, 9]),
    layers.Conv1D(32, 7, activation="relu"),
    layers.MaxPooling1D(),
    layers.Flatten(),
    layers.Dense(30, activation="relu"),
    layers.Dense(1),
])
model_f1.compile(optimizer="adam", loss='categorical_crossentropy',
                   metrics=["accuracy"])

ValueError: One of the dimensions in the output is <= 0 due to downsampling in conv1d. Consider increasing the input size. Received input shape [None, 32, 5, 9] which would produce output shape with a zero or negative value in a dimension.

In [None]:
# model_f1 data prep
X_train=train_df.drop("target_1", axis=1)
X_val=val_df.drop("target_1", axis=1)
X_test=test_df.drop("target_1", axis=1)

y_train=train_df["target_1"]
y_val=val_df["target_1"]
y_test=test_df["target_1"]


In [None]:

train_dataset = keras.preprocessing.timeseries_dataset_from_array(
    X_train, 
    y_train,  
    5
)
test_dataset = keras.preprocessing.timeseries_dataset_from_array(
    X_val, 
    y_val,  
    5
)

In [None]:
x_t = list(train_dataset)[0][0].numpy()
y_t = list(train_dataset)[0][1].numpy()
x_t

array([[[-0.50216695,  0.22066315, -0.88809277, ..., -0.81250273,
         -0.25732813, -0.70113182],
        [ 0.46875738,  0.33095479,  1.14319888, ..., -0.73529334,
         -0.24465995,  1.00666459],
        [-0.27861629,  0.16566131,  1.58814848, ..., -0.71041973,
          0.21300844,  1.00667742],
        [-0.58931943,  0.09222954, -0.50762862, ..., -0.71370758,
         -0.02679947, -0.86377808],
        [-0.91837761,  0.09856483, -0.07557611, ..., -0.92581719,
         -0.15832265,  0.35607742]],

       [[ 0.46875738,  0.33095479,  1.14319888, ..., -0.73529334,
         -0.24465995,  1.00666459],
        [-0.27861629,  0.16566131,  1.58814848, ..., -0.71041973,
          0.21300844,  1.00667742],
        [-0.58931943,  0.09222954, -0.50762862, ..., -0.71370758,
         -0.02679947, -0.86377808],
        [-0.91837761,  0.09856483, -0.07557611, ..., -0.92581719,
         -0.15832265,  0.35607742],
        [-0.93475513, -0.09264575, -1.08154911, ..., -0.64711011,
          0.37