# Data Wrangling – FFT-FFN Vs. TS-LSTM

This notebook is for testing & developing the dataset that will be passed as input to the FFT-FFN & TS-LSTM models, it needs to have:
- Windows as Input Vectors (4 weeks worth of market data condensed into 1 "Window" that is the input vector)
  
**Notes**
- Samples or Windows CAN overlap! (`1, 2, 3 = Window 1` -- `2, 3, 4 = Window 2`)

In [1]:
# Will Prototype Methods on Stock Market Data
import pandas as pd 

df = pd.read_csv("../Data Quality (Stocks)/Stocks_Cleaned.csv")
df.rename(columns={"Name":"ticker"}, inplace=True)
df["date"] = [pd.Timestamp(x) for x in df["date"]]

# Subset Data for 1 Ticker
aapl = df[df["ticker"] == "AAPL"]

df

Unnamed: 0,date,open,high,low,close,volume,ticker
0,2013-02-08,15.07,15.12,14.63,14.75,8407500,AAL
1,2013-02-11,14.89,15.01,14.26,14.46,8882000,AAL
2,2013-02-12,14.45,14.51,14.10,14.27,8126000,AAL
3,2013-02-13,14.30,14.94,14.25,14.66,10259500,AAL
4,2013-02-14,14.94,14.96,13.16,13.99,31879900,AAL
...,...,...,...,...,...,...,...
658515,2018-02-01,76.84,78.27,76.69,77.82,2982259,ZTS
658516,2018-02-02,77.53,78.12,76.73,76.78,2595187,ZTS
658517,2018-02-05,76.64,76.92,73.18,73.83,2962031,ZTS
658518,2018-02-06,72.74,74.56,72.13,73.27,4924323,ZTS


In [4]:
aapl.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1304 entries, 1304 to 2607
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    1304 non-null   datetime64[ns]
 1   open    1304 non-null   float64       
 2   high    1304 non-null   float64       
 3   low     1304 non-null   float64       
 4   close   1304 non-null   float64       
 5   volume  1304 non-null   int64         
 6   ticker  1304 non-null   object        
dtypes: datetime64[ns](1), float64(4), int64(1), object(1)
memory usage: 81.5+ KB


In [3]:
print(df["date"].min(), df["date"].max())

2013-02-08 2018-02-07


In [5]:
df["ticker"].nunique() #505 tickers in data

505

#### Check Missing Trading Data Dates 

- Not pertinent for the scope of this project -- impute prior day val instead

In [38]:
# Check num of Days in Range of Data
from datetime import datetime

def days_between(d1, d2):
    d1 = datetime.strptime(d1, "%Y-%m-%d")
    d2 = datetime.strptime(d2, "%Y-%m-%d")
    diff = abs((d2 - d1).days)
    print(diff)
    return diff


days_between("2018-02-07", "2013-02-08") #1825 days between

1825/5 #5 years of data (365 days per year)

1825


365.0

In [8]:
# Get Number of Weeks Between Dates

def diff(start, end):
    days = pd.to_datetime(end) - pd.to_datetime(start)
    week = int(pd.Timedelta(days).days / 7)
    remainder = pd.Timedelta(days).days % 7
    return str(week) + ' weeks and ' + str(remainder) + ' days'


print(diff(aapl["date"].min(), aapl["date"].max()))

260 weeks and 5 days


In [25]:
1825 * (5/7) #Business Days only in dataset ~1304 days

# Matches the Number of days in a Single Stock's Data
aapl #1304 Business Days (subsetted df)

Unnamed: 0,date,open,high,low,close,volume,ticker
1304,2013-02-08,67.7142,68.4014,66.8928,67.8542,158168416,AAPL
1305,2013-02-11,68.0714,69.2771,67.6071,68.5614,129029425,AAPL
1306,2013-02-12,68.5014,68.9114,66.8205,66.8428,151829363,AAPL
1307,2013-02-13,66.7442,67.6628,66.1742,66.7156,118721995,AAPL
1308,2013-02-14,66.3599,67.3771,66.2885,66.6556,88809154,AAPL
...,...,...,...,...,...,...,...
2603,2018-02-01,167.1650,168.6200,166.7600,167.7800,47230787,AAPL
2604,2018-02-02,166.0000,166.8000,160.1000,160.5000,86593825,AAPL
2605,2018-02-05,159.1000,163.8800,156.0000,156.4900,72738522,AAPL
2606,2018-02-06,154.8300,163.7200,154.0000,163.0300,68243838,AAPL


In [41]:
# Check for 0 Trading Volume Days
aapl[aapl["volume"] <= 1].shape #45 days with zeroes across board -- impute previous?

(45, 7)

In [62]:
aapl[aapl["date"] == "2013-02-15"]

Unnamed: 0,date,open,high,low,close,volume,ticker
1309,2013-02-15,66.9785,67.1656,65.7028,65.7371,97924631,AAPL


In [None]:
# Get Missing Dates & Check other values
from datetime import datetime, timedelta

missing_dates = []

for i, row in aapl.iterrows():
    if row["volume"] == 0:
        missing_dates.append(row.iloc[0])
        #print(row.values) #Example- ['2013-02-18' 0.0 0.0 0.0 0.0 0 'AAPL']

len(missing_dates)


# View Missing Dates & Row Prior 
for date in missing_dates:
    day_before = date - timedelta(days=1)
    split_df = aapl[aapl["date"] == day_before]
    print(day_before)
    print(split_df["volume"])

## Create Windowed Samples

- Each windowed sample contains 20 datapoints that we will pass into the LSTM to predict the next single value (output is a scalar for next value in sequence)

In [13]:
print(X.shape, y.shape)

(1284, 20) (1284,)


In [26]:
X[:1200], y[:1200].shape
X[1200:], y[1200:].shape


(1200, 20)

In [49]:
# Reference Article- https://machinelearningmastery.com/how-to-develop-lstm-models-for-time-series-forecasting/
from numpy import array
import numpy as np 

def split_sequence(sequence, n_steps):
	X, y = list(), list()
	for i in range(len(sequence)):
		# find the end of this pattern
		end_ix = i + n_steps
		# check if we are beyond the sequence
		if end_ix > len(sequence)-1:
			break
		# Define Input sequence and Output (being data and label)
		seq_x, seq_y = sequence[i:end_ix], sequence[end_ix] #up to the last value in the sequence
		X.append(seq_x)
		y.append(seq_y)
	return array(X), array(y) #np.arrays


# Input Sequence for Windowing -- df is sorted already, so values are in increasing time order (can take last few values as test set)
#input_sequence = aapl["high"].to_list()
input_sequence = aapl["volume"].to_list() #currently set to Volume


# N-Time Steps per Window
n_days = 20 #20 days = 4 weeks if 1 month windows (of business days)


# Split Sequence into Windows
X, y = split_sequence(input_sequence, n_days) #gets data in arrays


# Fill All Zeroes with Prior Value
while True:
	I = np.nonzero(X==0)[0]
	if len(I)==0: break
	X[I] = X[I-1]


# Reshape Dims of Data for LSTM & Split into Train/Test Sets
test_set_size = len(X) - 284 #specify test set size and split accordingly
X = X.reshape((len(X), n_days, 1)) #reshapes to- (1284, 20, 1) = [samples, timesteps_per_sample, n_features in timestep]
X_train, y_train = X[:test_set_size], y[:test_set_size]
X_test, y_test =  X[test_set_size:], y[test_set_size:]

####-- NEED GET RANDOM SAMPLES FROM TRAINING DATA (4 WEEK DURATION SAMPLES) --####
#the sequence of when we pass in training samples does not matter, can pass in at random (and may be a benefit to this "dataloader" like method)

# View Arrays & Values -- These windows get passed in as input to LSTM!
#for i in range(len(X)):
#	print(X[i], y[i], "\n")


# Print dims of Train & Test Sets
print("Train - Data-{} Labels-{}".format(X_train.shape, y_train.shape))
print(" Test - Data-{} Labels-{}".format(X_test.shape, y_test.shape))

Train - Data-(1000, 20, 1) Labels-(1000,)
 Test - Data-(284, 20, 1) Labels-(284,)


In [None]:
# Investigate Reshaping of y for MinMaxScaler 
a = np.array([0, 0, 0, 0, 0])
print(a.shape)

print(a)
a.reshape(-1, 1)

In [None]:
# Testing out Methods to Replace zero values in X
import numpy as np 


np.where(X==0, X[15], X)[0]

# Test out on new array -- no go
xt = X
z = np.nonzero(xt==0.)[0]
xt[z] = xt[z-1]

# Works!
while True:
    I=np.nonzero(X==0)[0]
    if len(I)==0: break
    X[I] = X[I-1]

# View Arrays & Values
for i in range(len(xt)):
	print(xt[i], y[i], "\n")

## Experimenting with Randomized Windowed Samples

- Need get arbitrary number of data samples, can overlap between sequences?

In [1]:
from StockDataForLSTM import * 

In [2]:
X_train, y_train, X_test, y_test, _ = readData(data_path="../Data Quality (Stocks)/Stocks_Cleaned.csv", n_days=20, column="volume", ticker="AAPL", normalize=True)

Train - Data-(4716, 1, 20) Labels-(4716, 1)
 Test - Data-(284, 1, 20) Labels-(284, 1)


In [5]:
# Get Output Dimensions for Each new Var

print([i.shape for i in [X_train, y_train, X_test, y_test]])


[(4716, 1, 20), (4716, 1), (284, 1, 20), (284, 1)]


In [8]:
X_train[0]

array([[0.09593431, 0.12553019, 0.2442769 , 0.4816854 , 0.37286677,
        0.16944198, 0.10705677, 0.17842132, 0.363093  , 0.38038193,
        0.14632266, 0.12163395, 0.11262926, 0.08477969, 0.14522489,
        0.2332076 , 0.49679499, 0.71851163, 0.43125902, 0.40370132]])