# Testing Data Loading from SSD
I'm specifically testing 1) whether loading data directly from the SSD is possible, and 2) how fast it is, especially compared with loading the same data from my local computer.

## Imports

In [44]:
import pandas as pd
import numpy as np
import yfinance as yf
from datetime import date
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.plotting import heatmap
import dask
import dask.dataframe as dd
from dask import delayed
from pyarrow.parquet import ParquetFile
import pyarrow as pa
from tqdm import tqdm

import tulipy as ti

import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.decomposition import IncrementalPCA

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras import initializers
from tensorflow.keras.models import load_model
from tensorflow.keras.utils import register_keras_serializable
from tensorflow.keras.optimizers import SGD
import keras_tuner as kt
from keras_tuner import HyperParameters

import os
import sys
import warnings

## Trying to get data from SSD
We have our filtered PM 1-min data stored on an SSD, and we're now going to try accessing it.

In [8]:
# Getting the current directory (shown before any os.chdir call below changes it)
os.getcwd()

'/Users/danielwang/Desktop/Work stuff/Coding Stuff/Day Trading Stuff/Trading-Strategies-With-ML/Archive'

In [66]:
# Build the SSD root path and the two filtered parquet folder paths under it.
# Every path keeps its trailing slash.
ssd_name = 'T7'
ssd_path = f'/Volumes/{ssd_name}/'
filtered_parquet_PM_path = f'{ssd_path}filtered-parquet-PM/'
filtered_parquet_PM_HL_path = f'{ssd_path}filtered-parquet-PM-HL/'

# Changing the directory to the filtered-parquet-PM folder, then displaying the
# new working directory to confirm the change took effect.
# Note: relative paths used after this point resolve against this folder
# until another os.chdir call runs.
os.chdir(filtered_parquet_PM_path)
os.getcwd()

'/Volumes/T7/filtered-parquet-PM'

In [69]:
# Paths to the additional 'filtered-parquet-PM 2' and 'filtered-parquet-PM 3' folders on the SSD
filtered_parquet_PM_path_2 = f'{ssd_path}filtered-parquet-PM 2/'
filtered_parquet_PM_path_3 = f'{ssd_path}filtered-parquet-PM 3/'

In [70]:
# Count the entries in the 'filtered-parquet-PM 2' folder
pm_2_entries = os.listdir(filtered_parquet_PM_path_2)
len(pm_2_entries)

2990

In [77]:
# Count the entries in the 'filtered-parquet-PM 3' folder
pm_3_entries = os.listdir(filtered_parquet_PM_path_3)
len(pm_3_entries)

1188

In [78]:
# Switch into the 'filtered-parquet-PM 3' folder and count the entries in AAPL's parquet folder.
# Note: os.chdir changes the working directory for every later relative path
# (such as the relative 'NTLA_1min_parquet' folder name used further down).
os.chdir(filtered_parquet_PM_path_3)
aapl_entries = os.listdir('AAPL_1min_parquet')
len(aapl_entries)

8552

In [43]:
# Count the ticker folders inside the filtered-parquet-PM-HL folder
hl_entries = os.listdir(filtered_parquet_PM_HL_path)
len(hl_entries)

4146

In [46]:
# Use NTLA as the test ticker and count the entries in its parquet folder.
# NOTE(review): the folder name is relative, so it resolves against whatever the
# working directory is when this cell runs — confirm which parent folder is intended.
NTLA_parq_folder = 'NTLA_1min_parquet'
ntla_entries = os.listdir(NTLA_parq_folder)
len(ntla_entries)

22

In [53]:
# Load NTLA's filtered PM parquet folder with dask, floor the timestamps to whole
# seconds, re-index on them, and repartition into 5-day chunks.
#
# Optional: to restrict to a specific datetime range, slice right after
# read_parquet (before .reset_index()). The range you choose MUST have data in
# it for this to work, e.g.
#   .loc['2024-07-15 00:00':'2024-07-30 00:00']
#   .loc['2024-12-01 00:00':'2024-12-30 00:00']
df_NTLA = (
    dd.read_parquet(NTLA_parq_folder)
    # Move 'timestamp' out of the index so it can be floored as a regular column
    .reset_index()
    .assign(timestamp=lambda frame: frame['timestamp'].dt.floor('s'))
    # Re-index on the floored timestamps; needed to repartition properly
    .set_index('timestamp')
    # Repartitioning into 5-day chunks because the data is pretty sparse
    .repartition(freq='5D')
)

df_NTLA

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Ticker
npartitions=516,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-11-02 04:03:00,float64,float64,float64,float64,float64,string
2017-11-05 00:00:00,...,...,...,...,...,...
...,...,...,...,...,...,...
2024-11-18 00:00:00,...,...,...,...,...,...
2024-11-18 15:59:00,...,...,...,...,...,...


In [65]:
# Materialize partitions 250 through 299 of the dask frame to inspect actual rows
selected_partitions = df_NTLA.partitions[250:300]
selected_partitions.compute()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Ticker
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-06-28 04:06:00,130.00,130.0000,130.00,130.0000,295.0,NTLA
2021-06-28 04:07:00,125.00,125.0000,123.99,123.9900,238.0,NTLA
2021-06-28 04:08:00,123.99,123.9900,123.99,123.9900,0.0,NTLA
2021-06-28 04:09:00,121.99,122.0000,121.99,122.0000,200.0,NTLA
2021-06-28 04:10:00,122.00,122.0000,122.00,122.0000,0.0,NTLA
...,...,...,...,...,...,...
2021-06-30 15:55:00,163.65,163.9399,162.83,163.0200,37309.0,NTLA
2021-06-30 15:56:00,163.21,163.4000,161.58,161.9499,58386.0,NTLA
2021-06-30 15:57:00,161.91,162.2800,161.56,162.0700,80103.0,NTLA
2021-06-30 15:58:00,162.11,162.5900,161.99,162.4199,63782.0,NTLA


In [48]:
# Read the NTLA parquet folder with pandas and count how many rows fall on each calendar date.
# Note: this rebinds df_NTLA from the dask frame built above to a pandas DataFrame.
df_NTLA = pd.read_parquet(NTLA_parq_folder)
row_dates = pd.DataFrame(df_NTLA.index.date)
row_dates.value_counts()

2024-11-18    720
2020-06-03    718
2017-11-02    717
2021-06-28    714
2020-12-02    710
2021-06-30    705
2022-12-01    623
2024-10-24    612
2022-08-03    563
2023-11-01    540
2022-09-16    536
Name: count, dtype: int64