# Importing Packages

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from datetime import datetime
import time
import dask
import dask.dataframe as dd
from dask import delayed

from pyarrow.parquet import ParquetFile
import pyarrow as pa
from tqdm import tqdm
import os, sys
import shutil

# Parquet Google Drive Loading Testing

In [3]:
# Getting the Google Drive path after downloading Google Drive API to laptop
# NOTE(review): this is an absolute, machine- and account-specific path; the
# cells below only work where the same Drive folder is synced to this location
GDrive_path_AAPL = '/Users/danielwang/Library/CloudStorage/GoogleDrive-daniel.wang730@gmail.com/.shortcut-targets-by-id/1-hyRREV8QnovDOIb88U0fI9seA-_K8kJ/Backtesting/FirstRateData/Stocks/filtered/filtered-parquet/AAPL_1min_parquet/2024-07-15_AAPL_1min.parquet'

In [4]:
# Loading test: read the single-day AAPL 1-minute parquet from the Drive path
# and show its first rows to check that the file is readable
df = pd.read_parquet(GDrive_path_AAPL)
df.head()

Unnamed: 0,Date,Datetime,Open,High,Low,Close,Volume,Ticker
3737555,2024-07-15,2024-07-15 04:00:00,232.22,233.26,232.22,233.26,17697,AAPL
3737556,2024-07-15,2024-07-15 04:01:00,233.25,233.75,233.25,233.55,7151,AAPL
3737557,2024-07-15,2024-07-15 04:02:00,233.5,234.0,233.5,233.81,6579,AAPL
3737558,2024-07-15,2024-07-15 04:03:00,233.71,233.89,233.61,233.7,6007,AAPL
3737559,2024-07-15,2024-07-15 04:04:00,233.69,233.7,232.0,233.44,6335,AAPL


In [5]:
# Same loading test for a second ticker (TSLA, 2024-07-15)
GDrive_path_sample = '/Users/danielwang/Library/CloudStorage/GoogleDrive-daniel.wang730@gmail.com/.shortcut-targets-by-id/1-hyRREV8QnovDOIb88U0fI9seA-_K8kJ/Backtesting/FirstRateData/Stocks/filtered/filtered-parquet/TSLA_1min_parquet/2024-07-15_TSLA_1min.parquet'

# Note: this overwrites the `df` loaded in the previous cell
df = pd.read_parquet(GDrive_path_sample)
df.head()

Unnamed: 0,Date,Datetime,Open,High,Low,Close,Volume,Ticker
2065219,2024-07-15,2024-07-15 04:00:00,260.51,262.14,255.88,261.99,55086,TSLA
2065220,2024-07-15,2024-07-15 04:01:00,261.55,262.05,261.06,262.0,22543,TSLA
2065221,2024-07-15,2024-07-15 04:02:00,261.92,262.01,260.79,260.96,25075,TSLA
2065222,2024-07-15,2024-07-15 04:03:00,260.78,260.85,260.0,260.83,15186,TSLA
2065223,2024-07-15,2024-07-15 04:04:00,260.85,261.37,260.83,261.19,8221,TSLA


In [6]:
# Same loading test for a third ticker (UAL, 2024-07-15); the rows shown below
# are not consecutive minutes, i.e. minutes without a bar have no row
GDrive_path_sample = '/Users/danielwang/Library/CloudStorage/GoogleDrive-daniel.wang730@gmail.com/.shortcut-targets-by-id/1-hyRREV8QnovDOIb88U0fI9seA-_K8kJ/Backtesting/FirstRateData/Stocks/filtered/filtered-parquet/UAL_1min_parquet/2024-07-15_UAL_1min.parquet'

# Note: this overwrites both `GDrive_path_sample` and `df` from the previous cell
df = pd.read_parquet(GDrive_path_sample)
df.head()

Unnamed: 0,Date,Datetime,Open,High,Low,Close,Volume,Ticker
1532420,2024-07-15,2024-07-15 04:01:00,44.85,44.85,44.85,44.85,179,UAL
1532421,2024-07-15,2024-07-15 07:00:00,44.99,45.0,44.99,45.0,224,UAL
1532422,2024-07-15,2024-07-15 07:01:00,44.95,44.95,44.95,44.95,166,UAL
1532423,2024-07-15,2024-07-15 07:06:00,44.9,44.9,44.9,44.9,100,UAL
1532424,2024-07-15,2024-07-15 07:37:00,44.86,44.86,44.85,44.85,320,UAL


In [13]:
# Path to the backtesting files folder (filtered-parquet), which holds one
# <TICKER>_1min_parquet subfolder per ticker
backtesting_files_path = '/Users/danielwang/Library/CloudStorage/GoogleDrive-daniel.wang730@gmail.com/.shortcut-targets-by-id/1-hyRREV8QnovDOIb88U0fI9seA-_K8kJ/Backtesting/FirstRateData/Stocks/filtered/filtered-parquet'

In [14]:
# Changing the directory to the backtesting files
# (later cells call os.listdir() and os.remove() with relative paths, so they
# depend on this being the current working directory)
os.chdir(backtesting_files_path)

In [15]:
# Confirm that the working directory is now the filtered-parquet folder
os.getcwd()

'/Users/danielwang/Library/CloudStorage/GoogleDrive-daniel.wang730@gmail.com/.shortcut-targets-by-id/1-hyRREV8QnovDOIb88U0fI9seA-_K8kJ/Backtesting/FirstRateData/Stocks/filtered/filtered-parquet'

# Getting even more filtered stocks with more stringent PM conditions (called fil_v1) (should only need to do once)
- Added 100,000 minimum PM shares requirement (initially 50,000, later raised)
- Added a condition that the timestamp of the first row must be no later than 8am (the beginning of PM for most stocks); this is primarily to filter out stocks that don't have PM data, but also to make sure there's enough PM data for it to be valuable

(Btw, we technically only need to do this step once, since once we have this data, we're good)

In [12]:
# Next steps:
# Figure out conditions for pre-market testing (basically Done)
# - Pre-market gappers, etc. (but make it broad enough) (basically Done)
#    - At least 50,000 shares in the PM (Done)
#    - Other conditions (avg vol, atr, and price threshold) are included already
# - Also make sure to only keep the stocks that have data at 4am/8am (Done)
# Create a script to filter for stocks that meet those conditions (Done)
#
# Find PM high and low (Done)
# Change to 100,000 shares in the PM (Done)
#
# Open is outside the daily range (from previous day) (add this in separately)
#
# Could also create a dictionary or list or a csv file to write to that has the
# exact files (and tickers) that should be loaded in
#
# Keep in mind that the GOAL is to just get tickers where the PM conditions are
# satisfied, and then ultimately combine those stocks and trade them

In [37]:
# Remove .DS_Store if it exists (macOS Finder metadata file that would otherwise
# show up in the os.listdir() of the ticker folders)
try:
  os.remove('.DS_Store')
except FileNotFoundError:
  # Nothing to clean up. Any other error (e.g. permissions) is deliberately
  # NOT swallowed, since a leftover .DS_Store would be counted as a ticker
  pass

In [38]:
# Getting the tickers of all tickers in filtered-parquet folder

# Make sure your current working directory (can use os.getcwd() to get it) is
# the filtered-parquet folder before running this cell.
# Only directories are kept: a stray file (.DS_Store, Drive metadata, ...) would
# otherwise be counted as a ticker and later break os.listdir() on it
with os.scandir() as entries:
  filtered_parquet_folders = sorted(entry.name for entry in entries if entry.is_dir())

# Getting just the names of the tickers (the part before the first underscore,
# e.g. 'AAL_1min_parquet' -> 'AAL')
split_folder_names = [folder_name.split('_')[0] for folder_name in filtered_parquet_folders]

# If these two counts differ, two folders share the same ticker prefix
print("Total number of unique folders:", len(set(filtered_parquet_folders)))
print("Total number of unique tickers:", len(set(split_folder_names)))

Total number of unique folders: 3148
Total number of unique tickers: 3148


In [39]:
# Trial subset: only the first two ticker folders, which is what the filter
# loop below iterates over
test_folder_names = filtered_parquet_folders[:2]
test_folder_names

['AADI_1min_parquet', 'AAL_1min_parquet']

In [40]:
# Getting dictionary of folders connected to list of all eligible parquet files
# (Will call this "filtered version 1" or fil_v1)
fil_v1_dict = {}      # ticker folder name -> parquet file names that pass both PM filters
fil_PM_high_low = {}  # parquet file name -> [PM high, PM low]

# Filter settings, defined once here instead of being rebuilt for every file
# ^8am because that's when PM trading for pretty much ALL stocks begins
# ^Some bigger stocks begin trading at 4am, but that'll be included too
data_time_threshold = pd.to_datetime('2000-01-01 08:00').time()
# Rows strictly before this time count as pre-market (PM)
open_time = pd.to_datetime('2000-01-01 09:30').time()
# Minimum total PM volume (in shares) for a date to be kept
min_PM_volume = 100_000

# The filters below only touch these columns, so don't load the rest of each file
PM_columns = ['Datetime', 'High', 'Low', 'Volume']

# Running the loop to fill the dictionaries above
for ticker_folder_path in tqdm(test_folder_names):

  # Just getting some timing
  start_time = time.time()

  # Initializing the dictionary entry for this ticker folder
  ticker_fil_v1_parquet_list = []

  # Getting a list of parquets with their dates. Anything that is not a parquet
  # file (e.g. .DS_Store) is ignored so read_parquet never sees it, and sorting
  # makes the processing order reproducible (os.listdir order is arbitrary)
  list_parquet_w_dates = sorted(
    file_name for file_name in os.listdir(ticker_folder_path)
    if file_name.endswith('.parquet')
  )

  # Getting a parquet of a particular date for that ticker
  # (leave=False clears each finished inner bar so the output isn't flooded)
  for parquet_w_date in tqdm(list_parquet_w_dates, desc=ticker_folder_path, leave=False):

    # Getting the name of the parquet path for this ticker so we can access it
    ticker_parquet_path = os.path.join(ticker_folder_path, parquet_w_date)

    # Read only the needed columns of the parquet into a dataframe
    # (Reading just the first row through pyarrow's ParquetFile was tried for
    # speed, but ParquetFile() bugs out for me quite often for some reason)
    df_test = pd.read_parquet(ticker_parquet_path, columns=PM_columns)

    # An empty file has no first row to check, so there is nothing to keep
    if df_test.empty:
      continue

    # Checking to see if the first timestamp is at the latest 8am
    # (relies on the rows being in chronological order)
    if df_test['Datetime'].iloc[0].time() > data_time_threshold:
      continue  # Skip this date if the above condition is not met

    # Getting only up to 9:30am for the parquet files for testing
    df_PM = df_test[df_test['Datetime'].dt.time < open_time]

    # If PM volume is below the minimum, then skip (exactly the minimum passes)
    if df_PM['Volume'].sum() < min_PM_volume:
      continue

    # Getting the max and min of PM
    PM_high = df_PM['High'].max()
    PM_low = df_PM['Low'].min()
    fil_PM_high_low[parquet_w_date] = [PM_high, PM_low]

    # Append this parquet date to the list of acceptable filtered parquet dates
    ticker_fil_v1_parquet_list.append(parquet_w_date)

  fil_v1_dict[ticker_folder_path] = ticker_fil_v1_parquet_list
  print("Finished with ticker folder:", ticker_folder_path)
  print(" - Total number of dates for this ticker:", len(list_parquet_w_dates))
  print(" - Time it took:", time.time() - start_time, '\n')


  0%|                                                     | 0/2 [00:00<?, ?it/s]
  0%|                                                     | 0/7 [00:00<?, ?it/s][A
 14%|██████▍                                      | 1/7 [00:00<00:02,  2.41it/s][A
 29%|████████████▊                                | 2/7 [00:00<00:02,  2.30it/s][A
 43%|███████████████████▎                         | 3/7 [00:01<00:01,  2.77it/s][A
 57%|█████████████████████████▋                   | 4/7 [00:01<00:01,  2.56it/s][A
 71%|████████████████████████████████▏            | 5/7 [00:02<00:00,  2.38it/s][A
 86%|██████████████████████████████████████▌      | 6/7 [00:02<00:00,  2.48it/s][A
100%|█████████████████████████████████████████████| 7/7 [00:02<00:00,  2.50it/s][A
 50%|██████████████████████▌                      | 1/2 [00:03<00:03,  3.06s/it]

Finished with ticker folder: AADI_1min_parquet
 - Total number of dates for this ticker: 7
 - Time it took: 3.0555148124694824 




  0%|                                                  | 0/2279 [00:00<?, ?it/s][A
  0%|                                          | 1/2279 [00:00<14:03,  2.70it/s][A
  0%|                                          | 2/2279 [00:00<17:17,  2.20it/s][A
  0%|                                          | 3/2279 [00:01<16:13,  2.34it/s][A
  0%|                                          | 4/2279 [00:01<16:19,  2.32it/s][A
  0%|                                          | 5/2279 [00:02<14:58,  2.53it/s][A
  0%|                                          | 6/2279 [00:02<14:59,  2.53it/s][A
  0%|▏                                         | 7/2279 [00:02<16:00,  2.36it/s][A
  0%|▏                                         | 8/2279 [00:03<18:20,  2.06it/s][A
  0%|▏                                         | 9/2279 [00:03<17:14,  2.19it/s][A
  0%|▏                                        | 10/2279 [00:04<17:24,  2.17it/s][A
  0%|▏                                        | 11/2279 [00:04<17:37,  2.14

KeyboardInterrupt: 

In [41]:
# Cannot do on personal mac because it takes up too much space