In [None]:
# Filter the 7606 FirstRate stocks down to the days meeting the criteria below (all based on daily candles).
# Criteria:
# - every price (open, high, low, close) on a given day is $10 or above
# - ATR (14-day moving average) of $0.50 or above
# - average volume (14-day moving average) on a day of 500,000 shares or above
# - ? if possible, market cap on the day of $50M or above (? Finviz)
# The parquet files of the stock universe will then be copied, restricted to the days that fulfil the criteria.
# That universe of tickers and days is then used for the Stock in Play universe.

In [1]:
import pandas as pd
import numpy as np
import os
from datetime import datetime as dt
import shutil
import sys
import traceback

import requests


# Polygon.io API key, taken from the environment so it never appears in the notebook.
# The value is None when the variable is not set.
API_key = os.environ.get("Polygon_IO_API_Key")


In [2]:
# --- Folders -------------------------------------------------------------
# Daily-candle CSVs that are read by the filtering cell
source_folder = "G:/My Drive/Backtesting/FirstRateData/Stocks/filtered/stocks1day-csv-analyzed/"
# Where the summary CSVs (tickers with no qualifying days, tickers with errors) are written
stats_folder = "G:/My Drive/Backtesting/FirstRateData/Stocks/filtered/"
# Where the per-ticker CSVs with the added condition columns are written
destination_folder = "G:/My Drive/Backtesting/FirstRateData/Stocks/filtered/filtered-1day-csv/"

#sample_ticker = "PLTR"

# --- Screening thresholds ------------------------------------------------
vol_threshold = 500_000              # minimum 14-day average volume (shares)
atr_threshold = 0.5                  # minimum 14-day average daily range ($)
price_threshold = 10                 # minimum for open, high, low and close ($)
market_cap_threshold = 50_000_000    # minimum market cap ($)

In [47]:
# This cell reads every daily-candle CSV in source_folder, adds the 14-day volume, 14-day range
# (ATR) and price-floor condition columns, and writes the augmented frame to destination_folder.
# It also records which tickers never meet all criteria and which files failed to process,
# and saves both lists as CSVs in stats_folder.


def _add_condition_columns(daily):
    """Add the screening columns to a daily-candle frame in place and return it.

    Expects 'Open', 'High', 'Low', 'Close' and 'Volume' columns. The thresholds come from the
    notebook-level vol_threshold, atr_threshold and price_threshold settings.
    """
    # Rows without a full 14-row window get 0 via fillna, so they fail the >= checks below.
    # int64 keeps the integer width identical on every platform.
    daily['Volume_14d'] = daily['Volume'].rolling(window=14).mean().fillna(0).astype('int64')
    daily['vol_condition'] = daily['Volume_14d'] >= vol_threshold

    # NOTE: 'ATR' here is the 14-row mean of High - Low; gaps against the previous close
    # are not part of the calculation.
    daily['range'] = daily['High'] - daily['Low']
    daily['ATR_14d'] = daily['range'].rolling(window=14).mean().fillna(0)
    daily['atr_condition'] = daily['ATR_14d'] >= atr_threshold

    # Every one of the four prices must be at or above the floor
    price_columns = ['Open', 'High', 'Low', 'Close']
    daily['price_condition'] = (daily[price_columns] >= price_threshold).all(axis=1)

    daily['all_conditions'] = daily['vol_condition'] & daily['atr_condition'] & daily['price_condition']
    return daily


start = dt.now()
num_zero_days_counter = 0
num_error_counter = 0
num_zero_days_tickers = []
num_error_tickers = []

# Display-only setting that does not depend on the file, so it is set once rather than per iteration
pd.options.display.float_format = '{:.2f}'.format

for filename in os.listdir(source_folder):
    # Derived before the try so the error handler always reports the current file's ticker
    ticker = str(filename).split('_')[0]

    try:
        df = pd.read_csv(os.path.join(source_folder, filename))
        df['Datetime'] = pd.to_datetime(df['Datetime'])
        df = df.set_index('Datetime')
        df = _add_condition_columns(df)

        # Number of days on which every condition holds (variable name kept from the original cell)
        num_zero_days = int(df['all_conditions'].sum())

        if num_zero_days == 0:
            num_zero_days_counter += 1
            print("no days meet criteria: ", ticker)
            num_zero_days_tickers.append(ticker)

        # The file is written even when no day qualifies; the ticker list above marks those cases
        df.to_csv(destination_folder + ticker + "_full_1day_UNADJUSTED" + "_filtered.csv")
    except Exception as e:
        print(e)
        print("error with " + ticker)
        num_error_counter += 1
        num_error_tickers.append(ticker)
        continue

# Write a single named 'Symbol' column without the index: the parquet-filtering cell reads
# df_excluded_tickers['Symbol'], which an unnamed column (header "0") would not provide.
num_zero_days_tickers_df = pd.DataFrame(num_zero_days_tickers, columns=['Symbol'])
num_error_tickers_df = pd.DataFrame(num_error_tickers, columns=['Symbol'])
num_zero_days_tickers_df.to_csv(stats_folder + "num_zero_days_tickers.csv", index=False)
num_error_tickers_df.to_csv(stats_folder + "num_error_tickers.csv", index=False)

end = dt.now()
print(start)
print(end)
print(end-start)
print("num_zero_days_counter: ", num_zero_days_counter)
print("num_error_counter: ", num_error_counter)

no days meet criteria:  AHT.G
no days meet criteria:  ANG.A
no days meet criteria:  AMCR
no days meet criteria:  ANEB
no days meet criteria:  ALX
no days meet criteria:  ALL.H
no days meet criteria:  ALCY
no days meet criteria:  ALG
no days meet criteria:  ABSI
no days meet criteria:  AMH.H
no days meet criteria:  ANSCU
no days meet criteria:  AHG
no days meet criteria:  AHT.F
no days meet criteria:  AGRIW
no days meet criteria:  AMPGW
no days meet criteria:  ALOT
no days meet criteria:  ALRS
no days meet criteria:  ANSC
no days meet criteria:  ALL.B
no days meet criteria:  ALSAR
no days meet criteria:  ANSCW
no days meet criteria:  AMH.G
no days meet criteria:  AKYA
no days meet criteria:  ALRN
no days meet criteria:  AMPY
no days meet criteria:  ALSAU
no days meet criteria:  ALMS
no days meet criteria:  ALTI
no days meet criteria:  ALTM
no days meet criteria:  AKTX
no days meet criteria:  ANL
no days meet criteria:  AE
no days meet criteria:  AHL.C
no days meet criteria:  ACNB
no day

KeyboardInterrupt: 

In [40]:
# Display how many files raised an exception in the daily filtering loop
num_error_counter

1

In [3]:
# This cell filters tickers and copies their 1-minute parquet files:
#   1. tickers listed in num_zero_days_tickers.csv (no qualifying day) are skipped,
#   2. tickers whose market cap from the Polygon ticker-details endpoint is missing or below
#      market_cap_threshold are skipped,
#   3. for the remaining tickers, the parquet file of every day with all_conditions == True is
#      copied into destination_parquet_folder/<ticker>_1min_parquet/.
# NOTE(review): the market cap request carries no date, so presumably the value is the current
# market cap rather than the cap on each qualifying day - confirm against the Polygon docs.

start = dt.now()
source_filtered_csv_folder = "G:/My Drive/Backtesting/FirstRateData/Stocks/filtered/filtered-1day-csv/"
# test folder with one file
#source_filtered_csv_folder = "G:/My Drive/Backtesting/FirstRateData/Stocks/filtered/test-filtered-1day-csv/"

destination_parquet_folder = "G:/My Drive/Backtesting/FirstRateData/Stocks/filtered/filtered-parquet"
# test folder for destination parquet files
#destination_parquet_folder ="G:/My Drive/Backtesting/FirstRateData/Stocks/filtered/test-filtered-parquet"

source_parquet_folder = "C:/Users/dansk/OneDrive/Documents/source-stocks1min-parquet/"

# Without a key every lookup would be rejected and every ticker silently excluded, so stop early
if not API_key:
    raise RuntimeError("Polygon_IO_API_Key is not set; market cap lookups would fail for every ticker")

# Read in the symbols that did not meet the criteria so they can be excluded from the parquet files.
# dtype=str / keep_default_na=False stop pandas from converting a ticker such as "NA" into NaN.
df_excluded_tickers = pd.read_csv(stats_folder + "num_zero_days_tickers.csv", dtype=str, keep_default_na=False)
# Prefer the 'Symbol' column; fall back to the last column for a file written without that header
symbol_column = 'Symbol' if 'Symbol' in df_excluded_tickers.columns else df_excluded_tickers.columns[-1]
excluded_tickers = df_excluded_tickers[symbol_column].to_list()
excluded_ticker_set = set(excluded_tickers)  # constant-time membership test inside the loop

tickers_included_count = 0
tickers_excluded_count = 0
files_copied_count = 0
files_failed_count = 0
failed_status_counts = {}  # HTTP status -> number of tickers excluded because the lookup failed

for filename in os.listdir(source_filtered_csv_folder):
    # Derived before the try so the error handler always reports the current file's ticker
    filename_str = str(filename)
    ticker = filename_str.split('_')[0]

    try:
        if ticker in excluded_ticker_set:
            tickers_excluded_count += 1
            continue

        # --- market cap check ------------------------------------------------
        market_cap_url = f'https://api.polygon.io/v3/reference/tickers/{ticker}'
        # The timeout keeps one stalled connection from hanging the whole run
        response_cap = requests.get(market_cap_url, params={'apiKey': API_key}, timeout=30)

        if response_cap.status_code != 200:
            # The ticker is excluded, but the status is tallied so transient failures
            # (for example rate limiting) are visible in the summary instead of being silent.
            status = response_cap.status_code
            failed_status_counts[status] = failed_status_counts.get(status, 0) + 1
            tickers_excluded_count += 1
            continue

        # Parse the body once and reuse it
        payload = response_cap.json()
        results = payload.get('results')
        if not results:
            tickers_excluded_count += 1
            continue

        # The field may be absent or null in the response; treat that as "does not qualify"
        market_cap = results.get('market_cap')
        if market_cap is None:
            print(f'No market cap reported: {ticker}')
            tickers_excluded_count += 1
            continue

        print (ticker, market_cap)
        if market_cap < market_cap_threshold:
            print(f'Market cap too low: {ticker}')
            tickers_excluded_count += 1
            continue

        # --- qualifying days -------------------------------------------------
        try:
            df = pd.read_csv(source_filtered_csv_folder + filename_str)
        except pd.errors.ParserError as e:
            print(f"Parser error encountered while reading {filename_str}: {e}")
            tickers_excluded_count += 1
            continue

        df_all_conditions = df[df['all_conditions'] == True].copy()
        # Convert 'Datetime' to datetime objects; unparseable values become NaT
        df_all_conditions['Datetime'] = pd.to_datetime(df_all_conditions['Datetime'], format='mixed', errors='coerce')

        # Report rows with invalid or missing dates, then drop them
        invalid_dates = df_all_conditions[df_all_conditions['Datetime'].isna()]
        if not invalid_dates.empty:
            print("Rows with bad, missing, or incorrectly formatted dates:")
            print(invalid_dates)
        df_all_conditions = df_all_conditions.dropna(subset=['Datetime'])

        # Unique qualifying dates as YYYY-MM-DD strings, in order of appearance
        list_of_dates = df_all_conditions['Datetime'].dt.strftime('%Y-%m-%d').unique().tolist()
        if not list_of_dates:
            # Nothing to copy; avoid creating an empty destination folder
            tickers_excluded_count += 1
            continue

        # --- copy the parquet files -------------------------------------------
        ticker_parquet_folder_source = ticker + "_1min_parquet/"
        destination_sub_folder = os.path.join(destination_parquet_folder, f'{ticker}_1min_parquet')
        destination_sub_folder = os.path.normpath(destination_sub_folder)
        destination_sub_folder = destination_sub_folder.replace(os.sep, '/')

        os.makedirs(destination_sub_folder, exist_ok=True)
        tickers_included_count += 1

        for date_entry in list_of_dates:
            try:
                ticker_parquet_filename = date_entry + f"_{ticker}" + "_1min.parquet"
                source_file = os.path.join(source_parquet_folder, ticker_parquet_folder_source, ticker_parquet_filename)
                source_file = os.path.normpath(source_file)
                source_file = source_file.replace(os.sep, '/')
                destination_file = os.path.join(destination_sub_folder, ticker_parquet_filename)
                destination_file = destination_file.replace(os.sep, '/')
                shutil.copy(source_file, destination_file)
                files_copied_count += 1
            except Exception as e:
                # A day can qualify on the daily candle without a matching 1-minute file;
                # report it and keep going with the remaining dates.
                files_failed_count += 1
                print("error with date " + ticker, date_entry)
                print(f'An error occurred in date loop: {str(e)}')
                continue

    except Exception as e:
        print("error with " + ticker)
        print(f'An error occurred in ticker loop: {str(e)}')
        continue

print('tickers_included_count', tickers_included_count)
print('tickers_excluded_count', tickers_excluded_count)
print('files_copied_count', files_copied_count)
print('files_failed_count', files_failed_count)
if failed_status_counts:
    print('market cap lookups failed, by HTTP status:', failed_status_counts)

end = dt.now()
print(start)
print(end)
print(end-start)
        
        




AMGN 145969244040.12
error with date AMGN 2024-12-11
An error occurred in date loop: [Errno 2] No such file or directory: 'C:/Users/dansk/OneDrive/Documents/source-stocks1min-parquet/AMGN_1min_parquet/2024-12-11_AMGN_1min.parquet'
error with date AMGN 2024-12-12
An error occurred in date loop: [Errno 2] No such file or directory: 'C:/Users/dansk/OneDrive/Documents/source-stocks1min-parquet/AMGN_1min_parquet/2024-12-12_AMGN_1min.parquet'
error with date AMGN 2024-12-13
An error occurred in date loop: [Errno 2] No such file or directory: 'C:/Users/dansk/OneDrive/Documents/source-stocks1min-parquet/AMGN_1min_parquet/2024-12-13_AMGN_1min.parquet'
ALC 41729402000.0
error with date ALC 2024-12-11
An error occurred in date loop: [Errno 2] No such file or directory: 'C:/Users/dansk/OneDrive/Documents/source-stocks1min-parquet/ALC_1min_parquet/2024-12-11_ALC_1min.parquet'
error with date ALC 2024-12-12
An error occurred in date loop: [Errno 2] No such file or directory: 'C:/Users/dansk/OneDrive