In [1]:
%matplotlib inline
from IPython.display import display
from matplotlib import pyplot as plt

import os
import sys
import torch
import random
import pandas as pd
import numpy as np
import pandas_datareader.data as web
import seaborn as sns
from datetime import datetime

#Plotting 
from pandas.plotting import scatter_matrix

#Libraries for Statistical Models
import statsmodels.api as sm

sys.path.append('../')
from MyPyUtil.logconf import logging
from MyPyUtil.util import show_more_rows, seed_everything

#Diable the warnings
import warnings
warnings.filterwarnings('ignore')


In [2]:
# logging
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)

seed_everything()

pd.options.display.max_columns = None
pd.options.display.expand_frame_repr = False
pd.options.display.float_format = "{:.3f}".format

# max output width in numpy
np.set_printoptions(
    threshold=sys.maxsize,  # 20,
    edgeitems=10,
    linewidth=140,
    formatter=dict(float=lambda x: "%.3g" % x),
)  # float arrays %.3g

task_name = "StockPricePrediction"  # get_filename_of_ipynb()
data_dir = f"{os.getcwd()}/../data/"
log_dir_base = f"{os.getcwd()}/runs/{task_name}"

if os.path.exists(log_dir_base) == False:
    os.makedirs(log_dir_base, exist_ok=True)

log_dir = log_dir_base
print(f"{data_dir}\n{log_dir}")

/mnt/AIWorkSpace/work/AI4StockMarket/StockPricePrediction/../data/
/mnt/AIWorkSpace/work/AI4StockMarket/StockPricePrediction/runs/StockPricePrediction


**download stocks list from ftp://ftp.nasdaqtrader.com/symboldirectory**  
[**reference: nasdaqtrader**](https://www.nasdaqtrader.com/trader.aspx?id=symboldirdefs)

- **Market Category:** The category assigned to the issue by NASDAQ based on Listing Requirements. Values:  
   Q = NASDAQ Global Select MarketSM  
   G = NASDAQ Global MarketSM  
   S = NASDAQ Capital Market

- **Test Issue:** Indicates whether or not the security is a test security. Values:  
   Y = yes, it is a test issue.  
   N = no, it is not a test issue.

- **Financial Status:** Indicates when an issuer has failed to submit its regulatory filings on a timely basis, has failed to meet NASDAQ's continuing listing standards, and/or has filed for bankruptcy. Values include:
  D = Deficient: Issuer Failed to Meet NASDAQ Continued Listing Requirements  
  E = Delinquent: Issuer Missed Regulatory Filing Deadline  
  Q = Bankrupt: Issuer Has Filed for Bankruptcy  
  N = Normal (Default): Issuer Is NOT Deficient, Delinquent, or Bankrupt.  
  G = Deficient and Bankrupt  
  H = Deficient and Delinquent  
  J = Delinquent and Bankrupt  
  K = Deficient, Delinquent, and Bankrupt


In [3]:
from ftplib import FTP

# Connect to the FTP server
ftp = FTP("ftp.nasdaqtrader.com")
ftp.login(user="anonymous", passwd="aaa@aaa.com")

# Change to the desired directory
ftp.cwd("symboldirectory")

# Download a file (e.g., 'example.txt') to the local machine
with open("nasdaqlisted.txt", "wb") as local_file:
    ftp.retrbinary("RETR nasdaqlisted.txt", local_file.write)

# Close the FTP connection
ftp.quit()

'221 Goodbye.'

In [4]:
stocks_list = pd.read_csv("nasdaqlisted.txt", delimiter="|")
stocks_list.drop(index=stocks_list.index[-1], axis=0, inplace=True)
stocks_list = stocks_list[
    (stocks_list["Test Issue"] == "N")
    & (stocks_list["ETF"] == "N")
    & (stocks_list["Financial Status"] == "N")
]
display(stocks_list)
stocks_list.groupby("Market Category").count()["Symbol"]

Unnamed: 0,Symbol,Security Name,Market Category,Test Issue,Financial Status,Round Lot Size,ETF,NextShares
0,AACG,ATA Creativity Global - American Depositary Sh...,G,N,N,100.000,N,N
1,AACI,Armada Acquisition Corp. I - Common Stock,G,N,N,100.000,N,N
2,AACIU,Armada Acquisition Corp. I - Unit,G,N,N,100.000,N,N
3,AACIW,Armada Acquisition Corp. I - Warrant,G,N,N,100.000,N,N
4,AADI,"Aadi Bioscience, Inc. - Common Stock",S,N,N,100.000,N,N
...,...,...,...,...,...,...,...,...
4919,ZURA,Zura Bio Limited - Class A Ordinary shares,S,N,N,100.000,N,N
4920,ZURAW,Zura Bio Limited - Warrant,S,N,N,100.000,N,N
4921,ZVRA,"Zevra Therapeutics, Inc. - Common Stock",Q,N,N,100.000,N,N
4927,ZYME,Zymeworks Inc. - Common Stock,Q,N,N,100.000,N,N


Market Category
G     829
Q    1545
S    1280
Name: Symbol, dtype: int64

In [5]:
display(stocks_list.isnull().sum())
NaN_symbol = stocks_list[stocks_list["Symbol"].isnull()]
display(NaN_symbol)

Symbol              1
Security Name       0
Market Category     0
Test Issue          0
Financial Status    0
Round Lot Size      0
ETF                 0
NextShares          0
dtype: int64

Unnamed: 0,Symbol,Security Name,Market Category,Test Issue,Financial Status,Round Lot Size,ETF,NextShares
3003,,Nano Labs Ltd - Class A Ordinary Shares,G,N,N,100.0,N,N


In [6]:
stocks_list.drop(NaN_symbol.index, inplace=True)
display(stocks_list)
display(stocks_list.groupby("Market Category").count()["Symbol"])

Unnamed: 0,Symbol,Security Name,Market Category,Test Issue,Financial Status,Round Lot Size,ETF,NextShares
0,AACG,ATA Creativity Global - American Depositary Sh...,G,N,N,100.000,N,N
1,AACI,Armada Acquisition Corp. I - Common Stock,G,N,N,100.000,N,N
2,AACIU,Armada Acquisition Corp. I - Unit,G,N,N,100.000,N,N
3,AACIW,Armada Acquisition Corp. I - Warrant,G,N,N,100.000,N,N
4,AADI,"Aadi Bioscience, Inc. - Common Stock",S,N,N,100.000,N,N
...,...,...,...,...,...,...,...,...
4919,ZURA,Zura Bio Limited - Class A Ordinary shares,S,N,N,100.000,N,N
4920,ZURAW,Zura Bio Limited - Warrant,S,N,N,100.000,N,N
4921,ZVRA,"Zevra Therapeutics, Inc. - Common Stock",Q,N,N,100.000,N,N
4927,ZYME,Zymeworks Inc. - Common Stock,Q,N,N,100.000,N,N


Market Category
G     829
Q    1545
S    1280
Name: Symbol, dtype: int64

In [7]:
sectors = pd.read_csv("../resources/sectors.csv").fillna("")
sectors.drop(columns=sectors.columns[0], axis=1, inplace=True)
display(sectors.groupby("Sector").count())

Unnamed: 0_level_0,Symbol
Sector,Unnamed: 1_level_1
,190
Basic Materials,88
Communication Services,183
Consumer Cyclical,324
Consumer Defensive,152
Energy,75
Financial Services,917
Healthcare,977
Industrials,353
Real Estate,112


In [8]:
import yfinance as yfin

tmp_list = stocks_list.merge(sectors, how="left", on="Symbol")
stocks_without_sector = tmp_list[tmp_list.Sector.isna()]
symbols_without_sector = stocks_without_sector.Symbol.values

retrived_tickers_info = [yfin.Ticker(symbol) for symbol in symbols_without_sector]
retrived_sectors = pd.DataFrame(
    data={
        "Symbol": symbols_without_sector,
        "Sector": [
            t.info["sector"] if "sector" in t.info.keys() else ""
            for t in retrived_tickers_info
        ],
    }
)

display(retrived_sectors)

Unnamed: 0,Symbol,Sector


In [9]:
new_sectors = retrived_sectors.merge(sectors, on=["Symbol", "Sector"], how="outer")
new_sectors.to_csv("../resources/sectors.csv")
new_sectors

Unnamed: 0,Symbol,Sector
0,A,Healthcare
1,AACG,Consumer Defensive
2,AACI,Financial Services
3,AACIU,Financial Services
4,AACIW,Financial Services
...,...,...
3996,ZURA,Healthcare
3997,ZURAW,
3998,ZVRA,Healthcare
3999,ZYME,Healthcare


In [10]:
stocks_list = stocks_list.merge(new_sectors, how="left", on="Symbol")

display(stocks_list)
stocks_list.groupby("Sector").Symbol.count()

Unnamed: 0,Symbol,Security Name,Market Category,Test Issue,Financial Status,Round Lot Size,ETF,NextShares,Sector
0,AACG,ATA Creativity Global - American Depositary Sh...,G,N,N,100.000,N,N,Consumer Defensive
1,AACI,Armada Acquisition Corp. I - Common Stock,G,N,N,100.000,N,N,Financial Services
2,AACIU,Armada Acquisition Corp. I - Unit,G,N,N,100.000,N,N,Financial Services
3,AACIW,Armada Acquisition Corp. I - Warrant,G,N,N,100.000,N,N,Financial Services
4,AADI,"Aadi Bioscience, Inc. - Common Stock",S,N,N,100.000,N,N,Healthcare
...,...,...,...,...,...,...,...,...,...
3649,ZURA,Zura Bio Limited - Class A Ordinary shares,S,N,N,100.000,N,N,Healthcare
3650,ZURAW,Zura Bio Limited - Warrant,S,N,N,100.000,N,N,
3651,ZVRA,"Zevra Therapeutics, Inc. - Common Stock",Q,N,N,100.000,N,N,Healthcare
3652,ZYME,Zymeworks Inc. - Common Stock,Q,N,N,100.000,N,N,Healthcare


Sector
                          188
Basic Materials            68
Communication Services    177
Consumer Cyclical         284
Consumer Defensive        124
Energy                     55
Financial Services        864
Healthcare                933
Industrials               299
Real Estate                86
Technology                540
Utilities                  36
Name: Symbol, dtype: int64

In [11]:
stocks_with_empty_sector = stocks_list[stocks_list.Sector == ""]
stocks_list.drop(stocks_with_empty_sector.index, inplace=True)
display(stocks_list)

Unnamed: 0,Symbol,Security Name,Market Category,Test Issue,Financial Status,Round Lot Size,ETF,NextShares,Sector
0,AACG,ATA Creativity Global - American Depositary Sh...,G,N,N,100.000,N,N,Consumer Defensive
1,AACI,Armada Acquisition Corp. I - Common Stock,G,N,N,100.000,N,N,Financial Services
2,AACIU,Armada Acquisition Corp. I - Unit,G,N,N,100.000,N,N,Financial Services
3,AACIW,Armada Acquisition Corp. I - Warrant,G,N,N,100.000,N,N,Financial Services
4,AADI,"Aadi Bioscience, Inc. - Common Stock",S,N,N,100.000,N,N,Healthcare
...,...,...,...,...,...,...,...,...,...
3648,ZUMZ,Zumiez Inc. - Common Stock,Q,N,N,100.000,N,N,Consumer Cyclical
3649,ZURA,Zura Bio Limited - Class A Ordinary shares,S,N,N,100.000,N,N,Healthcare
3651,ZVRA,"Zevra Therapeutics, Inc. - Common Stock",Q,N,N,100.000,N,N,Healthcare
3652,ZYME,Zymeworks Inc. - Common Stock,Q,N,N,100.000,N,N,Healthcare


## Load Stock Data


In [12]:
new_sectors.set_index("Symbol", inplace=True)
new_sectors

Unnamed: 0_level_0,Sector
Symbol,Unnamed: 1_level_1
A,Healthcare
AACG,Consumer Defensive
AACI,Financial Services
AACIU,Financial Services
AACIW,Financial Services
...,...
ZURA,Healthcare
ZURAW,
ZVRA,Healthcare
ZYME,Healthcare


In [13]:
from collections import namedtuple
from MyPyUtil.util import is_contained
from tqdm import tqdm

tick_data_info = namedtuple("tick_data_info", "idx symbol sector tick_data")


def load_stk_data(stk_symbols, start, end, empty_vol_threshold):

    ticks_data = []

    error_download = []
    error_sector_not_found = []
    error_numerical = []
    error_volumn = []
    error_empty = []
    for idx, symbol in enumerate(tqdm(stk_symbols)):
        if new_sectors.loc[symbol].Sector == "":
            log.warning(f"{symbol}: Its sector cannot be found")
            error_sector_not_found.append(symbol)
            continue

        stk_file = f"{data_dir}{symbol}_{start.strftime('%Y%m%d')}-{end.strftime('%Y%m%d')}.csv"
        bLoad = False
        if os.path.isfile(stk_file):
            try:
                _stk_data = pd.read_csv(stk_file).set_index("Date")
                bLoad = True
                log.info(f"read {stk_file} completely!")
            except:
                None
        if bLoad == False:
            # _stk_data = web.get_data_yahoo(stk_tickers, start, end)
            try:
                _stk_data = yfin.download([symbol], start, end).dropna()
                _stk_data.to_csv(stk_file)
                log.info(
                    f"download {symbol} from yfin and write to {stk_file} completely!"
                )
            except:
                error_download.append(symbol)
                continue
        statistics = _stk_data.describe()
        if is_contained(statistics, 0):
            if is_contained(
                statistics.loc[:, ["Open", "High", "Low", "Close", "Adj Close"]], 0
            ) or is_contained(statistics.loc["std"], 0):
                log.warning(f"{symbol}: contains numerical errors. Ignore it.")
                error_numerical.append(symbol)
                continue
            else:
                empty_vol_index = _stk_data[_stk_data["Volume"] == 0].index
                if len(empty_vol_index) > empty_vol_threshold:
                    log.warning(
                        f"The total volume with a value of zero ({len(empty_vol_index)}) is greater than the threshold({empty_vol_threshold}). Ignore it."
                    )
                    error_volumn.append(symbol)
                    continue
                log.info(
                    f"A total of {len(empty_vol_index)} volume values ​​are zero. Delete these data."
                )

                cleaned_data = _stk_data.drop(empty_vol_index)
                log.info(
                    f"The cleaned data size is {len(cleaned_data)}. The original data size is {len(_stk_data)}."
                )
                if len(cleaned_data) == 0:
                    error_empty.append(symbol)
                    log.warning(f"The cleaned data size is {len(cleaned_data)}.")
                    continue
                _stk_data = cleaned_data

        ticks_data.append(
            tick_data_info(idx, symbol, new_sectors.loc[symbol].Sector, _stk_data)
        )
        log.info(f"{symbol}, size:{len(_stk_data)}")
    return ticks_data, (
        error_download,
        error_sector_not_found,
        error_numerical,
        error_volumn,
        error_empty,
    )

In [14]:
log.setLevel(logging.ERROR)

empty_vol_threshold = 10
stks_data, (
    error_download,
    error_sector_not_found,
    error_numerical,
    error_volumn,
    error_empty,
) = load_stk_data(
    stocks_list.Symbol.values,
    datetime(2014, 1, 1),
    datetime(2023, 12, 31),
    empty_vol_threshold,
)
display(len(stks_data))

log.setLevel(logging.INFO)

100%|██████████| 3466/3466 [00:16<00:00, 205.49it/s]


2259

In [26]:
stks_data[0].tick_data

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-02-16,69.750,72.750,64.200,67.500,67.500,1527
2018-02-20,65.700,66.000,63.750,66.000,66.000,353
2018-02-21,60.000,74.850,60.000,71.250,71.250,733
2018-02-22,67.650,71.250,66.750,68.745,68.745,213
2018-02-23,65.250,67.500,65.250,67.500,67.500,360
...,...,...,...,...,...,...
2023-12-22,1.980,2.060,1.900,1.970,1.970,725600
2023-12-26,2.030,2.095,1.990,2.010,2.010,477200
2023-12-27,2.010,2.100,2.010,2.080,2.080,608300
2023-12-28,2.110,2.210,2.080,2.100,2.100,699400


In [31]:
full_calandar = (
    pd.DataFrame({"Date": pd.date_range(start="2014-01-01", end="2023-12-31")})
    .reset_index()
    .set_index("Date")
)

full_calandar.columns = ["time_id"]
full_calandar

Unnamed: 0_level_0,time_id
Date,Unnamed: 1_level_1
2014-01-01,0
2014-01-02,1
2014-01-03,2
2014-01-04,3
2014-01-05,4
...,...
2023-12-27,3647
2023-12-28,3648
2023-12-29,3649
2023-12-30,3650
