# HMM Prep Candlestick Observations

In preparation for the Hidden Markov Model (HMM) we need to prepare the candlestick observations. 
 
The idea is to see if candlestick patterns prior to a prediction can help help with trusting the trend of the prediction.
- Are there candlestick patterns that substantiate the prediction?
    - If so, to what degree? 
- Are there candlestick patterns that contradict the prediction?
    - If so, to what degree? 


In [1]:
# Import Libraries
import os.path
import numpy as np
import talib as ta
import inspect
import pandas as pd
import logging
import sys
import json
import dill
import pandas as pd
import berkeleydb as bdb
import matplotlib.pyplot as plt
import copy

from pricepredict import PricePredict
from datetime import datetime, timedelta


2025-01-22 06:37:29.540640: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-22 06:37:29.558550: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-22 06:37:29.574238: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-22 06:37:29.578381: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-22 06:37:29.590277: I tensorflow/core/platform/cpu_feature_guar

In [2]:
# Use an Object Cache to reduce the prep time for creating and loading the PricePredict objects.
if 'ObjCache' not in globals():
    global ObjCache
    ObjCache = bdb.btopen('ppo_cache.db', 'c')

DirPPO = '../ppo/'
def get_ppo(symbol: str, period: str, refresh_ppo: bool = False):
    
    global ObjCache

    # print(f'Type of ObjCache: {type(ObjCache)}')

    ppo_name = symbol + '_' + period

    # One-off
    # del ObjCache[b'AAPL_D']

    if refresh_ppo:
        if bytes(ppo_name, 'latin1') in ObjCache.keys():
            del ObjCache[bytes(ppo_name, 'latin1')]

    if bytes(ppo_name, 'latin1') in ObjCache.keys():
        print(f"Using Cached PPO: {ppo_name}")    
        ppo = PricePredict.unserialize(ObjCache[bytes(ppo_name, 'latin1')])
        return 'None', ppo
    
    file_name_starts_with = symbol + '_' + period
    # Find all PPO files for the symbol in the PPO directory
    ppo_files = [f for f in os.listdir(DirPPO) if f.startswith(file_name_starts_with) and f.endswith('.dilz')]
    ppo = None
    if len(ppo_files) > 0:
        # Sort the files by date
        ppo_files.sort()
        # Get the latest PPO file
        ppo_file = ppo_files[-1]
        # Unpickle the PPO file using dilz
        print(f"Reading PPO File: {ppo_file}")
        with open(DirPPO + ppo_file, 'rb') as f:
            f_obj = f.read()
            ppo = PricePredict.unserialize(f_obj)
            
    if ppo is None:
        ppo_file = ppo_name
        print(f"Creating PPO: {ppo_file}")
        ppo = PricePredict(symbol,
                           model_dir='../models/',
                           chart_dir='../charts/',
                           preds_dir='../predictions/',
                           period=period)
        # Train the models on 5 yeas of data...
        end_dt = datetime.now()
        start_dt = end_dt - timedelta(days=5*400)
        end_date = end_dt.strftime('%Y-%m-%d')
        start_date = start_dt.strftime('%Y-%m-%d')
        ppo.fetch_train_and_predict(ppo.ticker, 
                                    start_date, end_date, 
                                    start_date, end_date,
                                    period=PricePredict.PeriodWeekly,
                                    force_training=False,
                                    use_curr_model=True,
                                    save_model=False)
        
    # Cache the ppo
    ObjCache[bytes(ppo_name, 'latin1')] = ppo.serialize_me()

    return ppo_file, ppo


In [3]:

# Given a variable that contains a wrapped C library function, the following code returns the name of the wrapped function.
# - inspect.getattr_static(func, '__wrapped__')

def get_candlestick_observations(df):

    if type(df) != pd.core.frame.DataFrame:
        e_txt = "Error: get_all_candlestick_observations() requires a dataframe as input."
        raise Exception(e_txt)
    # Check that Open, High, Low, and Close columns exist in the dataframe
    elif 'Open' not in df.columns or 'High' not in df.columns or 'Low' not in df.columns or 'Close' not in df.columns:
        e_txt = "Error: get_all_candlestick_observations() requires Open, High, Low, and Close columns in the dataframe."
        raise Exception(e_txt)
    
    ta_funcs = [ ta.CDL2CROWS, ta.CDL3BLACKCROWS, ta.CDL3INSIDE, ta.CDL3LINESTRIKE, ta.CDL3OUTSIDE, ta.CDL3STARSINSOUTH, ta.CDL3WHITESOLDIERS, ta.CDLABANDONEDBABY, ta.CDLADVANCEBLOCK, ta.CDLBELTHOLD, ta.CDLBREAKAWAY, ta.CDLCLOSINGMARUBOZU, ta.CDLCONCEALBABYSWALL, ta.CDLCOUNTERATTACK, ta.CDLDARKCLOUDCOVER, ta.CDLDOJI, ta.CDLDOJISTAR, ta.CDLDRAGONFLYDOJI, ta.CDLENGULFING, ta.CDLEVENINGDOJISTAR, ta.CDLEVENINGSTAR, ta.CDLGAPSIDESIDEWHITE, ta.CDLGRAVESTONEDOJI, ta.CDLHAMMER, ta.CDLHANGINGMAN, ta.CDLHARAMI, ta.CDLHARAMICROSS, ta.CDLHIGHWAVE, ta.CDLHIKKAKE, ta.CDLHIKKAKEMOD, ta.CDLHOMINGPIGEON, ta.CDLIDENTICAL3CROWS, ta.CDLINNECK, ta.CDLINVERTEDHAMMER, ta.CDLKICKING, ta.CDLKICKINGBYLENGTH, ta.CDLLADDERBOTTOM, ta.CDLLONGLEGGEDDOJI, ta.CDLLONGLINE, ta.CDLMARUBOZU, ta.CDLMATCHINGLOW, ta.CDLMATHOLD, ta.CDLMORNINGDOJISTAR, ta.CDLMORNINGSTAR, ta.CDLONNECK, ta.CDLPIERCING, ta.CDLRICKSHAWMAN, ta.CDLRISEFALL3METHODS, ta.CDLSEPARATINGLINES, ta.CDLSHOOTINGSTAR, ta.CDLSHORTLINE, ta.CDLSPINNINGTOP, ta.CDLSTALLEDPATTERN, ta.CDLSTICKSANDWICH, ta.CDLTAKURI, ta.CDLTASUKIGAP, ta.CDLTHRUSTING, ta.CDLTRISTAR, ta.CDLUNIQUE3RIVER, ta.CDLUPSIDEGAP2CROWS ]
    
    # Create a list of the c-lib function names
    func_names = [inspect.getattr_static(func, '__wrapped__').__name__ for func in ta_funcs]

    opens = df['Open']
    highs = df['High']
    lows = df['Low']
    closes = df['Close']
    
    # Run each function on the dataframe
    obs = {}
    for i, func in enumerate(ta_funcs):
        try:
            obs[func_names[i]] = func(opens, highs, lows, closes)
        except Exception as e:
            e_txt = f"Error: Exception in get_all_candlestick_observations() on ta-lib function [{func_names[i]}]: {e}"
    # Create a list of the candlestick patterns that were observed
    obs_list = []
    for key in obs.keys():
        if obs[key].sum() != 0:    
            obs_list.append(key)
            
    # Return the raw observations and the list of observed candlestick patterns
    return obs, obs_list

ppo_file, ppo = get_ppo('AAPL', 'D', refresh_ppo=True)
# Perform an a price prediction will also run an analysis of the prediction, 
# this will fill in ppo.trends_cose and ppo.trends_adj_close
ppo.predict_price(None, start_date=ppo.date_start, end_date=ppo.date_end)
raw_cdl_obs, cdl_obs = get_candlestick_observations(ppo.orig_data)

# ---------- Output Results ----------

tbl = pd.DataFrame(columns=['Candle Pattern','Occurrences','Data Length'])
# Count the number of observations for each pattern in cdl_obs
for cdl in cdl_obs:
    data_arr = [cdl, np.count_nonzero(raw_cdl_obs[cdl]), len(raw_cdl_obs[cdl])]    
    new_row = pd.DataFrame([data_arr], columns=tbl.columns)
    tbl = pd.concat([tbl, new_row], ignore_index=True)
tbl

Creating PPO: AAPL_D


[*********************100%***********************]  1 of 1 completed
I0000 00:00:1737556653.194741 1435231 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-01-22 06:37:33.225361: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2343] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Unnamed: 0,Candle Pattern,Occurrences,Data Length
0,CDL2CROWS,2,287
1,CDL3INSIDE,7,287
2,CDL3OUTSIDE,13,287
3,CDL3WHITESOLDIERS,1,287
4,CDLADVANCEBLOCK,6,287
5,CDLBELTHOLD,45,287
6,CDLCLOSINGMARUBOZU,57,287
7,CDLDARKCLOUDCOVER,3,287
8,CDLDOJI,32,287
9,CDLDOJISTAR,4,287


In [13]:
len(ppo.pred_rank), len(ppo.trends_corr), len(ppo.date_data), ppo.dateEnd_pred, ppo.date_data.iloc[-1]

(272, 271, 287, '2025-01-22', Timestamp('2025-01-26 00:00:00'))