In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

import talib
import pandas as pd
from datetime import datetime
import plotly.graph_objects as go
from tqdm import tqdm
from glob import glob

import plotly
import plotly.offline as py
import plotly.graph_objs as go

# My libs
from src.utils.data_util import DataUtil
from src.strategies.candle_stick import CandleStick

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.expand_frame_repr', False)

In [2]:
list_cdls = talib.get_function_groups()['Pattern Recognition']

In [3]:
# list_cdls = ['CDLENGULFING', 'CDL3OUTSIDE', 'CDL3INSIDE', 'CDLHARAMI', 'CDLDRAGONFLYDOJI']

```json
'CDL2CROWS', 'CDL3BLACKCROWS', 'CDL3INSIDE', 'CDL3LINESTRIKE', 'CDL3OUTSIDE', 'CDL3STARSINSOUTH', 'CDL3WHITESOLDIERS', 'CDLABANDONEDBABY', 'CDLADVANCEBLOCK', 'CDLBELTHOLD', 'CDLBREAKAWAY', 'CDLCLOSINGMARUBOZU', 'CDLCONCEALBABYSWALL', 'CDLCOUNTERATTACK', 'CDLDARKCLOUDCOVER', 'CDLDOJI', 'CDLDOJISTAR', 'CDLDRAGONFLYDOJI', 'CDLENGULFING', 'CDLEVENINGDOJISTAR', 'CDLEVENINGSTAR', 'CDLGAPSIDESIDEWHITE', 'CDLGRAVESTONEDOJI', 'CDLHAMMER', 'CDLHANGINGMAN', 'CDLHARAMI', 'CDLHARAMICROSS', 'CDLHIGHWAVE', 'CDLHIKKAKE', 'CDLHIKKAKEMOD', 'CDLHOMINGPIGEON', 'CDLIDENTICAL3CROWS', 'CDLINNECK', 'CDLINVERTEDHAMMER', 'CDLKICKING', 'CDLKICKINGBYLENGTH', 'CDLLADDERBOTTOM', 'CDLLONGLEGGEDDOJI', 'CDLLONGLINE', 'CDLMARUBOZU', 'CDLMATCHINGLOW', 'CDLMATHOLD', 'CDLMORNINGDOJISTAR', 'CDLMORNINGSTAR', 'CDLONNECK', 'CDLPIERCING', 'CDLRICKSHAWMAN', 'CDLRISEFALL3METHODS', 'CDLSEPARATINGLINES', 'CDLSHOOTINGSTAR', 'CDLSHORTLINE', 'CDLSPINNINGTOP', 'CDLSTALLEDPATTERN', 'CDLSTICKSANDWICH', 'CDLTAKURI', 'CDLTASUKIGAP', 'CDLTHRUSTING', 'CDLTRISTAR', 'CDLUNIQUE3RIVER', 'CDLUPSIDEGAP2CROWS', 'CDLXSIDEGAP3METHODS'
```

In [4]:
data_util = DataUtil()
# Selected candlestick patterns initiation
cdl_pattern = CandleStick(list_cdls)

## Important patterns
* CDL3OUTSIDE : This may not work in resistance and support levels. such as ema, vwap
* CDLGAPSIDESIDEWHITE: Very nice move 2 times.
* CDL3INSIDE
* CDLHARAMI: Bearish days Harami (-) has pretty strong move on down side
* DRAGONFLYDOJI: Looks quite a nice reversal pattern. It has a huge moves

# Data Collection

In [5]:
def sing_day_data(file_path):
    """
    Load single data of data with pre processing (epoch, date string)
    """
    df = pd.read_csv(file_path)
    df = data_util.pre_data_process(df)
    return df

In [6]:
# Load all the TF files
all_files = glob('data/STK/1_min/TSLA/*')

source_lst = []
for path in tqdm(all_files):
    source_lst.append(sing_day_data(path))

source = pd.concat(source_lst).reset_index(drop=True)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 568/568 [00:29<00:00, 19.48it/s]


# Analysis

In [11]:
data = source.copy()

In [12]:
# Generate candle stick patterns for given input
data = cdl_pattern.generate_pattern(data)

In [13]:
data.head(100)

Unnamed: 0,date,open,high,low,close,volume,barCount,average,time,date_str,date_epoch,CDL2CROWS,CDL3BLACKCROWS,CDL3INSIDE,CDL3LINESTRIKE,CDL3OUTSIDE,CDL3STARSINSOUTH,CDL3WHITESOLDIERS,CDLABANDONEDBABY,CDLADVANCEBLOCK,CDLBELTHOLD,CDLBREAKAWAY,CDLCLOSINGMARUBOZU,CDLCONCEALBABYSWALL,CDLCOUNTERATTACK,CDLDARKCLOUDCOVER,CDLDOJI,CDLDOJISTAR,CDLDRAGONFLYDOJI,CDLENGULFING,CDLEVENINGDOJISTAR,CDLEVENINGSTAR,CDLGAPSIDESIDEWHITE,CDLGRAVESTONEDOJI,CDLHAMMER,CDLHANGINGMAN,CDLHARAMI,CDLHARAMICROSS,CDLHIGHWAVE,CDLHIKKAKE,CDLHIKKAKEMOD,CDLHOMINGPIGEON,CDLIDENTICAL3CROWS,CDLINNECK,CDLINVERTEDHAMMER,CDLKICKING,CDLKICKINGBYLENGTH,CDLLADDERBOTTOM,CDLLONGLEGGEDDOJI,CDLLONGLINE,CDLMARUBOZU,CDLMATCHINGLOW,CDLMATHOLD,CDLMORNINGDOJISTAR,CDLMORNINGSTAR,CDLONNECK,CDLPIERCING,CDLRICKSHAWMAN,CDLRISEFALL3METHODS,CDLSEPARATINGLINES,CDLSHOOTINGSTAR,CDLSHORTLINE,CDLSPINNINGTOP,CDLSTALLEDPATTERN,CDLSTICKSANDWICH,CDLTAKURI,CDLTASUKIGAP,CDLTHRUSTING,CDLTRISTAR,CDLUNIQUE3RIVER,CDLUPSIDEGAP2CROWS,CDLXSIDEGAP3METHODS,9EMA,20EMA
0,2020-01-03 21:20:00,86.1,86.3,85.8,86.3,326,44,86.1268,21:20:00,2020-01-03,1578100800,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,
1,2020-01-03 21:21:00,86.39,86.82,86.39,86.82,759,81,86.6296,21:21:00,2020-01-03,1578100860,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,
2,2020-01-03 21:22:00,86.86,86.9,86.6,86.74,565,70,86.783,21:22:00,2020-01-03,1578100920,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,
3,2020-01-03 21:23:00,86.62,86.78,86.3,86.3,850,87,86.5308,21:23:00,2020-01-03,1578100980,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,
4,2020-01-03 21:24:00,86.39,86.78,86.36,86.66,437,40,86.5134,21:24:00,2020-01-03,1578101040,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,
5,2020-01-03 21:25:00,86.76,87.2,86.76,87.14,778,89,86.968,21:25:00,2020-01-03,1578101100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,
6,2020-01-03 21:26:00,87.2,87.2,87.0,87.16,481,61,87.1234,21:26:00,2020-01-03,1578101160,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,
7,2020-01-03 21:27:00,87.1,87.52,87.1,87.4,866,105,87.3198,21:27:00,2020-01-03,1578101220,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,
8,2020-01-03 21:28:00,87.46,87.82,87.46,87.82,1109,137,87.628,21:28:00,2020-01-03,1578101280,0,0,0,0,100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,86.926667,
9,2020-01-03 21:29:00,87.85,87.9,87.6,87.7,729,95,87.7478,21:29:00,2020-01-03,1578101340,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,87.081333,


## Find the most frequest pattern

In [9]:
# Move the columns to rows
candles = data.drop(['open', 'high', 'low', 'close', 'volume', 'barCount', 'average','time', 'date_str', 'date_epoch'], axis=1)
candles = candles.melt(id_vars=['date'],
                       var_name="cdl_pattern",
                       value_name="pattern_check")

In [10]:
candles = candles[candles['pattern_check'] != 0]

In [11]:
candles = candles.groupby(['cdl_pattern'])['date'].nunique().reset_index(name='pattern_count').sort_values(['pattern_count'], ascending=False)

In [12]:
candles.to_csv('insights/tsla_candle_patterns.csv', index=False)

## Find all the days that has specific pattern

In [230]:
data_for_visual = source.copy()
cdl_pat = 'CDLENGULFING'
list_cdls = [cdl_pat]
cdl_pattern = CandleStick(list_cdls)
data_for_visual = cdl_pattern.generate_pattern(data_for_visual)

In [231]:
# Find the days which has selected pattern by order
days_with_patterns = data_for_visual[data_for_visual[list_cdls].any(axis='columns')]['date_str']
days_with_patterns = days_with_patterns.value_counts().reset_index(name='count')

In [232]:
days_with_patterns.sort_values(['index'], ascending=False)

Unnamed: 0,index,count
133,2021-07-23,17
155,2021-07-22,16
143,2021-07-21,16
139,2021-07-20,16
213,2021-07-19,14
15,2021-07-16,48
105,2021-07-15,19
328,2021-07-14,11
222,2021-07-13,14
287,2021-07-12,12


In [233]:
day = days_with_patterns.iloc[1]['index']

In [235]:
date = '2021-07-23'

In [236]:
# Filter data by day
df = data_for_visual[data_for_visual['date_str'] == date]

In [237]:
# Filter data for annotation
filtered = df[df[cdl_pat].astype(bool)]

In [238]:
matched_times = df[df[cdl_pat].astype(bool)]['time']

In [239]:
def find_cld_name(x):
    return str(df[df['time'] == x].iloc[0][cdl_pat])

def find_y_axis(x):
    return df[df['time'] == x].iloc[0]['high']



# Draw the lines for reference
shapes_list = []
annotation_list = []
for time in matched_times:
    annotation_list.append(go.layout.Annotation(x=time, y=find_y_axis(time), 
                                                showarrow=True,  arrowhead=1, 
                                                arrowcolor="purple", arrowsize=2, arrowwidth=2, text=find_cld_name(time)))

In [240]:
fig = go.Figure()

fig.add_trace(go.Candlestick(x=df['time'],
        open=df['open'],
        high=df['high'],
        low=df['low'],
        close=df['close'],
        text=df['close']))

fig.update_layout(
    title=f'Analysis on {date}',
    yaxis_title='Price',
    annotations=annotation_list,
    xaxis=go.layout.XAxis(rangeslider=dict (visible = False))
)

import plotly.io as pio
pio.renderers.default = 'browser'
pio.show(fig)
# fig.show()

In [82]:
data_for_visual.head()

Unnamed: 0,date,open,high,low,close,volume,barCount,average,time,date_str,date_epoch,CDLBELTHOLD
0,2020-01-03 21:00:00,84.88,85.33,84.55,85.2,579,60,84.9156,21:00:00,2020-01-03,1578099600,0
1,2020-01-03 21:01:00,85.15,85.19,85.15,85.16,45,7,85.1598,21:01:00,2020-01-03,1578099660,0
2,2020-01-03 21:02:00,85.16,85.16,85.16,85.16,20,3,85.16,21:02:00,2020-01-03,1578099720,0
3,2020-01-03 21:03:00,85.12,85.15,85.1,85.1,20,4,85.121,21:03:00,2020-01-03,1578099780,0
4,2020-01-03 21:04:00,85.11,85.11,85.11,85.11,10,2,85.11,21:04:00,2020-01-03,1578099840,0


In [11]:
df.head()

Unnamed: 0,date,open,high,low,close,volume,barCount,average,time,date_str,date_epoch
0,2020-01-03 21:00:00,84.88,85.33,84.55,85.2,579,60,84.9156,21:00:00,2020-01-03,1578099600
1,2020-01-03 21:01:00,85.15,85.19,85.15,85.16,45,7,85.1598,21:01:00,2020-01-03,1578099660
2,2020-01-03 21:02:00,85.16,85.16,85.16,85.16,20,3,85.16,21:02:00,2020-01-03,1578099720
3,2020-01-03 21:03:00,85.12,85.15,85.1,85.1,20,4,85.121,21:03:00,2020-01-03,1578099780
4,2020-01-03 21:04:00,85.11,85.11,85.11,85.11,10,2,85.11,21:04:00,2020-01-03,1578099840


In [12]:
# Find all the dates 
dates = df['date_str'].unique()

In [13]:
def find_direction(t1, t2, t3, t4):
    """
    Find the next candle move up or down based on the identified pattern
    """
    if (t1 > 0) and (t2 > 0):
        return 2
    elif (t1 < 0)  and (t2 < 0):
        return 1
    else:
        return 0

"""
Find the next candles moves based on daily basis. Because If we don't process this by day. Then the next day candles may
create bias in the process
"""
generated_direction_by_day = []
for date in tqdm(dates):
    tmp_data = df[df['date_str'] == date].copy()
    tmp_data = tmp_data.sort_values(['date'])
    
    # Find the close price on another n time stamps, the timestamp may vary based on time frame (e.g. 1min, 2 min, 5 min)
    # This will help to understand the price movement up or down and by price
    # Change in price + or -
    tmp_data['t+1'] = tmp_data['close'].shift(-1) - tmp_data['close']
    tmp_data['t+2'] = tmp_data['close'].shift(-2) - tmp_data['close'].shift(-1)
    tmp_data['t+3'] = tmp_data['close'].shift(-3) - tmp_data['close'].shift(-2)
    tmp_data['t+4'] = tmp_data['close'].shift(-4) - tmp_data['close'].shift(-3)
    tmp_data['t+5'] = tmp_data['close'].shift(-5) - tmp_data['close'].shift(-4)
    
    # Find the direction based on 3 candles
    tmp_data['price_direction'] = tmp_data.apply(lambda x: find_direction(x['t+1'], x['t+2'], x['t+3'], x['t+4']), axis=1)
    
    generated_direction_by_day.append(tmp_data)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 386/386 [00:05<00:00, 67.67it/s]


In [14]:
# Combine processed data
df  = pd.concat(generated_direction_by_day).reset_index(drop=True)

In [15]:
df.head()

Unnamed: 0,date,open,high,low,close,volume,barCount,average,time,date_str,date_epoch,t+1,t+2,t+3,t+4,t+5,price_direction
0,2020-01-03 21:00:00,84.88,85.33,84.55,85.2,579,60,84.9156,21:00:00,2020-01-03,1578099600,0.0,0.0,-0.04,0.0,0.0,0
1,2020-01-03 21:00:00,84.88,85.33,84.55,85.2,579,60,84.9156,21:00:00,2020-01-03,1578099600,0.0,-0.04,0.0,0.0,0.0,0
2,2020-01-03 21:00:00,84.88,85.33,84.55,85.2,579,60,84.9156,21:00:00,2020-01-03,1578099600,-0.04,0.0,0.0,0.0,0.0,0
3,2020-01-03 21:01:00,85.15,85.19,85.15,85.16,45,7,85.1598,21:01:00,2020-01-03,1578099660,0.0,0.0,0.0,0.0,0.0,0
4,2020-01-03 21:01:00,85.15,85.19,85.15,85.16,45,7,85.1598,21:01:00,2020-01-03,1578099660,0.0,0.0,0.0,0.0,-0.06,0


In [16]:
# Generate candle stick patterns for given input
patterns = cdl_pattern.generate_pattern(df)

In [17]:
# Drop column which are not relevatn to the Prediction
patterns.drop(['date','open', 'high', 'low', 'close', 'volume', 'barCount', 'average',
       'time', 'date_str', 'date_epoch', 't+1', 't+2', 't+3', 't+4', 't+5'], axis=1, inplace=True)

In [18]:
# Filter by rows which contains atleast 1 found pattern
patterns['contains'] = patterns[list_cdls].any(axis='columns')
patterns = patterns[patterns['contains']].copy()
patterns.drop(['contains'], axis=1, inplace=True)

In [19]:
valid_classes = patterns[patterns['price_direction'].isin([1, 2])].copy()

In [20]:
other = patterns[patterns['price_direction'] == 0].copy()

In [21]:
other = other.sample(2800)

In [22]:
patterns= pd.concat([valid_classes, other]).reset_index(drop=True)

In [23]:
patterns['price_direction'].value_counts()

1    10115
2     9950
0     2800
Name: price_direction, dtype: int64

## Prepare model

In [24]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier

In [25]:
from matplotlib import pyplot

In [26]:
X = patterns.drop('price_direction', axis=1).values
y = patterns['price_direction'].values

In [27]:
scalar = preprocessing.StandardScaler().fit(X)
with open('models/scalar.pickle', 'wb') as handle:
    pickle.dump(scalar, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [28]:
X_scaled = scalar.transform(X)

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.33, random_state=42)

In [30]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [31]:
"""
Model Grid Search
"""
param_grid = {'max_features': ['auto', 'sqrt', 'log2'],
              'ccp_alpha': [0.1, .01, .001],
              'max_depth' : [5, 6, 7, 8, 9],
              'criterion' :['gini', 'entropy']
             }

tree_clas = DecisionTreeClassifier(random_state=1024)

grid_search = GridSearchCV(estimator=tree_clas, param_grid=param_grid, cv=5, verbose=True)

grid_search.fit(X_train, y_train)

final_model = grid_search.best_estimator_
final_model

Fitting 5 folds for each of 90 candidates, totalling 450 fits


DecisionTreeClassifier(ccp_alpha=0.001, criterion='entropy', max_depth=7,
                       max_features='auto', random_state=1024)

In [36]:
tree_clas = DecisionTreeClassifier(ccp_alpha=0.001, class_weight=None, criterion='entropy',
                       max_depth=7, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, random_state=1024, splitter='best')
tree_clas.fit(X_train, y_train)
y_predict = tree_clas.predict(X_test)

In [37]:
list(tree_clas.predict(X_train)).count(1)

13382

In [38]:
with open('models/decision_tree.pickle', 'wb') as handle:
    pickle.dump(tree_clas, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Model performance

In [39]:
accuracy_score(y_test, y_predict)

0.4550755367081898

# Creation of New Predictions

In [40]:
import pandas as pd

In [87]:
df = sing_day_data('data/STK/1_min/TSLA/20210503.csv')

In [41]:
df = pd.read_csv(f'realtime_data/data.csv', names=['date', 'open', 'high', 'low', 'close', 'volume', 'barCount', 'average'])
df = data_util.pre_data_process(df)

In [42]:
data_patterns = cdl_pattern.generate_pattern(df)

In [43]:
input_data = data_patterns[list_cdls]

In [44]:
input_data = input_data[input_data.any(axis='columns')]

In [45]:
input_data = input_data.values

In [46]:
input_data = scalar.transform(input_data)

In [47]:
output = tree_clas.predict(input_data)

In [48]:
from src.api_call.simulate_api_call import Simulator

In [49]:
sim_obj = Simulator()

In [50]:
sim_obj.get_prediction_data()

array([[0.12188789, 0.44531847, 0.43279365],
       [0.12188789, 0.44531847, 0.43279365],
       [0.12188789, 0.44531847, 0.43279365],
       [0.12188789, 0.44531847, 0.43279365],
       [0.12188789, 0.44531847, 0.43279365],
       [0.12188789, 0.44531847, 0.43279365],
       [0.12188789, 0.44531847, 0.43279365],
       [0.12188789, 0.44531847, 0.43279365],
       [0.12188789, 0.44531847, 0.43279365],
       [0.12188789, 0.44531847, 0.43279365],
       [0.13140676, 0.38604144, 0.4825518 ],
       [0.12188789, 0.44531847, 0.43279365],
       [0.12188789, 0.44531847, 0.43279365],
       [0.12188789, 0.44531847, 0.43279365],
       [0.12188789, 0.44531847, 0.43279365],
       [0.12188789, 0.44531847, 0.43279365],
       [0.12188789, 0.44531847, 0.43279365],
       [0.12188789, 0.44531847, 0.43279365],
       [0.12188789, 0.44531847, 0.43279365],
       [0.12188789, 0.44531847, 0.43279365],
       [0.12188789, 0.44531847, 0.43279365],
       [0.12188789, 0.44531847, 0.43279365],
       [0.

In [105]:
X_new = patterns.iloc[0]

In [108]:
X_new = X_new.drop('price_direction')

In [110]:
X_new = X_new.values

In [112]:
X_new_scale = scaler.transform([X_new])

In [115]:
tree_clas.predict_proba(X_new_scale)

array([[0.89303661, 0.06389088, 0.04307251]])

In [51]:
importance = tree.coef_[0]
# summarize feature importance
for i,v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i,v))
# plot feature importance
pyplot.bar([x for x in range(len(importance))], importance)
pyplot.show()

AttributeError: 'DecisionTreeClassifier' object has no attribute 'coef_'

In [None]:
score = pd.DataFrame({'name': list_cdls, 'score': importance})

In [None]:
score.sort_values(['score'], ascending=False, inplace=True)

In [None]:
list_cdls = list(score[score['score'] > 0.1]['name'].values)

In [None]:
patterns.drop_duplicates().shape

In [None]:
# Move the columns to rows
candles = patterns.melt(id_vars=['date', 'price_direction'],
                       var_name="cdl_pattern",
                       value_name="pattern_check")

In [None]:
candles.head()

In [None]:
candles_selected = candles[candles['pattern_check'] != 0]

In [None]:
candles_selected = candles_selected.groupby(['date', 'price_direction'])['cdl_pattern'].unique().reset_index()

In [None]:
candles_selected['cdl_pattern'] = candles_selected['cdl_pattern'].apply(lambda x: ' | '.join(sorted(x)))

In [None]:
candles_selected = candles_selected.groupby(['cdl_pattern', 'price_direction'])['date'].nunique().reset_index()

In [None]:
candles_selected.sort_values(['date'], ascending=False, inplace=True)

In [None]:
candles_selected[candles_selected['price_direction'] != -1]

In [None]:
candles

In [None]:
one_pattern = candles[(candles['cdl_pattern'] == 'CDLDRAGONFLYDOJI') & (candles['pattern_check'] != 0)]

In [None]:
one_pattern = one_pattern['price_direction'].value_counts().reset_index()

In [None]:
total = one_pattern['price_direction'].sum()

In [None]:
one_pattern['frac'] = one_pattern['price_direction']/total

In [None]:
one_pattern

In [None]:
# Find the close price on another n time stamps, the timestamp may vary based on time frame (e.g. 1min, 2 min, 5 min)
# This will help to understand the price movement up or down and by price
# Change in price + or -
# Calculate the close price difference and find the percentage of price change
# df['t+1'] = (((df['close'].shift(-1) - df['close'])/df['close']) * 100).round(2)
# df['t+2'] = (((df['close'].shift(-2) - df['close'])/df['close']) * 100).round(2)
# df['t+3'] = (((df['close'].shift(-3) - df['close'])/df['close']) * 100).round(2)
# df['t+4'] = (((df['close'].shift(-4) - df['close'])/df['close']) * 100).round(2)

# Generate Candle Names

In [None]:
import talib

In [None]:
# This provide the previous day night and current day mid morning day
day = '20210601'
path = f'data/STK/1_min/TSLA/{day}.csv'
df = sing_day_data(path)

In [None]:
all_cdl_patterns = talib.get_function_groups()['Pattern Recognition']

In [None]:
# patterns = ['CDLSHORTLINE']

patterns = ['CDLDRAGONFLYDOJI']

In [None]:
for pattern in patterns:
    df[pattern] = getattr(talib, pattern)(df['open'], df['high'], df['low'], df['close'])

In [None]:
df[df[patterns].any(axis='columns')]

In [None]:
filtered = df[df['CDLDRAGONFLYDOJI'].astype(bool)]

In [None]:
matched_times = df[df['CDLDRAGONFLYDOJI'].astype(bool)]['time']

In [None]:
# Draw the lines for reference
shapes_list = []
annotation_list = []
for time in matched_times:
    shapes_list.append(dict(x0=time, x1=time, y0=0, y1=1, xref='x', yref='paper', line_width=1))
    annotation_list.append(dict(x=time, y=0.05, xref='x', yref='paper', showarrow=False, xanchor='left', text='CDLSHORTLINE'))

# Candle Chart
Draw for single day regarless of dates

In [None]:
fig = go.Figure()

fig.add_trace(go.Candlestick(x=df['time'],
        open=df['open'],
        high=df['high'],
        low=df['low'],
        close=df['close'],
        text=df['close']))

fig.update_layout(
    title=f'Analysis on {day}',
    yaxis_title='Price',
    shapes = shapes_list,
    annotations=annotation_list
)

import plotly.io as pio
pio.renderers.default = 'browser'
pio.show(fig)

# Line Chart

In [None]:
line_data = df[['date', 'time' ,'average', 'open']].copy()

In [None]:
# Find the min open price
min_open_price = line_data['open'].min()

In [None]:
# To bring all open price to 0, subtract open price from average
line_data['avg_nor'] = line_data['average'] - line_data['open']

In [None]:
dates = sorted(line_data['date'].unique())

In [None]:
fig = go.Figure()

for date in dates:
    data = line_data[line_data['date'] == date]
    fig.add_trace(go.Scatter(x=data['time'], y=data['avg_nor'], name = date))

import plotly.io as pio
pio.renderers.default = 'browser'
pio.show(fig)