# Claudio: Explainer

In [None]:
from datetime import datetime, time, date, timedelta
import pandas as pd
import sys
import os
# Resolve the project layout relative to the current working directory.
CWD = os.getcwd()
# Platform path separator. The earlier spelling indexed into a string literal
# containing an invalid escape sequence, which modern Python versions warn
# about; os.sep yields the same character ('/' on POSIX, backslash on Windows).
SL = os.sep
# Project root: everything before the first 'Main' in CWD, plus 'Main' and a
# trailing separator.
WD = CWD.split('Main')[0] + 'Main' + SL
# Folder added to the import path; presumably where the Claudio_* helper
# modules imported right after this live (verify against the repo layout).
DFNC = WD + 'Functions'
# Insert at position 1 so sys.path[0] is left untouched.
sys.path.insert(1, DFNC)

from Claudio_Main import *
from Claudio_Chart_Examples import *


## Pattern Data Overview
- The data I am analyzing consists of 15 different technical patterns.

In [None]:
# Pattern-code lookup table; only the code and name columns are printed.
pattern_codes = patterns_source(
    FETCH=False,
    DF='codes',
)
print(pattern_codes.loc[:, ['pcode', 'pname']])

In [None]:
# Chart one pattern by its PUID; the call returns two values, unpacked into
# chart_data and chart_results.
chart_data, chart_results = chart_recent(
    PUID='DB_202003121145_5T_ADM',
    CHART=500,
)


- Each found pattern has a row of data like the below example from a "Double Bottom" (DB)
- There are between 300 and 400 column attributes for each row

In [None]:
# Display the second value returned by chart_recent() in the previous code cell.
chart_results

- Those rows are split by PCODE and saved to separate CSV summary files in the "Patterns_All" folder

In [None]:
# Load the per-pattern CSV for the "Double Bottom" code from Patterns_All.
PCODE = 'DB'
CSV_PCODE = WD + SL.join(['Sources', 'aggs', '5T', 'Results', 'Patterns_All', f'{PCODE}.csv'])
data_pcode = csv_from(CSV_PCODE, LM=True)

# Report row count, column count and the covered date span.
print(
    f"\nThe 'Double Bottom' summery file, for example, has {len(data_pcode):,.0f} rows "
    f"(and {data_pcode.shape[1]} attributes) between {data_pcode['date'].min()} "
    f"and {data_pcode['date'].max()}\n"
)

data_pcode

- Out of the 300 to 400 columns for each pattern (PCODE):
  - some are support columns (i.e. pattern found time, data fetch time, etc), 
  - some are future biased results (i.e. return, exit datetime, etc), 
  - and most are technical backwards looking attributes that I am trying to create predictions from (i.e. open, high, low, volume, rsi, etc)

In [None]:
# Column metadata table used to show a sample of each column group.
pattern_cols = patterns_source(FETCH=False, DF='cols')

# Row masks for the three groups described in the text above.
support_mask = pattern_cols['category'].eq('bars') & pattern_cols['w_col'].eq(False)
bias_mask = pattern_cols['category'].eq('ptrn') & pattern_cols['bias'].eq(True)
technical_mask = pattern_cols['w_col'].eq(True) & pattern_cols['bias'].eq(False)

print(f"SUPPORT COLUMNS\n{pattern_cols.loc[support_mask, ['column']].head(4)}\n")
print(f"FUTURE BIAS COLUMNS\n{pattern_cols.loc[bias_mask, ['column']].head(6)}\n")
print(f"TECHNICAL UN-BIASED COLUMNS\n{pattern_cols.loc[technical_mask, ['column', 'rtype', 'rel_from']].head(6)}\n")


- The raw PCODE data also includes failed patterns (i.e. data['filled'] == 0) and patterns that filled but would be infeasible to actually execute a trade with (data['otf_minutes'] == 0, where 'otf_minutes' means "order to fill minutes").
- I decided to leave these rows in the dataset in case they are helpful in feature selection.
- I use the function primary_filter() to filter the data down to something closer to what I could practically execute a trade with.


In [None]:
# Apply the primary filter to the raw "Double Bottom" rows and preview the result.
# NOTE(review): the meaning of each flag is defined inside primary_filter(),
# which is not visible in this notebook.
filtered = primary_filter(
    data_pcode,
    PRINT=True,
    FLD_ANY=True,
    FLD_SAME_DAY=False,
    OTF_MINUTES_MIN=0,
)
filtered.head(3)

- Example of <b>Unfilled</b> "Double Bottom" (DB) Pattern (data['filled'] == 0)

In [None]:
# Chart the pattern used as the "unfilled" example.
chart_data, chart_results = chart_recent(
    PUID='DB_202003161215_5T_A',
    CHART=500,
)


- Example of <b>Filled</b> "Double Bottom" (DB) Pattern with a <b>Negative Return</b>


In [None]:
# Chart the pattern used as the "filled, negative return" example.
chart_data, chart_results = chart_recent(
    PUID='DB_202003161115_5T_A',
    CHART=500,
)


- Example of <b>Filled</b> "Double Bottom" (DB) Pattern with a <b>Positive Return</b>


In [None]:
# Chart the pattern used as the "filled, positive return" example.
chart_data, chart_results = chart_recent(
    PUID='DB_202003121145_5T_ADM',
    CHART=500,
)


## Backtesting Timeframe Methodology
- My overall backtesting process is to look at a 1-month timeframe against the previous 12 months.


In [None]:
# Build the 12-month lookback table and print rows 1 through 3 of its date columns.
LOOKBACK_DATES = lookback_df(LB_MONTHS=12, NEXT=False)
print(
    LOOKBACK_DATES.loc[
        :, ['month_id', 'lookback_start', 'lookback_end', 'backtest_start', 'backtest_end']
    ][1:4]
)

- The "Period_Data" folder contains all "PCODE" data appended together and split by MONTH_ID, which makes it easier to run my backtesting one MONTH_ID at a time

In [None]:
# Pick one month, look up its lookback/backtest dates, and load that month's period file.
MONTH_ID = '2022_10'
LB_ID = LOOKBACK_DATES.loc[LOOKBACK_DATES["month_id"].eq(MONTH_ID)].iloc[0]
CSV_PERIOD = WD + SL.join(['Sources', 'aggs', '5T', 'Results', 'Period_Data', f'{MONTH_ID}.csv'])
data_period = csv_from(CSV_PERIOD, LM=True)

# Describe the backtest window and the lookback window that feeds it.
print(
    f"\nThe below 'data_period' has a date range between {LB_ID.backtest_start.strftime('%Y-%m-%d')} "
    f"and {LB_ID.backtest_end.strftime('%Y-%m-%d')}"
)
print(
    f"so it would use data between {LB_ID.lookback_start.strftime('%Y-%m-%d')} "
    f"and {LB_ID.lookback_end.strftime('%Y-%m-%d')} to create predictions from sklearn\n"
)

data_period.loc[:, ['pcode', 'pname', 'date', 'filled', 'return']]


## Current Feature Selection Process
#### feature_selection_loop()
- The current way I go about selecting the ideal features for making pattern predictions (which I believe has a lot of room for improvement) is through the function <b>feature_selection_loop()</b>
- The gist of the function is to loop through the different lookback timeframes, and then for each timeframe to loop through the patterns by 'pcode' and attempt to find the ideal attribute columns for each pattern
- It then saves that data to a separate CSV file for future backtesting


#### Step 1 :: Create Looping Route

In [None]:
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# Inline walk-through of feature_selection_loop(): the function's parameters are
# assigned as plain variables so each step can be run cell by cell.
#def feature_selection_loop(OFFSET=None,AGG='5T',MAX_MO_IDS=26,MO_IDS=[],MO_ID_IS=None,PCODE_LST=[]):
OFFSET=None
AGG='5T'
MAX_MO_IDS=100
MO_IDS=[]
MO_ID_IS='2022_10' # <<<< I'm limiting it to just this MO_ID for the sake of this example
PCODE_LST=['DB'] # <<<< I'm limiting it to just this PCODE for the sake of this example

#### Get Possible MO_IDS
#################################################
if MO_ID_IS is not None:
    # Single-month mode: keep only the lookback row of the requested month_id.
    MO_IDS = lookback_df(LB_MONTHS=12,NEXT=True)
    MO_IDS = MO_IDS[(MO_IDS["month_id"] == MO_ID_IS)]
elif len(MO_IDS) == 0:
    # Discovery mode: one month_id per CSV found in the Period_Data folder.
    PERIOD_DATA_FOLDER = f'{WD}Sources{SL}aggs{SL}{AGG}{SL}Results{SL}Period_Data'
    files = glob.glob(f"{PERIOD_DATA_FOLDER}{SL}*.csv")
    # The month_id is the file name without folder and extension. Taking the
    # stem avoids splitting on '.csv' as a regex, where '.' matches any character.
    MO_IDS = [os.path.splitext(os.path.basename(csv_path))[0] for csv_path in files]
if not isinstance(MO_IDS, pd.DataFrame):
    # A plain list (user-supplied, or empty because no files were found) has no
    # .sort_values(); normalise it to a one-column frame so the steps below work.
    MO_IDS = pd.DataFrame({'month_id': pd.Series(list(MO_IDS), dtype='object')})
# Newest month first, capped at MAX_MO_IDS rows.
MO_IDS = MO_IDS.sort_values(by=['month_id',], ascending=[False]).reset_index(drop=True)
MO_IDS = MO_IDS.head(MAX_MO_IDS)

#### Get Lookback Dates
#################################################
LB_12 = lookback_df(LB_MONTHS=12,NEXT=True)
LB_24 = lookback_df(LB_MONTHS=24,NEXT=True)
# Select then rename in one chain: renaming a column slice with inplace=True
# can trigger pandas' SettingWithCopyWarning.
LB_12 = LB_12[['month_id','lookback_id','lookback_start','lookback_end']].rename(
    columns={'lookback_start': 'LB_12_START','lookback_end': 'LB_12_END','lookback_id': 'LB_12_ID',}
)
LB_24 = LB_24[['month_id','lookback_id','lookback_start','lookback_end']].rename(
    columns={'lookback_start': 'LB_24_START','lookback_end': 'LB_24_END','lookback_id': 'LB_24_ID',}
)
# One row per month_id that has 12-month dates, 24-month dates, and is in MO_IDS.
LOOKBACK_DATES = LB_12.merge(LB_24, how = 'inner', on = ['month_id'])
LOOKBACK_DATES = LOOKBACK_DATES.merge(MO_IDS, how = 'inner', on = ['month_id'])
LOOKBACK_DATES = set_offset(LOOKBACK_DATES,OFFSET)

#### Get All Pattern Codes (PCODES)
#################################################
codes = patterns_source(DF='codes')
ALL_CODES = list(codes["pcode"].unique())
# An explicit PCODE_LST overrides the full list of codes.
ALL_CODES = PCODE_LST if len(PCODE_LST) >0 else ALL_CODES

#### Iterate through Lookback Dates
#################################################
count_total = len(LOOKBACK_DATES)
count = 0
START_time = datetime.now()
print(f"\n\n{'='*100}\n{datetime.now():%I:%M:%S} | Feature Selection Started for {len(LOOKBACK_DATES)} Periods\n{'='*100}\n")

# NOTE: the variables assigned inside these loops (MONTH_ID, LB_12_*, LB_24_*,
# PCODE) keep the values of the last iteration; the following cells read them.
for index, row in LOOKBACK_DATES.iterrows():
    MONTH_ID = row["month_id"]
    LB_12_START = pd.to_datetime(row["LB_12_START"]).strftime('%Y-%m-%d')
    LB_12_END = pd.to_datetime(row["LB_12_END"]).strftime('%Y-%m-%d')
    LB_24_START = pd.to_datetime(row["LB_24_START"]).strftime('%Y-%m-%d')
    LB_24_END = pd.to_datetime(row["LB_24_END"]).strftime('%Y-%m-%d')

    #### Iterate through Pattern Codes (PCODES)
    #################################################
    for PCODE in ALL_CODES:  # <<<< Limiting it to one PCODE for this example

        ### PCODE Variables/Settings
        #######################################
        print(f"{datetime.now():%I:%M:%S} | {MONTH_ID} | {PCODE} \t Started...")


#### Step 2 :: Load & Filter PCODE Data by 24-Month Data Range
- After loading the PCODE data I then cut_by_dates() to filter it to the previous 24-month period.
- I typically try to use a 12-month lookback period, but I've found that for this step in the process, 24 months of data seems to perform better


In [None]:
### Import / export paths for the current PCODE and MONTH_ID
#######################################
print(f"{datetime.now():%I:%M:%S} | {MONTH_ID} | {PCODE} \t Started...")
CSV_IMPORT_PCODE = (
    f'{WD}Sources{SL}aggs{SL}{AGG}{SL}Results{SL}Patterns_All{SL}{PCODE}.csv'
)
CSV_EXPORT_PCODE = (
    f'{WD}Sources{SL}aggs{SL}{AGG}{SL}Results{SL}Attributes{SL}Data{SL}{MONTH_ID}{SL}{PCODE}.csv'
)

### Read the PCODE file, then restrict it to the 24-month lookback window
#######################################
data = csv_from(CSV_IMPORT_PCODE, LM=False)
dates_shape_pre = data.shape  # shape before the date cut, for the log line
data = cut_by_dates(
    data,
    CUT_COL='date',
    CUT_START=LB_24_START,
    CUT_END=LB_24_END,
    RESET_INDEX=True,
)
dates_shape_post = data.shape  # shape after the date cut
print(
    f"{datetime.now():%I:%M:%S} | {MONTH_ID} | {PCODE} \t Cut-by-Dates \t\t "
    f"{dates_shape_pre} > {dates_shape_post} | {LB_24_START} > {LB_24_END}"
)

data

#### Step 3 :: Filter PCODE Data by Patterns that Filled
- Then I apply the primary_filter() to filter it down, mostly to the patterns that filled
- I had previously assumed that filtering it even further to something that more identically resembled what I could practically trade on would be better, but in my tests it performed worse. I am hoping to either understand why that is or to have your solution filter it more accurately.


In [None]:
# Run the primary filter and log the shape before and after.
filter_shape_pre = data.shape
data = primary_filter(
    data,
    PRINT=False,
    FLD_ANY=True,
    FLD_SAME_DAY=False,
    OTF_MINUTES_MIN=0,
    OTF_MINUTES_MAX=None,
    RTR_MIN=0,
    FORCE_EOD=False,
    GAP_MIN=None,
    GAP_MAX=None,
    TIME_HM_MAX=None,
)
filter_shape_post = data.shape
print(
    f"{datetime.now():%I:%M:%S} | {MONTH_ID} | {PCODE} \t Primary Filter \t "
    f"{filter_shape_pre} > {filter_shape_post}"
)


#### Step 4 :: Filter PCODE Data by pattern_data_sklearn_prep()
- Before running the <b>pattern_data_sklearn_prep()</b> I set the 'TARGET_COL' (or y value) to 'return', which will be the only future-bias column going forward.
- Then inside of the <b>pattern_data_sklearn_prep()</b> function the first step is to load the <b>unbiased_pattern_attributes(PCODE=PCODE)</b>
- It then creates a new dataframe based on the atb_df columns 'rtype' and 'rel_from'
   - If the 'rtype' == 'self' then it just copies the values in the PCODE data for 'column'
      - i.e. 'rsi_14' = 'rsi_14'
   - If the 'rtype' != 'self' then it divides the PCODE column data by the PCODE 'rel_from' data, creating a 'relative attribute column'
      - i.e. 'volume' = ('volume'/'r20vma')
 

In [None]:
# Unbiased attribute table for the current PCODE, split by whether 'rtype' is 'self'.
atb_df = unbiased_pattern_attributes(PCODE=PCODE)
atb_self = atb_df[atb_df['rtype'].eq('self')]
atb_relative = atb_df[atb_df['rtype'].ne('self')]

# Print each group's size and a two-row sample.
print(f"\nUNBIASED COLUMNS: Non-Relative: {len(atb_self)} total\n{atb_self[4:6]}")
print(f"\nUNBIASED COLUMNS: Yes-Relative: {len(atb_relative)} total\n{atb_relative[3:5]}")


 - The next step is to filter the data down to the top 'COR_MAX_COLS' columns (I've been using 100 as the value) with the greatest correlation, which is just to help speed up the next step in the process: backward_elimination()
 - For the sake of this Jupyter example I'm limiting it to the top 10 just to make it faster
 - I also keep the 'date' column in here so that I can cut_by_date later, but it will be ignored in sklearn.

In [None]:
### Prepare the PCODE data for sklearn
#######################################
TARGET_COL = 'return'  # <<<< This is the ONLY future-bias column and will serve as the 'y' value
MAX_COLS = 10  # <<<< Max number of columns kept after sorting by correlation (typically set to 100)

sklearn_prep_shape_pre = data.shape
data = pattern_data_sklearn_prep(
    data,
    TARGET_COL=TARGET_COL,
    PCODE=PCODE,
    COR_MAX_COLS=MAX_COLS,
    PRINT=False,
)
sklearn_prep_shape_post = data.shape
print(
    f"{datetime.now():%I:%M:%S} | {MONTH_ID} | {PCODE} \t Sklearn Data Prep \t "
    f"{sklearn_prep_shape_pre} > {sklearn_prep_shape_post}"
)

data

#### Step 5 :: Filter PCODE Data by backward_elimination()
- Next I run the backward_elimination() Function to try and select the best attributes

In [None]:
### Select columns with backward elimination
#######################################
be_shape_pre = data.shape
BE_COLS, drop_df = backward_elimination(
    data,
    TARGET_COL=TARGET_COL,
    EXCLUDE_COLS=['date'],
    PRINT=True,
)

# Save the second return value (drop_df) under Attributes/Drops.
EXPORT_DROP_DF = (
    f'{WD}Sources{SL}aggs{SL}{AGG}{SL}Results{SL}Attributes{SL}Drops{SL}{MONTH_ID}{SL}{PCODE}.csv'
)
csv_to(drop_df, EXPORT_DROP_DF)

# Keep 'date', the target column, and the columns returned as BE_COLS.
PDATA_COLS = ['date', TARGET_COL, *BE_COLS]
data = data.loc[:, PDATA_COLS]
be_shape_post = data.shape
print(
    f"{datetime.now():%I:%M:%S} | {MONTH_ID} | {PCODE} \t Backwards Elimination \t "
    f"{be_shape_pre} > {be_shape_post}"
)

data

#### Step 6 :: Cut-by-Dates Again to get down to 12-Month Period
- Lastly I cut it down to the last 12-month period since this is what I use for all of my backtesting.

In [None]:
### Restrict the data to the 12-month lookback window
#######################################
dates_shape_pre = data.shape
data = cut_by_dates(
    data,
    CUT_COL='date',
    CUT_START=LB_12_START,
    CUT_END=LB_12_END,
    RESET_INDEX=True,
)
dates_shape_post = data.shape
print(
    f"{datetime.now():%I:%M:%S} | {MONTH_ID} | {PCODE} \t Cut-by-Dates \t\t "
    f"{dates_shape_pre} > {dates_shape_post} | {LB_12_START} > {LB_12_END}"
)
# Order chronologically and renumber the index.
data = data.sort_values(by=['date'], ascending=[True]).reset_index(drop=True)

data

#### Step 7 :: Export Attribute Data
- Finally, for each PCODE in ALL_CODES, it exports the selected feature data.
- Note that if you are testing this with changes, it won't export the file if it already exists so just delete it before running.

In [None]:
### Write the selected-feature data unless the export file already exists
#######################################
if not os.path.exists(CSV_EXPORT_PCODE):
    csv_to(data, CSV_EXPORT_PCODE)
    print(f"{datetime.now():%I:%M:%S} | {MONTH_ID} | {PCODE} \t Features Export Final \t {data.shape}\n")
else:
    # An existing file is never overwritten; delete it first to re-export.
    print(f"{datetime.now():%I:%M:%S} | {MONTH_ID} | {PCODE} \t NOT Exported because File Already Exists!\n")
    

## get_weights() Function
- At the end of each timeframe inside of the <b>feature_selection_loop()</b> it runs the <b>get_weights()</b> function, which essentially applies the historical feature selection data to the current data (see example below)

In [None]:
# Rebuild the 12-month lookback table and print the date columns of its second row.
# NOTE(review): the text below assumes this row is month_id "2022_10" — confirm,
# since the row is picked by position rather than by month_id.
LOOKBACK_DATES = lookback_df(LB_MONTHS=12, NEXT=False)
print(
    LOOKBACK_DATES.loc[
        :, ['month_id', 'lookback_start', 'lookback_end', 'backtest_start', 'backtest_end']
    ].iloc[1]
)

- For example, the month_id "2022_10" data with a date range between "backtest_start" (2022-10-01) and  "backtest_end" (2022-10-31) will create a prediction from the  <b>feature_selection_loop()</b> data from date ranges between "lookback_start" (2021-10-01) and  "lookback_end" (2022-09-30)
- The reason I separated them out like this and used local file lookups is to make it run faster on my LIVE script, which takes pattern data from the last five minutes and tries to analyze it to determine whether or not to execute a trade.

- The first step in the <b>get_weights()</b> function is to load the "present-tense" data from the "Period_Data" folder

In [None]:
# Load the Period_Data file for the month being scored and preview it.
HIST_MONTH = '2022_10'
CSV_BT_DATA = (
    f'{WD}Sources{SL}aggs{SL}{AGG}{SL}Results{SL}Period_Data{SL}{HIST_MONTH}.csv'
)
results = csv_from(CSV_BT_DATA, LM=False)
results.head(3)

- Then in the <b>get_weights()</b> >> <b>prep_weight_data()</b> it will remove the "return" column (since it should be blind to that) and create new X values from the columns exported in the <b>feature_selection_loop()</b>

In [None]:
#### Load the exported feature data for the current MONTH_ID / PCODE
#################################################
LB_DATA_CSV = (
    f'{WD}Sources{SL}aggs{SL}{AGG}{SL}Results{SL}Attributes{SL}Data{SL}{MONTH_ID}{SL}{PCODE}.csv'
)
LB_DATA = csv_from(LB_DATA_CSV)
LB_DATA.head(3)

In [None]:
# One-column frame ('column') listing every LB_DATA column name except
# 'date' and 'return' (either may be absent; missing ones are ignored).
LB_DATA_COLS = pd.DataFrame(
    {'column': LB_DATA.drop(columns=['date', 'return'], errors='ignore').columns}
)
LB_DATA_COLS.head(10)

- Then in the <b>get_weights()</b> >> <b>calculate_weights()</b> it will use <b>sklearn LinearRegression()</b> to create a prediction, which I call the <b>'weight'</b>
- It will also create a <b>'w_score'</b> column, which is essentially an easier-to-read score of the weight (between 0 and 100).
- Then it will export the results with the 'weight' and 'w_score' to CSV, which will become the source data for my backtesting.

In [None]:
# Load the Pattern_Weights file whose name is the month id plus the '_m12' suffix.
LB_ID = f'{MONTH_ID}_m12'
WEIGHT_EXPORT_CSV = (
    f'{WD}Sources{SL}aggs{SL}{AGG}{SL}Results{SL}Pattern_Weights{SL}{LB_ID}.csv'
)
weights = csv_from(WEIGHT_EXPORT_CSV)

weights.loc[:, ['datetime', 'pcode', 'puid', 'weight', 'w_score']]

## backtest_loop_simple() Function
- The <b>backtest_loop_simple()</b> function is what I use to apply my current <b>Trading Strategy</b> to the <b>Pattern_Weights data</b> created in the <b>get_weights()</b> function
- My <b>Trading Strategy</b> is essentially to:
  1. Create a "POSSIBLES" dataframe of trades that have a minimum weight value, along with other filtering criteria from the primary_filter() function.
  2. Then it sorts those "POSSIBLES" by 'datetime', and then by 'w_score' and goes "all-in" on the next best opportunity.
  3. If that trade does not trigger either a 'target' or 'loss' by the end of the day, then it exits at the end of the day at the close price.
  4. As soon as it exits a trade it immediately looks for the next "POSSIBLE" and enters as soon as one hits the filtered criteria.


<b>Below Print Stats Key</b>
- PL/Day = Average Profit Percent Per Day
- PL/Period = Average Profit Percent Per Period (month)
- TPD = Trades Per Day
- PPD = Possibles Per Day
- P_Mean = Average Profit Percent Per Possible
- Wins = Percent of Trades with positive profit

In [None]:
# Run the simple backtest; its four return values are used by the cells that follow.
POSSIBLES, TRADES, STATS, DAYS = backtest_loop_simple(
    AGG_LIST=['5T'],
    BT_LOOPS=6,
    MO_IDS=[],
    MO_ID_IS=None,
    CAP_EXTREMES=False,
)


In [None]:
# Summarise the number of days and possible trades, then show the key columns.
print(
    f"Over the above period of {len(DAYS):,.0f} days "
    f"there were {len(POSSIBLES):,.0f} Possible Trades\n"
)
POSSIBLES.loc[:, [
    'datetime', 'otf_minutes', 'filled_dt', 'exit_dt', 'symbol',
    'pcode', 'weight', 'w_score', 'pos', 'entry',
    'loss', 'target', 'rtr', 'return', 'pl_pct',
]]


In [None]:
# Report the trade count and the last PLPCT_DAY value in STATS, then show the key columns.
print(
    f"And the Strategy executed {len(TRADES):,.0f} Trades "
    f"for an average of {STATS['PLPCT_DAY'].iloc[-1]:.2f}% profit/day\n"
)
TRADES.loc[:, [
    'datetime', 'otf_minutes', 'filled_dt', 'exit_dt', 'symbol',
    'pcode', 'weight', 'w_score', 'pos', 'entry',
    'loss', 'target', 'rtr', 'return', 'pl_pct',
]]
