In [59]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter

import os

In [60]:
from pathlib import Path

# Configurable parts
SYMBOL = "USDJPY"
SAMPLE_TYPE = "time"
MINUTES = 5

START_DATE = "20210101"
END_DATE = "20241231"

# Build base name
RESAMPLED_NAME = f"{SYMBOL}-{MINUTES}m-{START_DATE}-{END_DATE}"

# Base directories
BASE_DIR = Path("../data")
RESAMPLED_DIR = BASE_DIR / "resampled"
PROCESSED_DIR = BASE_DIR / "processed"
EVENTS_DIR = BASE_DIR / "interm/events"
SIDES_DIR = BASE_DIR / "interm/sides"

# Final paths
RESAMPLED_FILE_PATH = RESAMPLED_DIR / f"{RESAMPLED_NAME}.pkl"
PROCESSED_FILE_PATH = PROCESSED_DIR / f"{RESAMPLED_NAME}_FEATURES.pkl"

## Loading Data

In [61]:
%%time
df = pd.read_pickle(PROCESSED_FILE_PATH)

CPU times: user 1.25 ms, sys: 31.6 ms, total: 32.9 ms
Wall time: 61.7 ms


In [62]:
df.shape

(298164, 67)

In [63]:
df.head()

Unnamed: 0_level_0,open,high,low,close,volume,spread,close_pct_return,close_return,close_log_return,close_fd_return,...,dom,month,hour_sin,hour_cos,dow_sin,dow_cos,dom_sin,dom_cos,month_sin,month_cos
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-07 03:25:00,103.1615,103.167,103.148,103.1505,722520000000.0,0.003158,-9.2e-05,-0.0095,-9.2e-05,1.913549,...,7,1,0.707107,0.707107,0.433884,-0.900969,0.988468,0.151428,0.5,0.866025
2021-01-07 03:30:00,103.1485,103.159,103.1365,103.1365,867600000000.0,0.00323,-0.000136,-0.014,-0.000136,1.903814,...,7,1,0.707107,0.707107,0.433884,-0.900969,0.988468,0.151428,0.5,0.866025
2021-01-07 03:35:00,103.1375,103.1475,103.131,103.131,728550000000.0,0.003208,-5.3e-05,-0.0055,-5.3e-05,1.906306,...,7,1,0.707107,0.707107,0.433884,-0.900969,0.988468,0.151428,0.5,0.866025
2021-01-07 03:40:00,103.1315,103.1335,103.113,103.127,764210000000.0,0.003317,-3.9e-05,-0.004,-3.9e-05,1.907271,...,7,1,0.707107,0.707107,0.433884,-0.900969,0.988468,0.151428,0.5,0.866025
2021-01-07 03:45:00,103.128,103.132,103.122,103.128,362270000000.0,0.002974,1e-05,0.001,1e-05,1.912092,...,7,1,0.707107,0.707107,0.433884,-0.900969,0.988468,0.151428,0.5,0.866025


## Calculating EMA values

In [64]:
FAST = 9 #(45 min)
SLOW = 20 #(180 min)

In [65]:
SIDE_NAME = f"EMACross_{FAST}_{SLOW}"

In [66]:
import pandas_ta as ta

In [67]:
df.ta.ema(length=FAST, append=True)
df.ta.ema(length=SLOW, append=True)

timestamp
2021-01-07 03:25:00           NaN
2021-01-07 03:30:00           NaN
2021-01-07 03:35:00           NaN
2021-01-07 03:40:00           NaN
2021-01-07 03:45:00           NaN
                          ...    
2024-12-30 23:35:00    156.948166
2024-12-30 23:40:00    156.956293
2024-12-30 23:45:00    156.960027
2024-12-30 23:50:00    156.965024
2024-12-30 23:55:00    156.969165
Name: EMA_20, Length: 298164, dtype: float64

## Finding Cross Points

標記出快線與慢線的相對位置（ema_cross_flag）並找到跨越的時間點（t_events）

In [68]:
df['ema_cross_flag'] = df[f'EMA_{FAST}'] > df[f'EMA_{SLOW}']
df.head()

Unnamed: 0_level_0,open,high,low,close,volume,spread,close_pct_return,close_return,close_log_return,close_fd_return,...,month,hour_sin,hour_cos,dow_sin,dow_cos,dom_sin,dom_cos,month_sin,month_cos,ema_cross_flag
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-07 03:25:00,103.1615,103.167,103.148,103.1505,722520000000.0,0.003158,-9.2e-05,-0.0095,-9.2e-05,1.913549,...,1,0.707107,0.707107,0.433884,-0.900969,0.988468,0.151428,0.5,0.866025,False
2021-01-07 03:30:00,103.1485,103.159,103.1365,103.1365,867600000000.0,0.00323,-0.000136,-0.014,-0.000136,1.903814,...,1,0.707107,0.707107,0.433884,-0.900969,0.988468,0.151428,0.5,0.866025,False
2021-01-07 03:35:00,103.1375,103.1475,103.131,103.131,728550000000.0,0.003208,-5.3e-05,-0.0055,-5.3e-05,1.906306,...,1,0.707107,0.707107,0.433884,-0.900969,0.988468,0.151428,0.5,0.866025,False
2021-01-07 03:40:00,103.1315,103.1335,103.113,103.127,764210000000.0,0.003317,-3.9e-05,-0.004,-3.9e-05,1.907271,...,1,0.707107,0.707107,0.433884,-0.900969,0.988468,0.151428,0.5,0.866025,False
2021-01-07 03:45:00,103.128,103.132,103.122,103.128,362270000000.0,0.002974,1e-05,0.001,1e-05,1.912092,...,1,0.707107,0.707107,0.433884,-0.900969,0.988468,0.151428,0.5,0.866025,False


In [69]:
t_events = df.index[df['ema_cross_flag'] != df['ema_cross_flag'].shift(1)]
t_events

DatetimeIndex(['2021-01-07 03:25:00', '2021-01-07 05:00:00',
               '2021-01-07 12:45:00', '2021-01-07 14:20:00',
               '2021-01-07 16:35:00', '2021-01-07 17:00:00',
               '2021-01-07 18:25:00', '2021-01-07 20:05:00',
               '2021-01-07 20:30:00', '2021-01-07 22:30:00',
               ...
               '2024-12-30 01:30:00', '2024-12-30 02:50:00',
               '2024-12-30 04:30:00', '2024-12-30 05:20:00',
               '2024-12-30 07:05:00', '2024-12-30 08:05:00',
               '2024-12-30 09:50:00', '2024-12-30 16:30:00',
               '2024-12-30 18:00:00', '2024-12-30 22:25:00'],
              dtype='datetime64[ns]', name='timestamp', length=13978, freq=None)

## Labeling Sides

黃金交叉與死亡交叉

In [70]:
sides = pd.Series(df['ema_cross_flag'].loc[t_events].apply(lambda x: 1 if x else -1), index=t_events)
sides

timestamp
2021-01-07 03:25:00   -1
2021-01-07 05:00:00    1
2021-01-07 12:45:00   -1
2021-01-07 14:20:00    1
2021-01-07 16:35:00   -1
                      ..
2024-12-30 08:05:00    1
2024-12-30 09:50:00   -1
2024-12-30 16:30:00    1
2024-12-30 18:00:00   -1
2024-12-30 22:25:00    1
Name: ema_cross_flag, Length: 13978, dtype: int64

## Saving to disk

In [71]:
SIDE_FILE_PATH = SIDES_DIR / f"{RESAMPLED_NAME}-{SIDE_NAME}.pkl"
SIDE_FILE_PATH

PosixPath('../data/interm/sides/USDJPY-5m-20210101-20241231-EMACross_9_20.pkl')

In [72]:
sides.to_pickle(SIDE_FILE_PATH)