# Trading strategy

1. Classify stocks by industry category.
2. Cointegration test: the p-values of all three of the following tests must be below 0.05
    - Augmented Dickey Fuller
    - Phillips-Perron
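    - Kwiatkowski-Phillips-Schmidt-Shin (KPSS; note that its null hypothesis is stationarity, the opposite of the two tests above, so a p-value below 0.05 rejects stationarity — TODO: confirm this threshold direction is intended)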
3. Calculate the hedge ratio through linear regression
    - Ideally the ratio would be re-estimated on a rolling window.
    - For now a simple split is used: a single ratio is fitted per yearly window.
4. Backtest over five yearly windows (2019–2023).

# Import packages

In [24]:
import finlab

# The FinLab API key is stored outside the notebook, on the first line of
# credential.txt, so that it never appears in the notebook itself.
with open('credential.txt', 'r') as credential_file:
    api_key = credential_file.readline().strip()

# Authenticate this session with the key that was just read.
finlab.login(api_key)

輸入成功!


In [25]:
from finlab import data
from finlab.backtest import sim

In [26]:
import pandas as pd
import numpy as np

import statsmodels.api as sm
from statsmodels.regression.linear_model import OLS

import warnings

# Function

### Calculate hedge ratio.

In [27]:
def calculate_hedge_ratio(stock1_prices, stock2_prices):
    """Estimate the hedge ratio between two price series with OLS.

    Fits ``stock1 = const + beta * stock2`` on the observations the two series
    share (after dropping NaN and infinite values) and returns ``beta``.

    Parameters
    ----------
    stock1_prices : pd.Series
        Prices of the dependent leg of the pair.
    stock2_prices : pd.Series
        Prices of the explanatory leg of the pair.

    Returns
    -------
    float
        The regression slope on ``stock2_prices``.

    Raises
    ------
    ValueError
        If no observation is left after alignment and cleaning.
    """
    # Keep only the index labels present in both series.
    stock1_prices, stock2_prices = stock1_prices.align(stock2_prices, join='inner')

    # Treat +/-inf like missing data and drop every row with a gap in either leg.
    valid_data = pd.DataFrame({'stock1': stock1_prices, 'stock2': stock2_prices})
    valid_data = valid_data.replace([np.inf, -np.inf], np.nan).dropna()

    if valid_data.empty:
        raise ValueError(
            "No overlapping finite observations to estimate the hedge ratio."
        )

    # Linear regression: stock1 ~ const + stock2
    model = OLS(valid_data['stock1'], sm.add_constant(valid_data['stock2'])).fit()

    # params is labelled by column name, so the slope must be read by position
    # with .iloc; plain params[1] relies on the deprecated positional fallback
    # of Series.__getitem__.
    return model.params.iloc[1]

### Calculate spread and std.

In [28]:
def compute_spread_and_std(stock1_prices, stock2_prices, hedge_ratio, windows= 20):
    """Build the pair spread together with its rolling mean and rolling std.

    Parameters
    ----------
    stock1_prices : pd.Series
        Prices of the first leg.
    stock2_prices : pd.Series
        Prices of the second leg.
    hedge_ratio : float
        Multiplier applied to ``stock2_prices`` when forming the spread.
    windows : int, default 20
        Number of observations in each rolling window.

    Returns
    -------
    pd.DataFrame
        Columns ``stock1``, ``stock2``, ``spread``, ``mean`` and
        ``rolling_std``. The first ``windows - 1`` rows of ``mean`` and
        ``rolling_std`` are NaN because the window is not yet full.
    """
    # Keep only the index labels present in both series.
    stock1_prices, stock2_prices = stock1_prices.align(stock2_prices, join='inner')
    valid_data = pd.DataFrame({
        'stock1': stock1_prices,
        'stock2': stock2_prices
    })

    # Remove NaN and infinite values
    valid_data = valid_data.replace([np.inf, -np.inf], np.nan).dropna()

    valid_data['spread'] = valid_data['stock1'] - hedge_ratio * valid_data['stock2']

    # Both statistics are taken over the signed spread. Taking the std of
    # abs(spread) would shrink the band whenever the spread crosses zero and
    # would no longer match the rolling mean it is combined with.
    rolling_spread = valid_data['spread'].rolling(window=windows)
    valid_data['mean'] = rolling_spread.mean()
    valid_data['rolling_std'] = rolling_spread.std()

    return valid_data

### Generate Position, Entry and Exit Signals.

In [29]:
def generate_position(valid_data, stock1, stock2, hedge_ratio, threshold):
    """Turn spread statistics into per-date positions for both legs of a pair.

    A short signal fires when the spread is above ``mean + threshold * rolling_std``
    (stock1 rich relative to stock2): stock1 gets -1 and stock2 gets
    ``+hedge_ratio``. A long signal fires when the spread is below
    ``mean - threshold * rolling_std``: stock1 gets +1 and stock2 gets
    ``-hedge_ratio``. All other dates hold 0 in both legs.

    Parameters
    ----------
    valid_data : pd.DataFrame
        Must contain ``spread``, ``mean`` and ``rolling_std`` columns. It is
        modified in place: ``long_signal``, ``short_signal``,
        ``position_stock1`` and ``position_stock2`` columns are added.
    stock1, stock2 : str
        Column names to use for the two legs in the returned frame.
    hedge_ratio : float
        Size of the stock2 leg per unit of stock1.
    threshold : float
        Number of rolling standard deviations that defines the entry band.

    Returns
    -------
    pd.DataFrame
        Two columns named ``stock1`` and ``stock2`` holding the positions,
        on the same index as ``valid_data``.
    """
    band = valid_data['rolling_std'] * threshold
    valid_data['long_signal'] = valid_data['spread'] < valid_data['mean'] - band
    valid_data['short_signal'] = valid_data['spread'] > valid_data['mean'] + band

    # stock1 only ever holds -1/0/1, so an integer column is fine. stock2 holds
    # +/-hedge_ratio and must start as float: writing a float into an int64
    # column through .loc is an incompatible-dtype assignment, and it would
    # also make the column dtype depend on whether any signal fired.
    valid_data['position_stock1'] = 0
    valid_data['position_stock2'] = 0.0

    # If stock1 is overvalued, short stock1 and long stock2
    valid_data.loc[valid_data['short_signal'], 'position_stock1'] = -1
    valid_data.loc[valid_data['short_signal'], 'position_stock2'] = hedge_ratio

    # If stock1 is undervalued, long stock1 and short stock2
    valid_data.loc[valid_data['long_signal'], 'position_stock1'] = 1
    valid_data.loc[valid_data['long_signal'], 'position_stock2'] = -hedge_ratio

    # Keep only the position columns, relabelled with the actual symbols.
    position = valid_data[['position_stock1', 'position_stock2']]
    position = position.rename(columns={'position_stock1': stock1, 'position_stock2': stock2})

    return position

# Main program (主程式碼)

## Fetch data

In [32]:
# Download the closing-price table from FinLab ("price:收盤價" is the dataset key).
close=data.get("price:收盤價")
# Re-wrap the result in a pandas DataFrame; presumably data.get returns a
# FinLab-specific DataFrame subclass — TODO confirm.
close_df = pd.DataFrame(close)

In [33]:
# Display the closing-price table (the notebook renders a truncated preview).
close_df

symbol,0015,0050,0051,0052,0053,0054,0055,0056,0057,0058,...,9944,9945,9946,9949,9950,9951,9955,9958,9960,9962
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2007-04-23,9.54,57.85,32.83,38.40,,,,,,,...,31.00,30.35,9.78,73.60,13.25,51.3,72.40,,46.00,49.60
2007-04-24,9.54,58.10,32.99,38.65,,,,,,,...,32.00,30.50,9.91,75.00,13.25,50.5,71.60,,45.90,50.40
2007-04-25,9.52,57.60,32.80,38.59,,,,,,,...,32.30,29.50,10.10,74.60,13.30,49.9,71.60,,49.10,49.10
2007-04-26,9.59,57.70,32.80,38.60,,,,,,,...,31.60,29.15,10.80,74.50,13.25,49.5,71.00,,48.90,48.00
2007-04-27,9.55,57.50,32.72,38.40,,,,,,,...,31.40,28.50,11.55,75.70,13.15,48.8,69.50,,48.60,46.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-09-30,,183.95,81.60,177.10,97.6,,29.35,38.66,134.50,,...,20.40,48.95,28.75,25.50,14.35,65.2,36.40,222.0,,16.95
2024-10-01,,183.60,81.60,179.05,98.7,,29.25,38.57,136.25,,...,20.50,48.25,28.35,25.50,14.25,65.2,36.20,229.5,26.25,16.55
2024-10-04,,183.60,80.30,179.25,98.5,,29.05,38.35,135.70,,...,20.60,47.80,26.95,25.00,14.20,64.8,37.00,223.0,26.25,16.55
2024-10-07,,188.15,81.55,184.15,100.3,,29.45,38.64,138.40,,...,20.65,48.45,27.50,24.95,14.20,64.8,36.45,223.5,26.40,16.35


## Backtest

In [36]:
# Silence all warnings for the rest of the session so the loop output stays readable.
warnings.filterwarnings("ignore")

# Accumulator for the positions of every year, indexed by the dates of close_df.
# rename_axis returns a new frame, so the index object owned by close_df keeps
# its own name instead of being renamed as a side effect.
every_year_position = pd.DataFrame(index=close_df.index).rename_axis('Date')

for YEAR in range(2019, 2024):
    print(f"Year: {YEAR}")

    # Read chosen pairs for the year and keep those whose six p-values are all below 0.05.
    # NOTE(review): the KPSS null hypothesis is stationarity, so requiring
    # kpss p < 0.05 selects pairs for which stationarity is rejected — confirm
    # that this direction is intended.
    chosen_pairs_df = pd.read_excel(f'final_chosen_pairs_{YEAR}.xlsx')
    chosen_pairs_df = chosen_pairs_df.astype({'stock1': 'str', 'stock2': 'str'})
    chosen_pairs_df = chosen_pairs_df[
        (chosen_pairs_df['adfuller_p_value_1'] < 0.05) &
        (chosen_pairs_df['adfuller_p_value_2'] < 0.05) &
        (chosen_pairs_df['pp_p_value_1'] < 0.05) &
        (chosen_pairs_df['pp_p_value_2'] < 0.05) &
        (chosen_pairs_df['kpss_p_value_1'] < 0.05) &
        (chosen_pairs_df['kpss_p_value_2'] < 0.05)
    ]

    # The window starts on Dec 20 of the previous year so the rolling
    # statistics have some history before the year begins.
    test_start_date = f'{YEAR - 1}-12-20'
    test_end_date = f'{YEAR}-12-31'

    # The price slice is the same for every pair of the year, so take it once.
    year_close_df = close_df.loc[test_start_date:test_end_date]

    # Index the yearly positions by the dates that actually have prices.
    # A calendar-day range (pd.date_range) would add weekends and holidays as
    # all-zero rows to every_year_position when the frames are summed below.
    final_position = pd.DataFrame(index=year_close_df.index).rename_axis('Date')

    # Generate signals for each pair
    for index, row in chosen_pairs_df.iterrows():
        stock1 = str(row['stock1'])
        stock2 = str(row['stock2'])

        stock1_prices = year_close_df[stock1]
        stock2_prices = year_close_df[stock2]

        # NOTE(review): the hedge ratio is fitted on the same window that is
        # traded, i.e. it uses prices that are not yet known on the signal
        # date — confirm whether an earlier estimation window should be used.
        hedge_ratio = calculate_hedge_ratio(stock1_prices, stock2_prices)

        # Compute the spread and rolling statistics; drop the warm-up rows
        # where the rolling window is not yet full.
        spread_df = compute_spread_and_std(stock1_prices, stock2_prices, hedge_ratio)
        spread_df.dropna(subset=['rolling_std'], inplace=True)

        # Generate trading signals
        position = generate_position(spread_df, stock1, stock2, hedge_ratio, 1)  # threshold = 1

        # Align with the yearly index; dates without a signal row hold no position.
        position = position.reindex(final_position.index, fill_value=0)

        # A stock can appear in several pairs: sum its positions across pairs.
        for col in position.columns:
            if col in final_position.columns:
                final_position[col] += position[col]
            else:
                final_position[col] = position[col]

    # Add final_position for the current year to every_year_position, summing on overlapping dates
    every_year_position = every_year_position.add(final_position, fill_value=0)

# Cells that were missing in both operands of every .add() are still NaN; treat them as flat.
every_year_position.fillna(0, inplace=True)

print(every_year_position)

Year: 2019
Year: 2020
Year: 2021
Year: 2022
Year: 2023
            1102  1103  1104  1108  1201  1203  1210  1215  1216  1217  ...  \
Date                                                                    ...   
2007-04-23   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
2007-04-24   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
2007-04-25   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
2007-04-26   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
2007-04-27   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
...          ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   
2024-09-30   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
2024-10-01   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
2024-10-04   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
2024-10-07   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ... 

In [37]:
# Inspect the combined position table produced by the backtest loop.
every_year_position

Unnamed: 0_level_0,1102,1103,1104,1108,1201,1203,1210,1215,1216,1217,...,9942,9943,9944,9945,9946,9950,9951,9958,9960,9962
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2007-04-23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2007-04-24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2007-04-25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2007-04-26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2007-04-27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-09-30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2024-10-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2024-10-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2024-10-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
# Sanity check: every date label in the position table should occur exactly once.
duplicates_in_index = every_year_position.index.duplicated()
if not duplicates_in_index.any():
    print("No duplicate indices found.")
else:
    print("Duplicate indices found:")
    print(every_year_position.index[duplicates_in_index])

No duplicate indices found.


## Run Sim

In [None]:
# Show the position table before it is trimmed to the simulation date range.
every_year_position

In [39]:
# Restrict the positions to 2020-01-01 through 2023-12-31 (label-based slice).
# NOTE(review): the backtest loop also generates positions for 2019, which this
# slice discards — confirm whether the start date should be earlier.
every_year_position = every_year_position.loc['2020-01-01':'2023-12-31']

In [44]:
# Run the FinLab backtest on the position table.
# NOTE(review): stop_loss=0.1 is presumably a 10% stop and resample='Q' a
# quarterly rebalance — confirm against the finlab.backtest.sim documentation,
# since the signals above are generated daily.
ret = sim(
    every_year_position,
    stop_loss=0.1,
    market='TW_STOCK',
    resample='Q',
)