In [28]:
import pandas as pd
from UtilsCreateDataFrame import *
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import numpy as np
from dataclasses import dataclass, asdict
import matplotlib.pyplot as plt
import numpy_ext as npe
from statsmodels.tsa.stattools import adfuller

In [3]:
input_folder = "H:"
output_folder = "H:"
date = "2021-01"
to_save = True
tickers = ["BTC", "ETH", "ADA", "SOL", "BNB", "DOGE", "XRP"]

df = createDataFrame(
    input_folder=input_folder,
    date=date,
    tickers=tickers,
    frequency="1min",
    n_job=4,
    to_save=to_save,
)

In [111]:
df_ = df.iloc[:, [0, 1]]
df_

Unnamed: 0_level_0,BTC,ETH
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-01-01 00:00:00+00:00,28961.66,737.02
2021-01-01 00:01:00+00:00,29009.91,738.74
2021-01-01 00:02:00+00:00,28989.30,737.82
2021-01-01 00:03:00+00:00,28982.69,737.43
2021-01-01 00:04:00+00:00,28975.65,736.89
...,...,...
2021-01-31 23:55:00+00:00,33215.89,1316.72
2021-01-31 23:56:00+00:00,33103.66,1315.04
2021-01-31 23:57:00+00:00,33069.47,1313.72
2021-01-31 23:58:00+00:00,33087.33,1312.03


In [114]:
@dataclass
class ResultLR:
    beta: float
    intercept: float
    r2: float
    res_std: float
    res_mean: float
    stationarity_pvalue: float


def p_value_Stationary(x):
    test = adfuller(x)
    p_val = test[1]
    return p_val


def linearRegression_np(x, y):
    """
    function that fits a linear regression and return several objects:
    Inputs:
        -x: ndarray
        -y: ndarray

    Outputs:
        -results: ResultLR

    """
    # create x vector and y vector
    x = x.reshape(-1, 1)
    y = y.reshape(-1, 1)
    # fit the model
    mod = LinearRegression().fit(x, y)
    # predicted values
    pred = mod.predict(x).reshape(-1, 1)
    r2 = r2_score(y_true=y, y_pred=pred)
    residuals = y.flatten() - pred.flatten()
    res_std = np.std(residuals)
    res_mean = np.mean(residuals)
    p_val = p_value_Stationary(residuals)

    result = ResultLR(
        mod.coef_.item(), mod.intercept_.item(), r2, res_std, res_mean, p_val
    )
    return result


def ResultDataFrame(result, index_input):
    """
    function that trasform the result of the rolling operation to a dataframe:
    Inputs:
        -result: list
                list of np.nan and ResultLR objects
        -index_input: Pandas Index
                index of the input dataframe of the rolling operation

    Outputs:
        -df_result: pd.DataFrame

    """

    # index of non na values
    index = ~pd.isnull(result)
    # dict to use for na values
    na_dict = {x: np.nan for x in result[index[0]]}
    # number of na values
    len_na = np.sum(~index)
    na_dict_list = [na_dict for j in range(len_na)]
    df_result = pd.DataFrame.from_records(
        na_dict_list + [asdict(s) for s in result[index]]
    )
    df_result = df_result.set_index(index_input)
    return df_result

In [68]:
# with 1minute frequency a day has 1140 observations
index_input = df_.index
ndf = df_.to_numpy()
last = 2000
rolling = npe.rolling_apply(linearRegression_np, 1140, ndf[:last, 0], ndf[:last, 1])
result = ResultDataFrame(rolling, index_input[:last])
result

Unnamed: 0_level_0,beta,intercept,r2,res_std,res_mean,stationarity_pvalue
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-01-01 00:00:00+00:00,,,,,,
2021-01-01 00:01:00+00:00,,,,,,
2021-01-01 00:02:00+00:00,,,,,,
2021-01-01 00:03:00+00:00,,,,,,
2021-01-01 00:04:00+00:00,,,,,,
...,...,...,...,...,...,...
2021-01-02 09:15:00+00:00,0.013447,335.928688,0.340006,3.932936,3.590111e-15,0.093883
2021-01-02 09:16:00+00:00,0.013412,336.932843,0.340577,3.925361,-1.647462e-13,0.092234
2021-01-02 09:17:00+00:00,0.013376,337.994244,0.341104,3.917974,-1.715275e-14,0.095835
2021-01-02 09:18:00+00:00,0.013340,339.037872,0.341631,3.910794,4.188462e-14,0.081662


In [108]:
from dataclasses import dataclass, asdict
from typing import List


@dataclass
class ResultStrategy:
    ret_long: List
    ret_short: List
    enter_long: List
    enter_short: List
    exit_long: List
    exit_short: List


def apply_twosigma(
    df_, asset_name_1, asset_name_2, beta, intercept, sigma, start_date, end_date
):
    """
    index of df must be datetime
    version with multiple trades in a trading period

    """

    df = df_.loc[start_date:end_date, :]
    index_ = df.index
    df = pd.DataFrame(
        df.loc[:, asset_name_1] - beta * df.loc[:, asset_name_2] - intercept,
        columns=["spread"],
        index=df.index,
    )

    state = 0  # initial state
    result = ResultStrategy([], [], [], [], [], [])

    for t in df.index:
        if state == 0:
            if df.loc[t, "spread"] > (2 * sigma):
                # here we SHORT
                state = 1
                enter_pos_date = t
                result.enter_long.append(t)

            if df.loc[t, "spread"] < (-2 * sigma):
                # here we go LONG
                state = -1
                enter_pos_date = t
                result.enter_short.append(t)

        elif (state == 1 and df.loc[t, "spread"] <= 0) or (
            state == 1 and t == end_date
        ):
            state = 0
            return_ = (df.loc[enter_pos_date, "spread"] - df.loc[t, "spread"]) / df.loc[
                enter_pos_date, "spread"
            ]
            result.ret_long.append(return_)
            result.exit_long.append(t)

        elif (state == -1 and df.loc[t, "spread"] >= 0) or (
            state == -1 and t == end_date
        ):
            state = 0
            return_ = (
                -(df.loc[enter_pos_date, "spread"] - df.loc[t, "spread"])
                / df.loc[enter_pos_date, "spread"]
            )
            result.ret_short.append(return_)
            result.exit_short.append(t)

    return result

In [109]:
init_date = df.iloc[1996].name
end_date = df.iloc[1996 + 1440].name

res = apply_twosigma(
    df,
    "ETH",
    "BTC",
    beta=0.013447,
    intercept=335.928688,
    sigma=3.932936,
    start_date=init_date,
    end_date=end_date,
)

In [110]:
res

ResultStrategy(ret_long=[1.0278503237056664, 1.1012501529479342, 1.057788275231827, 1.0487803151621249, 1.0868586480569309, -1.0693080512227606], ret_short=[-1.0729041984795125], enter_long=[Timestamp('2021-01-02 13:13:00+0000', tz='UTC'), Timestamp('2021-01-02 15:35:00+0000', tz='UTC'), Timestamp('2021-01-02 16:58:00+0000', tz='UTC'), Timestamp('2021-01-02 20:59:00+0000', tz='UTC'), Timestamp('2021-01-03 06:18:00+0000', tz='UTC'), Timestamp('2021-01-03 08:35:00+0000', tz='UTC')], enter_short=[Timestamp('2021-01-02 09:50:00+0000', tz='UTC')], exit_long=[Timestamp('2021-01-02 14:29:00+0000', tz='UTC'), Timestamp('2021-01-02 16:14:00+0000', tz='UTC'), Timestamp('2021-01-02 18:06:00+0000', tz='UTC'), Timestamp('2021-01-03 01:44:00+0000', tz='UTC'), Timestamp('2021-01-03 07:48:00+0000', tz='UTC'), Timestamp('2021-01-03 09:16:00+0000', tz='UTC')], exit_short=[Timestamp('2021-01-02 12:38:00+0000', tz='UTC')])

In [115]:

def apply_twosigma_v2(
    df_, asset_name_1, asset_name_2, beta, intercept, sigma, start_date, end_date
):
    """
    index of df must be datetime
    updated version with only one trade for trading period 

    """

    df = df_.loc[start_date:end_date, :]
    index_ = df.index
    df = pd.DataFrame(
        df.loc[:, asset_name_1] - beta * df.loc[:, asset_name_2] - intercept,
        columns=["spread"],
        index=df.index,
    )

    state = 0  # initial state
    result = ResultStrategy([], [], [], [], [], [])

    for t in df.index:
        if state == 0:
            if df.loc[t, "spread"] > (2 * sigma):
                # here we SHORT
                state = 1
                enter_pos_date = t
                result.enter_long.append(t)

            if df.loc[t, "spread"] < (-2 * sigma):
                # here we go LONG
                state = -1
                enter_pos_date = t
                result.enter_short.append(t)

        elif (state == 1 and df.loc[t, "spread"] <= 0) or (
            state == 1 and t == end_date
        ):
            state = 0
            return_ = (df.loc[enter_pos_date, "spread"] - df.loc[t, "spread"]) / df.loc[
                enter_pos_date, "spread"
            ]
            result.ret_long.append(return_)
            result.exit_long.append(t)
            return result

        elif (state == -1 and df.loc[t, "spread"] >= 0) or (
            state == -1 and t == end_date
        ):
            state = 0
            return_ = (
                -(df.loc[enter_pos_date, "spread"] - df.loc[t, "spread"])
                / df.loc[enter_pos_date, "spread"]
            )
            result.ret_short.append(return_)
            result.exit_short.append(t)
            return result

    return result

In [116]:
init_date = df.iloc[1996].name
end_date = df.iloc[1996 + 1440].name

res = apply_twosigma_v2(
    df,
    "ETH",
    "BTC",
    beta=0.013447,
    intercept=335.928688,
    sigma=3.932936,
    start_date=init_date,
    end_date=end_date,
)

In [117]:
res

ResultStrategy(ret_long=[], ret_short=[-1.0729041984795125], enter_long=[], enter_short=[Timestamp('2021-01-02 09:50:00+0000', tz='UTC')], exit_long=[], exit_short=[Timestamp('2021-01-02 12:38:00+0000', tz='UTC')])