In [29]:
import pandas as pd
# pd.options.mode.chained_assignment = None
import numpy as np
import joblib
from autoIG.epics import GOLD_EPIC, US_CRUDE_OIL_EPIC
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn import set_config
from autoIG.modelling import create_future_bid_Open
from autoIG.modelling import fillna_,create_past_ask_Open


# Ask scikit-learn transformers to return pandas DataFrames instead of NumPy arrays.
set_config(transform_output="pandas")


# Persist the data, so we can train on the same dataset each time.
# This should be tracked with DVC.
# LOAD_DATA=True  -> fetch fresh prices from IG and overwrite model_data.pkl.
# LOAD_DATA=False -> read the previously saved model_data.pkl.
LOAD_DATA = False
# SAVE_MODEL=True  -> write the fitted pipeline to model.pkl.
# SAVE_MODEL=False -> replace the pipeline with the one stored in model.pkl.
SAVE_MODEL = True



def generate_target_2(df, number_of_periods=None):
    """
    Build the training target: a shifted bid open divided by the current ask open.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column "ASK_OPEN" and the shifted bid column
        "BID_OPEN_S<k>", where <k> is the number of periods used.
    number_of_periods : int, optional
        Selects which shifted bid column to use (the <k> in "BID_OPEN_S<k>").
        When None, 3 is used, which matches the previously hard-coded
        "BID_OPEN_S3" column. The requested column must already exist in `df`;
        this function does not create it.

    Returns
    -------
    pandas.Series
        Element-wise ratio ``df["BID_OPEN_S<k>"] / df["ASK_OPEN"]``.

    Raises
    ------
    KeyError
        If either required column is missing from `df`.
    """
    # Keep the historical behaviour (3 periods) when the caller does not choose.
    periods = 3 if number_of_periods is None else number_of_periods
    return df[f"BID_OPEN_S{periods}"] / df["ASK_OPEN"]


# Keyword arguments forwarded to the IG historical-prices request.
historical_prices_config = {"resolution": "1Min", "numpoints": 1_000}

if not LOAD_DATA:
    # Reuse the snapshot written by an earlier fetch so every run trains on the
    # same data. Pickle files should only be read from a trusted source.
    model_data = pd.read_pickle("model_data.pkl")
else:
    # Imported here so the IG client is only required when actually fetching.
    from trading_ig.config import config
    from trading_ig.rest import IGService

    ig_service = IGService(config.username, config.password, config.api_key)
    ig = ig_service.create_session()
    results_ = ig_service.fetch_historical_prices_by_epic(
        US_CRUDE_OIL_EPIC, **historical_prices_config
    )
    # Keep the price table and snapshot it for later runs.
    model_data = results_["prices"]
    model_data.to_pickle("model_data.pkl")


In [30]:
# !! DO ALL PREPROCESSING AS PART OF THE MODEL PIPELINE SO IT CAN EASILY BE USED IN PROD
# !! The target is built outside the pipeline, because it is only needed for training

In [31]:
# We buy at the ask price and we sell at the bid price.
# NOTE(review): the original note continued "We buy at bid, but from then on
# everything we care about is ask, as that is our exit", which contradicts the
# line above; presumably bid and ask were swapped there (we exit at the bid) — confirm.
def adapt_data_for_training(df):
    """
    Bring the historical training data into the same form (column names etc.)
    as the data that is predicted on in production.

    Note: this does not perform any of the pre-processing steps; those are
    reserved for the pipeline. However, the pipeline needs its input to be in
    the right form before it can run.
    Furthermore, the creation of the target is only done in training and is
    therefore not part of any pre-processing step.

    Currently a placeholder: `df` is returned unchanged.
    """
    return df
    
# Flatten the two-level column labels into "<level0>_<level1>", keep only the
# open ask/bid prices under their upper-case names, name the index, and add the
# future bid column needed for the target.
model_data = (
    model_data.set_axis(
        model_data.columns.get_level_values(0)
        + "_"
        + model_data.columns.get_level_values(1),
        axis="columns",
    )
    .loc[:, ["ask_Open", "bid_Open"]]
    .rename(columns={"ask_Open": "ASK_OPEN", "bid_Open": "BID_OPEN"})
    .rename_axis(index="UPDATED_AT")
    .pipe(create_future_bid_Open)
)
# Attach the target column "r", then drop every row with a missing value.
model_data = model_data.assign(r=generate_target_2).dropna()
# Another idea for the return: did the price go up by x within y periods?
# We are not _only_ interested in the period immediately after.
# To choose a specific case, let's see if it went up by 1 in the next 3 periods.


In [32]:


from autoIG.modelling import normalise_

# Wrap each plain preprocessing function so it can be used as a pipeline step.
pastperiods_transformer = FunctionTransformer(create_past_ask_Open)
# Missing values are filled rather than dropped: dropping rows inside the
# pipeline would leave X with fewer rows than y after the transform.
fillna_transformer = FunctionTransformer(fillna_)
normalise_transformer = FunctionTransformer(normalise_)

# Preprocessing steps run in order, followed by the regressor.
pipeline_steps = [
    ("add_past_period_columns", pastperiods_transformer),
    ("fill_na", fillna_transformer),
    ("normalise", normalise_transformer),
    ("predictor", LinearRegression()),
]
pl = Pipeline(steps=pipeline_steps)


In [33]:
# Predict the return from the ask price.
# X holds only the current ask open; y is the target column "r".
X = model_data[['ASK_OPEN']]
y = model_data['r']

In [34]:
# Preview the preprocessing output: run every step except the final predictor.
pl[:-1].transform(X)

Unnamed: 0_level_0,ASK_OPEN,ASK_OPEN_S1,ASK_OPEN_S2,ASK_OPEN_S3
UPDATED_AT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-12-08 19:17:00,1.0,1.000000,1.000000,1.000000
2022-12-08 19:18:00,1.0,0.999805,0.999805,0.999805
2022-12-08 19:19:00,1.0,1.000376,1.000181,1.000181
2022-12-08 19:20:00,1.0,0.999624,1.000000,0.999805
2022-12-08 19:21:00,1.0,0.999221,0.998846,0.999221
...,...,...,...,...
2022-12-08 20:49:00,1.0,0.999021,0.998881,0.998322
2022-12-08 20:50:00,1.0,0.999776,0.998798,0.998658
2022-12-08 20:51:00,1.0,1.000741,1.000518,0.999538
2022-12-08 20:52:00,1.0,0.999678,1.000420,1.000196


In [35]:
# Fit the whole pipeline (preprocessing steps followed by LinearRegression) on X and y.
pl.fit(X,y)

In [36]:
# Summary statistics of the predictions on the same X the pipeline was fitted on.
pd.Series(pl.predict(X)).describe()

count    97.000000
mean      0.999422
std       0.000088
min       0.999144
25%       0.999375
50%       0.999418
75%       0.999478
max       0.999671
dtype: float64

In [37]:
# Persist the fitted pipeline, or swap in the previously saved one when saving is off.
if not SAVE_MODEL:
    # joblib files are pickle-based: only load a model file you trust.
    pl = joblib.load("model.pkl")
else:
    joblib.dump(pl, "model.pkl")

In [38]:
# Display the pipeline object.
pl

In [39]:
# Input column names the pipeline recorded; only the raw ASK_OPEN column is expected.
pl.feature_names_in_

array(['ASK_OPEN'], dtype=object)