In [None]:
%cd /app

import os
import pandas as pd
import datetime
import numpy as np
from pandas import IndexSlice as idx
import matplotlib.pyplot as plt

In [None]:
# Load the four raw inputs. The first column (first two columns for the
# positioning file) becomes the index and is parsed as dates.
price_df = pd.read_csv("data/price_data.csv", index_col=0, parse_dates=True)
balance_df = pd.read_csv("data/balance_data.csv", index_col=0, parse_dates=True)
macro_df = pd.read_csv("data/macro.csv", index_col=0, parse_dates=True)
cot_df = pd.read_csv("data/mm_positioning.csv", index_col=[0, 1], parse_dates=True)

# Keep only the rows whose second index level is 'wti', then drop that level
# so the positioning frame is indexed like the other three.
wti_cot_df = cot_df.loc[idx[:, 'wti'], :].droplevel(1, axis=0)

# Column-wise outer join: every index value present in any source is kept.
combined_raw_df = pd.concat(
    [price_df, balance_df, macro_df, wti_cot_df],
    axis=1,
    join="outer",
)

cot_df_columns = cot_df.columns.to_list()
balance_df_columns = balance_df.columns.to_list()

# interpolations (time-weighted): positioning columns first, then balance columns
for interpolated_columns in (cot_df_columns, balance_df_columns):
    combined_raw_df.loc[:, interpolated_columns] = (
        combined_raw_df.loc[:, interpolated_columns].interpolate(method='time')
    )

# Interpolation can produce fractional values; every column whose name
# contains 'trader' is rounded back to the nearest integer.
trader_columns = [name for name in combined_raw_df.columns if 'trader' in name]
combined_raw_df.loc[:, trader_columns] = combined_raw_df.loc[:, trader_columns].round()

# Friday values only (pandas dayofweek: Monday == 0, so Friday == 4)
is_friday = combined_raw_df.index.dayofweek == 4
combined_df = combined_raw_df.loc[is_friday]

In [None]:
# Trim the sample to the window bounded below by the first valid 'treas_2yr'
# observation and above by the last valid 'wti_cush_spot' observation.
last_spot_date = combined_df['wti_cush_spot'].last_valid_index()
first_treasury_date = combined_df['treas_2yr'].first_valid_index()
in_sample_window = (combined_df.index <= last_spot_date) & (combined_df.index >= first_treasury_date)
combined_df = combined_df.loc[in_sample_window]

In [None]:
def threshold_labeler(arr: np.array, thresh) -> np.array:
    """Label each element by which side of the band [-thresh, thresh] it falls on.

    Parameters
    ----------
    arr : array-like
        Numeric values to label. Converted with ``np.asarray``.
    thresh : scalar
        Half-width of the neutral band. Expected to be non-negative; for a
        negative value the two conditions overlap and the ``1`` label wins.

    Returns
    -------
    np.ndarray
        Array of the same shape and dtype as ``arr`` holding ``1`` where
        ``arr > thresh``, ``-1`` where ``arr < -thresh`` and ``0`` elsewhere.
        Values exactly equal to ``±thresh`` and NaNs are labelled ``0``.

    Notes
    -----
    The comparisons are made against the original values only, so an input
    that merely *equals* 1 or -1 while lying inside the band is labelled 0
    rather than being passed through as if it were already a label.
    """
    arr = np.asarray(arr)
    labels = np.zeros(arr.shape, dtype=arr.dtype)
    # Assign -1 first so that 1 takes precedence if the conditions ever overlap.
    labels[arr < -thresh] = -1
    labels[arr > thresh] = 1
    return labels


def signed_accumulation(arr):
    """Running signed streak length over a 1-D sequence of labels.

    Walks the labels in order. While the same ``1`` label repeats the
    counter goes 1, 2, 3, ...; while the same ``-1`` label repeats it goes
    -1, -2, -3, .... Whenever the label changes, the counter restarts at the
    new label's value (so a ``0`` label yields 0).

    Parameters
    ----------
    arr : array-like
        1-D sequence of labels, normally drawn from {-1, 0, 1}.

    Returns
    -------
    np.ndarray
        Float array of the same shape as ``arr``; element ``i`` is the streak
        counter after processing ``arr[i]``. Empty input gives an empty array.

    Example
    -------
    ``[1, 1, 0, -1, -1, -1, 1]`` -> ``[1, 2, 0, -1, -2, -3, 1]``
    """
    arr = np.asarray(arr)
    p_arr = np.zeros(arr.shape)

    # Nothing to accumulate; also avoids indexing arr[0] on empty input.
    if arr.size == 0:
        return p_arr

    # Seed the state from the first element; anything other than +/-1 starts neutral.
    _type = 0
    _counter = 0
    if arr[0] == 1:
        _type = 1
        _counter = 1
    elif arr[0] == -1:
        _type = -1
        _counter = -1

    p_arr[0] = _counter

    # start=1 keeps the write index aligned with the element being processed;
    # position 0 was already filled from the seed above.
    for i, x in enumerate(arr[1:], start=1):
        if x == _type:
            # Same label as before: extend the streak in the label's direction.
            if _type == 1:
                _counter += 1
            elif _type == -1:
                _counter -= 1
        else:
            # Label changed: restart the streak at the new label's value.
            _type = x
            _counter = x
        p_arr[i] = _counter

    return p_arr

# Quick visual check: label the integers -4..4 against a threshold of 1.
threshold_labeler(np.arange(-4, 5), 1)

In [None]:
# Work on a copy of the single source column so the generated features can be
# joined back onto combined_df in one step.
df = combined_df[['wti_cush_spot_sd_distance']].copy()

# For thresholds 1 through 4: a -1/0/1 label column, plus the running signed
# streak length of that label.
for threshold in range(1, 5):
    labels = threshold_labeler(df['wti_cush_spot_sd_distance'].values, threshold)
    df[f'wti_cush_spot_sd_distance_{threshold}'] = labels
    streaks = signed_accumulation(df[f'wti_cush_spot_sd_distance_{threshold}'].values)
    df[f'wti_cush_spot_sd_distance_{threshold}_accumulated'] = streaks

# The source column already exists in combined_df, so only the new columns are joined.
combined_df = combined_df.join(df.drop(columns='wti_cush_spot_sd_distance'))

In [None]:
# lags and target variable engineering

# Column groups picked up by name pattern.
price_extension_variables = [name for name in combined_df.columns if 'sd_distance' in name]
cot_extension_variables = [name for name in combined_df.columns if 'extension' in name]

# Column groups listed explicitly.
balance_variables = ['net_exports', 'total_production', 'cushing_stocks', 'total_us_inc_spr', 'spr']
balance_deviation_variables = ['total_us_ex_spr_5yr_deviation', 'cushing_stocks_5yr_deviation']
balance_yoy_variables = [name + "_yoy" for name in balance_variables]

macro_variables = macro_df.columns.to_list()
price_variables = ['rbob_m1', 'wti_cush_spot', 'ho_m1', '321_spread']

# Restrict the frame to the modelling columns, in this group order.
model_columns = (
    price_extension_variables
    + cot_extension_variables
    + balance_variables
    + balance_deviation_variables
    + balance_yoy_variables
    + macro_variables
    + price_variables
)
combined_df = combined_df.loc[:, model_columns].copy()

target_variable = ['wti_cush_spot']
dependent_variables = [name for name in combined_df.columns if name not in target_variable]

# differenced variables: price, balance and macro columns become
# period-over-period changes (applied in that order)
for level_columns in (price_variables, balance_variables, macro_variables):
    combined_df.loc[:, level_columns] = combined_df.loc[:, level_columns].diff()

# lags: every non-target column is pushed down one row, so each row pairs the
# previous row's features with the current row's target
combined_df.loc[:, dependent_variables] = combined_df.loc[:, dependent_variables].shift()

# target variable: keep only the sign (-1, 0, 1) of the differenced spot price
combined_df[target_variable] = combined_df[target_variable].apply(np.sign)

In [None]:
# Skip the first two rows (presumably the ones left incomplete by the
# diff-then-shift steps - TODO confirm), drop rows with no target value,
# and write the result out with its index.
final_df = combined_df.iloc[2:].dropna(subset=["wti_cush_spot"])
final_df.to_csv("data/final.csv", index=True)

In [None]:
#combined_df.to_csv("data/final.csv", index=True)

In [None]:
# combined_df = pd.merge(
#     mm_df, balance_df, left_index=True, right_index=True, how = "outer"
# )
# combined_df = combined_df.loc[combined_df.index >= mm_df.index.min()]
# combined_df[['crude_exports', 'crude_imports', 'total_production', 'cushing_stocks', 'total_us_inc_spr', 'spr']] = combined_df[['crude_exports', 'crude_imports', 'total_production', 'cushing_stocks', 'total_us_inc_spr', 'spr']].interpolate(method="linear")
# combined_df = combined_df.loc[combined_df.index.isin(mm_df.index), ['mm_long', 'mm_short', 'mm_net', 'crude_exports', 'crude_imports', 'total_production', 'cushing_stocks',  'total_us_inc_spr', 'spr']]
# combined_df = combined_df.join(price_df).join(macro_df)
# combined_df = combined_df.loc[(combined_df.index <= combined_df['wti_cush_spot'].last_valid_index()) & (combined_df.index >= combined_df['treas_2yr'].first_valid_index())]
# combined_df['treas_2yr'] = combined_df['treas_2yr'].astype(float)

# levels = ['cushing_stocks', 'total_us_inc_spr', 'wti_cush_spot', '321_spread']
# levels_df = combined_df[levels].copy()

# final_df = combined_df.diff().join(levels_df, rsuffix="_level")
# final_df['time_index'] = np.arange(0, len(final_df))

# final_df.iloc[1:].to_csv("data/final.csv")