In [16]:
import numpy as np 
import pandas as pd 
import os
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor

In [8]:
sample = pd.read_csv("example_test_files/sample_submission.csv")
sample.head()

Unnamed: 0,Date,SecuritiesCode,Rank
0,2021-12-06,1301,0
1,2021-12-06,1332,1
2,2021-12-06,1333,2
3,2021-12-06,1375,3
4,2021-12-06,1376,4


In [40]:
def calc_spread_return_per_day(df, portfolio_size, toprank_weight_ratio):
    assert df['Rank'].min() == 0
    assert df['Rank'].max() == len(df['Rank']) - 1
    weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)
    purchase = (df.sort_values(by='Rank')['Target'][:portfolio_size] * weights).sum() / weights.mean()
    short = (df.sort_values(by='Rank', ascending=False)['Target'][:portfolio_size] * weights).sum() / weights.mean()
    return purchase - short

def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size: int = 200, toprank_weight_ratio: float = 2) -> float:
    buf = df.groupby('Date').apply(calc_spread_return_per_day, portfolio_size, toprank_weight_ratio)
    sharpe_ratio = buf.mean() / buf.std()
    return sharpe_ratio, buf

def add_rank(df, col_name="pred"):
    df["Rank"] = df.groupby("Date")[col_name].rank(ascending=False, method="first") - 1 
    df["Rank"] = df["Rank"].astype("int")
    return df

## Overfitting

In [13]:
import optuna
optuna.logging.set_verbosity(optuna.logging.CRITICAL)

In [10]:
path = ""
df_prices = pd.read_csv(f"{path}supplemental_files/stock_prices.csv")
df_prices.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112000 entries, 0 to 111999
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   RowId             112000 non-null  object 
 1   Date              112000 non-null  object 
 2   SecuritiesCode    112000 non-null  int64  
 3   Open              111716 non-null  float64
 4   High              111716 non-null  float64
 5   Low               111716 non-null  float64
 6   Close             111716 non-null  float64
 7   Volume            112000 non-null  int64  
 8   AdjustmentFactor  112000 non-null  float64
 9   ExpectedDividend  503 non-null     float64
 10  SupervisionFlag   112000 non-null  bool   
 11  Target            112000 non-null  float64
dtypes: bool(1), float64(7), int64(2), object(2)
memory usage: 9.5+ MB


In [11]:
def prep_prices(price):
    price.fillna(0,inplace=True)
    return price

In [14]:
df_prices = prep_prices(df_prices)
pd.options.display.float_format = '{:,.6g}'.format
df_prices.describe()

Unnamed: 0,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,Target
count,112000.0,112000.0,112000.0,112000.0,112000.0,112000.0,112000.0,112000.0,112000.0
mean,5883.05,2677.51,2710.21,2638.92,2673.38,599551.0,0.99994,0.105025,-0.00107086
std,2389.82,4254.6,4305.0,4192.79,4246.42,2289320.0,0.00567792,2.66888,0.0238096
min,1301.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,-0.355
25%,3915.75,1021.0,1034.0,1008.0,1021.0,31900.0,1.0,0.0,-0.0123288
50%,6200.5,1808.0,1831.0,1782.0,1806.0,96900.0,1.0,0.0,-0.000478469
75%,7938.25,2980.0,3015.0,2938.0,2977.0,366800.0,1.0,0.0,0.0103883
max,9997.0,79030.0,79250.0,78600.0,79080.0,117726000.0,1.0,297.0,0.597907


In [18]:
feats = ["SecuritiesCode","Open","High","Low","Close","Volume",
         "AdjustmentFactor","ExpectedDividend","SupervisionFlag"]
model = DecisionTreeRegressor()
model.fit(df_prices[feats],df_prices["Target"])
model.score(df_prices[feats],df_prices["Target"])

0.9992327107815193

In [45]:
# By Yuike - https://www.kaggle.com/code/ikeppyo/examples-of-higher-scores-than-perfect-predictions

# This function adjusts the predictions so that the daily spread return approaches a certain value.
        
def adjuster(df):
    def calc_pred(df, x, y, z):
        return df['Target'].where(df['Target'].abs() < x, df['Target'] * y + np.sign(df['Target']) * z)

    def objective(trial, df):
        x = trial.suggest_uniform('x', 0, 0.2)
        y = trial.suggest_uniform('y', 0, 0.1)
        z = trial.suggest_uniform('z', 0, 1e-3)
        df["Rank"] = calc_pred(df, x, y, z).rank(ascending=False, method="first") - 1 
        df["Rank"] = df["Rank"].astype("int")
        return calc_spread_return_per_day(df, 200, 2)
    
    def predictor_per_day(df):
        study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=2022))
        study.optimize(lambda trial: abs(objective(trial, df) - 10), 100)
        return calc_pred(df, *study.best_params.values())

    return df.groupby("Date").apply(predictor_per_day).reset_index(level=0, drop=True)

def _predictor_base(feature_df):
    return model.predict(feature_df[feats])

def _predictor_with_adjuster(feature_df):
    feature_df["Target"] = model.predict(feature_df[feats])
    return adjuster(feature_df)

In [46]:
predictor = _predictor_with_adjuster

In [47]:
prices = pd.read_csv("example_test_files/stock_prices.csv")
prices.head()

Unnamed: 0,RowId,Date,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag
0,20211206_1301,2021-12-06,1301,2982,2982,2965,2971,8900,1,,False
1,20211206_1332,2021-12-06,1332,592,599,588,589,1360800,1,,False
2,20211206_1333,2021-12-06,1333,2368,2388,2360,2377,125900,1,,False
3,20211206_1375,2021-12-06,1375,1230,1239,1224,1224,81100,1,,False
4,20211206_1376,2021-12-06,1376,1339,1372,1339,1351,6200,1,,False


In [48]:
preds = pd.DataFrame(adjuster(df_prices))
preds

Unnamed: 0,Target
0,-0.00326264
1,-0.00899281
2,-0.00996264
3,-0.0150316
4,0.00286738
...,...
111995,-0.0135922
111996,-0.0205811
111997,0.00576184
111998,-0.00234101


In [49]:
preds['Date'] = df_prices['Date']
res = add_rank(preds, col_name='Target')

In [50]:
calc_spread_return_sharpe(res)[0]

19.518274236600806

In [22]:
env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()

for prices, options, financials, trades, secondary_prices, sample_prediction in iter_test:
    feature_df = df_prices[df_prices['Date'] == prices["Date"].iloc[0]].copy()
    feature_df["pred"] = adjuster(feature_df).iloc[0]
    feature_df = add_rank(feature_df)
    feature_map = feature_df.set_index('SecuritiesCode')['Rank'].to_dict()
    sample_prediction['Rank'] = sample_prediction['SecuritiesCode'].map(feature_map)
    env.predict(sample_prediction)

NameError: name 'jpx_tokyo_market_prediction' is not defined