In [6]:
import numpy as np
import pandas as pd
import math
import os
from scipy import stats
import lightgbm as lgb
#import data.jpx_tokyo_market_prediction
import matplotlib.pyplot as plt
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.model_selection import cross_val_score, KFold, TimeSeriesSplit, GroupKFold, StratifiedKFold
from sklearn.linear_model import LogisticRegression
import joblib
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings('ignore')

In [7]:
def seed_everything(seed):
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

SEED=42
seed_everything(SEED)

In [8]:
def reduce_memory_usage(df):
  
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [26]:
train = reduce_memory_usage(pd.read_csv("data/train_files/stock_prices.csv"))
train=train.drop(columns=['RowId','ExpectedDividend','AdjustmentFactor','SupervisionFlag']).dropna().reset_index(drop=True)
val = reduce_memory_usage(pd.read_csv("data/supplemental_files/stock_prices.csv"))
val = val.drop(columns=['RowId','ExpectedDividend','AdjustmentFactor','SupervisionFlag'])

Memory usage of dataframe is 197.98 MB
Memory usage after optimization is: 102.33 MB
Decreased by 48.3%
Memory usage of dataframe is 22.91 MB
Memory usage after optimization is: 11.84 MB
Decreased by 48.3%


In [27]:
train['Date'] = pd.to_datetime(train['Date'])
val['Date'] = pd.to_datetime(val['Date'])

In [28]:
def add_features(feats):
    feats["return_1month"] = feats["Close"].pct_change(20)
    feats["return_2month"] = feats["Close"].pct_change(40)
    feats["return_3month"] = feats["Close"].pct_change(60)
    feats["volatility_1month"] = (
        np.log(feats["Close"]).diff().rolling(20).std()
    )
    feats["volatility_2month"] = (
        np.log(feats["Close"]).diff().rolling(40).std()
    )
    feats["volatility_3month"] = (
        np.log(feats["Close"]).diff().rolling(60).std()
    )
    feats["MA_gap_1month"] = feats["Close"] / (
        feats["Close"].rolling(20).mean()
    )
    feats["MA_gap_2month"] = feats["Close"] / (
        feats["Close"].rolling(40).mean()
    )
    feats["MA_gap_3month"] = feats["Close"] / (
        feats["Close"].rolling(60).mean()
    )
    return feats


In [29]:
val = add_features(val)
train = add_features(train)

In [31]:
print(val.shape)
print(train.shape)

(269881, 17)
(2324923, 17)
['2021-12-06T00:00:00.000000000' '2021-12-07T00:00:00.000000000'
 '2021-12-08T00:00:00.000000000' '2021-12-09T00:00:00.000000000'
 '2021-12-10T00:00:00.000000000' '2021-12-13T00:00:00.000000000'
 '2021-12-14T00:00:00.000000000' '2021-12-15T00:00:00.000000000'
 '2021-12-16T00:00:00.000000000' '2021-12-17T00:00:00.000000000'
 '2021-12-20T00:00:00.000000000' '2021-12-21T00:00:00.000000000'
 '2021-12-22T00:00:00.000000000' '2021-12-23T00:00:00.000000000'
 '2021-12-24T00:00:00.000000000' '2021-12-27T00:00:00.000000000'
 '2021-12-28T00:00:00.000000000' '2021-12-29T00:00:00.000000000'
 '2021-12-30T00:00:00.000000000' '2022-01-04T00:00:00.000000000'
 '2022-01-05T00:00:00.000000000' '2022-01-06T00:00:00.000000000'
 '2022-01-07T00:00:00.000000000' '2022-01-11T00:00:00.000000000'
 '2022-01-12T00:00:00.000000000' '2022-01-13T00:00:00.000000000'
 '2022-01-14T00:00:00.000000000' '2022-01-17T00:00:00.000000000'
 '2022-01-18T00:00:00.000000000' '2022-01-19T00:00:00.000000000

In [32]:
#keep 2022-06-24 for the test set
test = val[val['Date'] == '2022-06-24']
val = val[val['Date'] != '2022-06-24']

In [33]:
def feval_rmse(y_pred, lgb_train):
    y_true = lgb_train.get_label()
    return 'rmse', mean_squared_error(y_true, y_pred), False

def feval_pearsonr(y_pred, lgb_train):
    y_true = lgb_train.get_label()
    return 'pearsonr', stats.pearsonr(y_true, y_pred)[0], True

def calc_spread_return_per_day(df, portfolio_size=200, toprank_weight_ratio=2):
    assert df['Rank'].min() == 0
    assert df['Rank'].max() == len(df['Rank']) - 1
    weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)
    purchase = (df.sort_values(by='Rank')['Target'][:portfolio_size] * weights).sum() / weights.mean()
    short = (df.sort_values(by='Rank', ascending=False)['Target'][:portfolio_size] * weights).sum() / weights.mean()
    return purchase - short

def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size=200, toprank_weight_ratio=2):
    buf = df.groupby('Date').apply(calc_spread_return_per_day, portfolio_size, toprank_weight_ratio)
    sharpe_ratio = buf.mean() / buf.std()
    return sharpe_ratio#, buf

def add_rank(df):
    df["Rank"] = df.groupby("Date")["Target"].rank(ascending=False, method="first") - 1 
    df["Rank"] = df["Rank"].astype("int")
    return df

def fill_nan_inf(df):
    df = df.fillna(0)
    df = df.replace([np.inf, -np.inf], 0)
    return df

def check_score(df,preds,Securities_filter=[]):
    tmp_preds=df[['Date','SecuritiesCode']].copy()
    tmp_preds['Target']=preds
    
    #Rank Filter. Calculate median for this date and assign this value to the list of Securities to filter.
    tmp_preds['target_mean']=tmp_preds.groupby("Date")["Target"].transform('median')
    tmp_preds.loc[tmp_preds['SecuritiesCode'].isin(Securities_filter),'Target']=tmp_preds['target_mean']
    
    tmp_preds = add_rank(tmp_preds)
    df['Rank']=tmp_preds['Rank']
    score=round(calc_spread_return_sharpe(df, portfolio_size= 200, toprank_weight_ratio= 2),5)
    score_mean=round(df.groupby('Date').apply(calc_spread_return_per_day, 200, 2).mean(),5)
    score_std=round(df.groupby('Date').apply(calc_spread_return_per_day, 200, 2).std(),5)
    print(f'Competition_Score:{score}, rank_score_mean:{score_mean}, rank_score_std:{score_std}')

In [34]:
all_data = pd.concat([train, val])

In [35]:
list_spred_l=list((all_data.groupby('SecuritiesCode')['Target'].max()-all_data.groupby('SecuritiesCode')['Target'].min()).sort_values()[:1000].index)
list_spred_h=list((all_data.groupby('SecuritiesCode')['Target'].max()-all_data.groupby('SecuritiesCode')['Target'].min()).sort_values()[1000:].index)

In [36]:
import time

start = time.time()
# Training just with Securities with hight target_spread and validated with Securities with low target_spread.

# features =['High','Low','Open','Close','Volume', 'return_1month', 'return_2month', 'return_3month', 'volatility_1month', 'volatility_2month', 'volatility_3month',
#        'MA_gap_1month', 'MA_gap_2month', 'MA_gap_3month']
features =['High','Low','Open','Close','Volume',
       'MA_gap_1month', 'MA_gap_2month', 'MA_gap_3month']
# features =['High','Low','Open','Close','Volume',]
train=fill_nan_inf(train)
val=fill_nan_inf(val)
all_data = fill_nan_inf(all_data)
params_lgb = {'learning_rate': 0.005,'metric':'None','objective': 'regression','boosting': 'gbdt','verbosity': 0,'n_jobs': -1,'force_col_wise':True}  

train_dataset = lgb.Dataset(all_data[all_data['SecuritiesCode'].isin(list_spred_h)][features],all_data[all_data['SecuritiesCode'].isin(list_spred_h)]["Target"],feature_name = features )
val_dataset = lgb.Dataset(all_data[all_data['SecuritiesCode'].isin(list_spred_l)][features], all_data[all_data['SecuritiesCode'].isin(list_spred_l)]["Target"],feature_name = features)

model = lgb.train(params = params_lgb, 
                train_set = train_dataset, 
                valid_sets = [train_dataset, val_dataset], 
                num_boost_round = 3000, 
                feval=feval_pearsonr,
                callbacks=[ lgb.early_stopping(stopping_rounds=300, verbose=True)])    

elapsed_time = time.time() - start
print ("elapsed_time:{0}".format(elapsed_time) + "[sec]")

[1]	training's pearsonr: 0.0297045	valid_1's pearsonr: 0.0111404
Training until validation scores don't improve for 300 rounds
[2]	training's pearsonr: 0.0297045	valid_1's pearsonr: 0.0111404
[3]	training's pearsonr: 0.0297175	valid_1's pearsonr: 0.0112094
[4]	training's pearsonr: 0.0297155	valid_1's pearsonr: 0.0111929
[5]	training's pearsonr: 0.0297186	valid_1's pearsonr: 0.0112224
[6]	training's pearsonr: 0.0320867	valid_1's pearsonr: 0.0121839
[7]	training's pearsonr: 0.0333168	valid_1's pearsonr: 0.0124904
[8]	training's pearsonr: 0.0329675	valid_1's pearsonr: 0.0123575
[9]	training's pearsonr: 0.0339463	valid_1's pearsonr: 0.0124117
[10]	training's pearsonr: 0.0346647	valid_1's pearsonr: 0.0124765
[11]	training's pearsonr: 0.0344319	valid_1's pearsonr: 0.0125227
[12]	training's pearsonr: 0.0342181	valid_1's pearsonr: 0.0125236
[13]	training's pearsonr: 0.0346622	valid_1's pearsonr: 0.0124359
[14]	training's pearsonr: 0.0349923	valid_1's pearsonr: 0.0124
[15]	training's pearsonr: 

Prescription: SAA policy

In [38]:
#make prediction with our lightgbm model on the test set
preds = model.predict(test[features])

#compute the mse of the set on the test test
mse = mean_squared_error(test["Target"], preds)
print(mse)

0.000353936029523914


In [40]:
from sklearn import tree
model = tree.DecisionTreeRegressor(criterion="squared_error", min_samples_leaf=2)
model = model.fit(all_data[features], all_data["Target"])

preds_cart =  model.predict(test[features])
mse = mean_squared_error(test["Target"], preds_cart)
print(mse)

ValueError: Input X contains NaN.
DecisionTreeRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# gather in the same sets all the datapoints of the train set that fall in the same leaf of the tree as the datapoints of the test set

for i, row in test.iterrows():
    