In [6]:
import numpy as np
import pandas as pd

In [7]:
def seed_everything(seed):
    """Seed the random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    exports ``PYTHONHASHSEED`` to the process environment.

    Parameters
    ----------
    seed : int
        Seed value applied to every generator.

    Notes
    -----
    ``PYTHONHASHSEED`` is read by the interpreter at start-up, so setting it
    here affects child processes rather than the hash randomization of the
    kernel that is already running.
    """
    # Imported locally so this cell runs on a fresh kernel: the visible
    # import cell only brings in numpy and pandas.
    import os
    import random

    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)


SEED = 42
seed_everything(SEED)

In [8]:
def reduce_memory_usage(df, use_float16=True):
    """Downcast numeric columns of ``df`` to the smallest dtype that fits.

    The frame is modified in place and also returned. Memory usage before and
    after the conversion is printed.

    Only plain NumPy signed-integer, unsigned-integer and floating columns are
    touched. Every other dtype (object/string, bool, datetime, categorical,
    nullable extension types, ...) is left unchanged, so such columns are
    neither coerced to float nor cause a comparison error.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are downcast in place.
    use_float16 : bool, default True
        When True, float columns whose min/max fit in float16 are stored as
        float16. float16 keeps only about three significant decimal digits,
        so pass False to stop at float32 when that precision loss matters.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate target dtypes per NumPy dtype kind, smallest first.
    candidates = {
        'i': (np.int8, np.int16, np.int32),
        'u': (np.uint8, np.uint16, np.uint32),
        'f': (np.float16, np.float32) if use_float16 else (np.float32,),
    }

    for col in df.columns:
        col_type = df[col].dtype

        # Skip anything that is not a plain NumPy int/uint/float column.
        if not isinstance(col_type, np.dtype) or col_type.kind not in candidates:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        limits_of = np.finfo if col_type.kind == 'f' else np.iinfo

        for candidate in candidates[col_type.kind]:
            candidate = np.dtype(candidate)
            # Never widen a column; stop once candidates are no longer smaller.
            if candidate.itemsize >= col_type.itemsize:
                break
            limits = limits_of(candidate)
            # Inclusive bounds: a value equal to the limit still fits. If the
            # column is empty or all-NaN, min/max are NaN, both comparisons
            # are False and the column keeps its dtype.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio so a frame reporting zero bytes does not yield NaN.
    reduction = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
    print('Decreased by {:.1f}%'.format(reduction))

    return df

In [33]:
# Read the preprocessed training and supplemental CSVs, downcasting each
# frame's numeric columns right after loading.
train_csv_path = "../../../data/preprocessed/stock_prices_train_1_7_30_60.csv"
val_csv_path = "../../../data/preprocessed/stock_prices_supplemental_1_7_30_60.csv"

train = reduce_memory_usage(pd.read_csv(train_csv_path))
val = reduce_memory_usage(pd.read_csv(val_csv_path))


Memory usage of dataframe is 284.73 MB
Memory usage after optimization is: 124.57 MB
Decreased by 56.2%
Memory usage of dataframe is 32.94 MB
Memory usage after optimization is: 14.41 MB
Decreased by 56.2%


In [34]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,Date,SecuritiesCode,Open,High,Low,Volume,Target,AdjustedClose,return_1days,volatility_1days,return_7days,volatility_7days,return_30days,volatility_30days,return_60days,volatility_60days
0,2017-01-04,1301,2734.0,2755.0,2730.0,31400,0.000731,2742.0,2742.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2017-01-05,1301,2743.0,2747.0,2735.0,17900,0.00292,2738.0,2738.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2017-01-06,1301,2734.0,2744.0,2720.0,19900,-0.001092,2740.0,2740.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2017-01-10,1301,2745.0,2754.0,2735.0,24200,-0.0051,2748.0,2748.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2017-01-11,1301,2748.0,2752.0,2737.0,9300,-0.003296,2745.0,2745.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
# Hold out the rows dated 2022-06-24 as the test set; everything else in the
# supplemental frame stays in the validation set.
held_out_date = '2022-06-24'
is_held_out = val['Date'] == held_out_date
test = val[is_held_out]
val = val[~is_held_out]

In [36]:
def feval_pearsonr(y_pred, lgb_train):
    """Custom evaluation metric: Pearson correlation of predictions vs. labels.

    Parameters
    ----------
    y_pred : array-like
        Predicted values for the dataset being evaluated.
    lgb_train : object with a ``get_label()`` method
        Dataset being evaluated; its labels are read via ``get_label()``.

    Returns
    -------
    tuple
        ``('pearsonr', correlation, True)``; the trailing ``True`` marks the
        metric as higher-is-better.
    """
    # Imported locally so the function does not depend on a later notebook
    # cell having already imported scipy.
    from scipy import stats

    y_true = lgb_train.get_label()
    corr = stats.pearsonr(y_true, y_pred)[0]
    # pearsonr yields NaN when either input is constant (e.g. a model that
    # still predicts a single value). NaN compares False against everything,
    # so it cannot be ranked by a best-score comparison; report "no
    # correlation" instead.
    if np.isnan(corr):
        corr = 0.0
    return 'pearsonr', corr, True

In [37]:
import time
import lightgbm as lgb
from sklearn.model_selection import cross_val_score, KFold, TimeSeriesSplit, GroupKFold, StratifiedKFold
from scipy import stats

start = time.time()
params_lgb = {
    'learning_rate': 0.005,
    'metric': 'None',  # no built-in metric; feval_pearsonr is the only score reported
    'objective': 'regression',
    'boosting': 'gbdt',
    'verbosity': 0,
    'n_jobs': -1,
    'force_col_wise': True,
    'seed': SEED,  # reuse the notebook-wide seed for LightGBM as well
}

# 'Unnamed: 0' is the row counter that came in with the CSV (see the head()
# output); it carries no signal and must not be used as a feature.
non_feature_columns = ['Unnamed: 0', 'SecuritiesCode', 'Date', 'Target']
features = [c for c in train.columns if c not in non_feature_columns]

train_dataset = lgb.Dataset(train[features], train["Target"], feature_name=features)
# Tie the validation set to the training set so both share the same binning.
val_dataset = lgb.Dataset(val[features], val["Target"], feature_name=features,
                          reference=train_dataset)

model = lgb.train(params=params_lgb,
                  train_set=train_dataset,
                  valid_sets=[train_dataset, val_dataset],
                  num_boost_round=3000,
                  feval=feval_pearsonr,
                  callbacks=[lgb.early_stopping(stopping_rounds=300, verbose=True)])

elapsed_time = time.time() - start
print ("elapsed_time:{0}".format(elapsed_time) + "[sec]")

[1]	training's pearsonr: 0.0474462	valid_1's pearsonr: -0.0327026
Training until validation scores don't improve for 300 rounds
[2]	training's pearsonr: 0.0474462	valid_1's pearsonr: -0.0327026
[3]	training's pearsonr: 0.0475192	valid_1's pearsonr: -0.0329411
[4]	training's pearsonr: 0.0481344	valid_1's pearsonr: -0.0331592
[5]	training's pearsonr: 0.0480115	valid_1's pearsonr: -0.0332017
[6]	training's pearsonr: 0.0481787	valid_1's pearsonr: -0.0334322
[7]	training's pearsonr: 0.048216	valid_1's pearsonr: -0.0334421
[8]	training's pearsonr: 0.0488622	valid_1's pearsonr: -0.033486
[9]	training's pearsonr: 0.0488026	valid_1's pearsonr: -0.0334795
[10]	training's pearsonr: 0.0491953	valid_1's pearsonr: -0.0335332
[11]	training's pearsonr: 0.0494346	valid_1's pearsonr: -0.0335638
[12]	training's pearsonr: 0.049427	valid_1's pearsonr: -0.0336699
[13]	training's pearsonr: 0.0495745	valid_1's pearsonr: -0.0336798
[14]	training's pearsonr: 0.0504838	valid_1's pearsonr: -0.0339984
[15]	trainin

In [38]:
from sklearn.metrics import mean_squared_error

# Predict the target for the held-out test rows with the trained LightGBM model.
preds = model.predict(test[features])

# Mean squared error of those predictions on the test set.
y_test = test["Target"]
mse = mean_squared_error(y_true=y_test, y_pred=preds)
print(mse)

0.0003544228979786141
