In [12]:
import numpy as np
import pandas as pd

In [13]:
def seed_everything(seed):
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

SEED=42
seed_everything(SEED)

In [14]:
def reduce_memory_usage(df):
  
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [18]:
train = reduce_memory_usage(pd.read_csv("../../../data/preprocessed/stock_prices_train.csv"))
val = reduce_memory_usage(pd.read_csv("../../../data/preprocessed/stock_prices_supplemental.csv"))


Memory usage of dataframe is 427.10 MB
Memory usage after optimization is: 191.30 MB
Decreased by 55.2%
Memory usage of dataframe is 49.42 MB
Memory usage after optimization is: 22.13 MB
Decreased by 55.2%


In [19]:
train.head()

Unnamed: 0,Date,SecuritiesCode,Open,High,Low,Volume,Target,AdjustedClose,return_5days,ema_5days,...,return_30days,ema_30days,volatility_30days,return_60days,ema_60days,volatility_60days,Drawdown,Sector,Capitalization,Shares
0,2017-01-04,1301,2734.0,2755.0,2730.0,31400,0.000731,2742.0,0.0,2742.0,...,0.0,2742.0,0.0,0.0,2742.0,0.0,0.0,Foods,-0.141113,-0.162109
1,2017-01-05,1301,2743.0,2747.0,2735.0,17900,0.00292,2738.0,0.0,2740.666748,...,0.0,2741.741943,0.0,0.0,2741.868896,0.0,0.0,Foods,-0.141113,-0.162109
2,2017-01-06,1301,2734.0,2744.0,2720.0,19900,-0.001092,2740.0,0.0,2740.444336,...,0.0,2741.629639,0.0,0.0,2741.807617,0.0,0.004013,Foods,-0.141113,-0.162109
3,2017-01-10,1301,2745.0,2754.0,2735.0,24200,-0.0051,2748.0,0.0,2742.962891,...,0.0,2742.040527,0.0,0.0,2742.010498,0.0,0.008018,Foods,-0.141113,-0.162109
4,2017-01-11,1301,2748.0,2752.0,2737.0,9300,-0.003296,2745.0,2742.600098,2743.64209,...,0.0,2742.231445,0.0,0.0,2742.108643,0.0,0.006214,Foods,-0.141113,-0.162109


In [20]:
# replace missing values with 0
train.fillna(0, inplace=True)
val.fillna(0, inplace=True)


# encode categorical variables (Sector)
train = pd.get_dummies(train, columns=['Sector'])
val = pd.get_dummies(val, columns=['Sector'])

#keep 2022-06-24 for the test set
test = val[val['Date'] == '2022-06-24']
val = val[val['Date'] != '2022-06-24']



In [21]:
def feval_pearsonr(y_pred, lgb_train):
    y_true = lgb_train.get_label()
    return 'pearsonr', stats.pearsonr(y_true, y_pred)[0], True

In [24]:
import re
train = train.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
val = val.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
test = test.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
print(train.columns)

Index(['Date', 'SecuritiesCode', 'Open', 'High', 'Low', 'Volume', 'Target',
       'AdjustedClose', 'return_5days', 'ema_5days', 'volatility_5days',
       'return_10days', 'ema_10days', 'volatility_10days', 'return_30days',
       'ema_30days', 'volatility_30days', 'return_60days', 'ema_60days',
       'volatility_60days', 'Drawdown', 'Capitalization', 'Shares',
       'Sector_Automobilestransportationequipment', 'Sector_Banks',
       'Sector_Commercialwholesaletrade', 'Sector_Constructionmaterials',
       'Sector_Electricappliancesprecisioninstruments',
       'Sector_Electricpowergas', 'Sector_Energyresources',
       'Sector_Financialsexbanks', 'Sector_Foods', 'Sector_Itservicesothers',
       'Sector_Machinery', 'Sector_Pharmaceutical',
       'Sector_Rawmaterialschemicals', 'Sector_Realestate',
       'Sector_Retailtrade', 'Sector_Steelnonferrousmetals',
       'Sector_Transportationlogistics'],
      dtype='object')


In [25]:
import time
import lightgbm as lgb
from sklearn.model_selection import cross_val_score, KFold, TimeSeriesSplit, GroupKFold, StratifiedKFold
from scipy import stats

start = time.time()
params_lgb = {'learning_rate': 0.005,'metric':'None','objective': 'regression','boosting': 'gbdt','verbosity': 0,'n_jobs': -1,'force_col_wise':True}  
features = [c for c in train.columns if c not in ['SecuritiesCode','Date', 'Target, Drawdown']]
train_dataset = lgb.Dataset(train[features],train["Target"],feature_name = features )
val_dataset = lgb.Dataset(val[features], val["Target"],feature_name = features)

model = lgb.train(params = params_lgb, 
                train_set = train_dataset, 
                valid_sets = [train_dataset, val_dataset], 
                num_boost_round = 3000, 
                feval=feval_pearsonr,
                callbacks=[ lgb.early_stopping(stopping_rounds=300, verbose=True)])    

elapsed_time = time.time() - start
print ("elapsed_time:{0}".format(elapsed_time) + "[sec]")

[1]	training's pearsonr: 0.513384	valid_1's pearsonr: 0.565332
Training until validation scores don't improve for 300 rounds
[2]	training's pearsonr: 0.515574	valid_1's pearsonr: 0.568714
[3]	training's pearsonr: 0.519629	valid_1's pearsonr: 0.570208
[4]	training's pearsonr: 0.520023	valid_1's pearsonr: 0.570741
[5]	training's pearsonr: 0.521709	valid_1's pearsonr: 0.577062
[6]	training's pearsonr: 0.521785	valid_1's pearsonr: 0.576117
[7]	training's pearsonr: 0.523109	valid_1's pearsonr: 0.578298
[8]	training's pearsonr: 0.523253	valid_1's pearsonr: 0.577967
[9]	training's pearsonr: 0.524404	valid_1's pearsonr: 0.579105
[10]	training's pearsonr: 0.524577	valid_1's pearsonr: 0.579151
[11]	training's pearsonr: 0.525894	valid_1's pearsonr: 0.581651
[12]	training's pearsonr: 0.527633	valid_1's pearsonr: 0.583579
[13]	training's pearsonr: 0.527782	valid_1's pearsonr: 0.583466
[14]	training's pearsonr: 0.52893	valid_1's pearsonr: 0.584853
[15]	training's pearsonr: 0.529257	valid_1's pearson

In [26]:
from sklearn.metrics import mean_squared_error
#make prediction with our lightgbm model on the test set
preds = model.predict(test[features])

#compute the mse of the set on the test test
mse = mean_squared_error(test["Target"], preds)
print(mse)

0.00021150726905210475


In [28]:
from sklearn import tree

In [29]:
#train a tree model using tree.DecisionTreeRegressor without variables 'SecuritiesCode','Date', 'Target'
#merge train and val for this model
train = pd.concat([train, val])
clf = tree.DecisionTreeRegressor()
clf = clf.fit(train[features], train["Target"])

#make prediction with our tree model on the test set
preds = clf.predict(test[features])
#compute the mse of the set on the test test
mse = mean_squared_error(test["Target"], preds)
print(mse)

0.00039388830000341854
