In [1]:
# 파이썬 ≥3.5 필수
import sys
assert sys.version_info >= (3, 5)

# 사이킷런 ≥0.20 필수
import sklearn
assert sklearn.__version__ >= "0.20"

# 공통 모듈 임포트
import numpy as np
import os

# 깔금한 그래프 출력을 위해
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

import arrow
import pandas as pd
import pickle
from IPython.display import display
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer, OneHotEncoder, StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from tpot import TPOTRegressor

from bigquery_worker import BigqueryWorker



In [2]:
bigquery_worker = BigqueryWorker()
try:
    with open('datasets/df1.pickle', 'rb') as f:
        df1 = pickle.load(f)
except:
  df1 = bigquery_worker.get_daily_info_all('daily_items_indicator_info_all').drop_duplicates(['itemname', 'date'], 'first')
try:
    with open('datasets/df2.pickle', 'rb') as f:
        df2 = pickle.load(f)
except:
  df2 = bigquery_worker.get_daily_info_all('daily_items_info_all', start_date='2015-09-01').drop_duplicates(['itemname', 'date'], 'first')
df3 = bigquery_worker.get_itemcodes_info()
df4 = pd.concat([
    bigquery_worker.get_daily_item_info(('kospi', '코스피', 'kospi')),
    bigquery_worker.get_daily_item_info(('kosdaq', '코스닥', 'kosdaq')),
])

display(df1, df2, df3, df4)

Unnamed: 0,itemname,date,PER,PBR,PCR,PSR,ROE,ROA,OPROA,OPROE,EV_EBITDA,GP_A,NOS
0,네패스아크,2020-11-17,16.624,9.985,13.267,4.869,,,45.815,230.38,1.7,0.614,243268200.0
1,네패스아크,2020-11-18,15.357,9.223,12.256,4.497,,,45.815,230.38,1.7,0.614,224724500.0
2,네패스아크,2020-11-19,16.387,9.843,13.079,4.799,,,45.815,230.38,1.7,0.614,239810900.0
3,네패스아크,2020-11-20,17.998,10.810,14.364,5.271,,,45.815,230.38,1.7,0.614,263383400.0
4,아난티,2020-11-10,-14.982,2.050,-34.245,16.323,-13.52,-4.91,-2.524,-6.78,39.9,0.024,711916670.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2163145,와이오엠,2016-08-22,-6.012,4.002,,,-67.41,-27.73,0.000,0.00,,0.000,59248400.0
2163146,슈프리마,2016-07-01,0.000,2.552,,,,,0.000,0.00,,0.000,196824000.0
2163147,아이에이네트웍스,2016-08-01,-91.524,4.665,,,-7.51,-1.28,0.000,0.00,,0.000,35637724.0
2163148,와이오엠,2016-09-02,-6.012,4.002,,,-67.41,-27.73,0.000,0.00,,0.000,59248400.0


Unnamed: 0,itemname,date,open,high,low,close,volume
0,WI,2017-09-21,1104,1145,1024,1024,238155
1,WI,2019-07-03,1784,1814,1780,1792,57206
2,WI,2020-02-05,1820,1840,1780,1792,50494
3,WI,2016-03-07,2800,2908,2779,2816,690287
4,WI,2019-01-02,1849,1849,1754,1793,27881
...,...,...,...,...,...,...,...
2557905,피엔케이피부임상연구센타,2020-10-30,21800,23150,21750,22250,126879
2557906,피엔케이피부임상연구센타,2020-09-22,23500,23550,22150,22250,151167
2557907,피엔케이피부임상연구센타,2020-10-29,21950,22350,21300,21750,68232
2557908,피엔케이피부임상연구센타,2020-09-21,25400,25850,23400,23800,161173


Unnamed: 0,itemcode,itemname,market,sector,main_product,listing_date,settlement_month,representative_name,homepage,area
0,950140,잉글우드랩,kosdaq,기타 화학제품 제조업,기초화장품(화장품 제조),2016-10-14,12월,조현철,http://www.englewoodlab.com,미국
1,900100,뉴프라이드,kosdaq,자동차 부품 및 내장품 판매업,인터모달 신생 및 재생타이어,2010-04-21,12월,John Lee,http://www.npcims.com,미국
2,950200,소마젠,kosdaq,"그외 기타 전문, 과학 및 기술 서비스업","유전체 분석 서비스 (NGS, CES 등)",2020-07-13,12월,Ryan W. Kim (김운봉),,미국
3,950160,코오롱티슈진,kosdaq,기초 의약물질 및 생물학적 제제 제조업,골관절염 치료제,2017-11-06,12월,"노문종, 한성수",http://tissuegene.com,미국
4,950130,엑세스바이오,kosdaq,의료용품 및 기타 의약 관련제품 제조업,"말라리아 진단키트(RDT), HIV 진단키트(RDT)",2013-05-30,12월,최영호,http://www.accessbio.net,미국
...,...,...,...,...,...,...,...,...,...,...
2243,006220,제주은행,kospi,은행 및 저축기관,"은행업무,외국환업무,신탁업무",1972-12-28,12월,서현주,http://www.e-jejubank.com,제주특별자치도
2244,089590,제주항공,kospi,항공 여객 운송업,"여객운송서비스, 화물운송서비스",2015-11-06,12월,김이배,http://www.jejuair.net,제주특별자치도
2245,066110,한프,kosdaq,컴퓨터 및 주변장치 제조업,OPC Drum,2002-07-16,12월,유한성,http://www.baiksanopc.co.kr,제주특별자치도
2246,032350,롯데관광개발,kospi,여행사 및 기타 여행보조 서비스업,"국내외여행알선,관광개발,전세운수,항공권매매대행",2006-06-08,12월,"김기병, 백현, 김한준",http://lottetour.com,제주특별자치도


Unnamed: 0_level_0,itemname,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2000-01-04,kospi,1028,1066,1016,1059,195899000
2000-01-05,kospi,1006,1047,984,986,257696000
2000-01-06,kospi,1013,1014,953,960,203524000
2000-01-07,kospi,949,970,930,948,215664000
2000-01-10,kospi,979,994,965,987,240175000
...,...,...,...,...,...,...
2020-11-16,kosdaq,843,849,841,847,1619880000
2020-11-17,kosdaq,846,846,836,839,1789493000
2020-11-18,kosdaq,842,852,840,851,2330367000
2020-11-19,kosdaq,851,860,849,859,2249201000


In [3]:
def add_dm_dmv_columns(origin_df):
    df = origin_df.copy()
    nums = [5, 10, 20, 50, 120, 300]
    for n in nums:
        gdf = df.sort_values(['itemname', 'date']).groupby('itemname')
        df[f'{n}dm'] = gdf['close'].rolling(n).mean().reset_index(0, drop=True)
        df[f'{n}dmv'] = gdf['volume'].rolling(n).mean().reset_index(0, drop=True)
    for i, x in enumerate(nums):
        for y in nums[i + 1:]:
            df[f'{x}dm/{y}dm'] = df[f'{x}dm'] / df[f'{y}dm']
            df[f'{x}dmv/{y}dmv'] = df[f'{x}dmv'] / df[f'{y}dmv']
    df = df.drop([f'{n}dm' for n in nums] + [f'{n}dmv' for n in nums], axis=1)
    return df
        

dm_dmv_adder = FunctionTransformer(add_dm_dmv_columns)
df_1, df_2, df_3, df_4 = df1.copy(), dm_dmv_adder.fit_transform(df2), df3.copy(), dm_dmv_adder.fit_transform(df4)

In [33]:
def make_df(start_date='2017-01-02', end_date='2020-08-23', ror_months=3, move_days=30):
    date = arrow.get(start_date)
    df = pd.DataFrame()
    while date < arrow.get(end_date):        
        date2 = df_1[df_1.date >= date.shift(months=ror_months).format('YYYY-MM-DD')].date.min()
        date_str = date.format('YYYY-MM-DD')
        temp = df_1[df_1.date == date_str]
        temp2 = df_2[df_2.date == date_str].drop('date', axis=1)
        temp = temp.merge(temp2, how='inner', left_on='itemname', right_on='itemname').set_index('itemname')
        temp['ror'] = df_2[df_2.date == date2].set_index('itemname').close / temp.close
        temp[['market', 'sector', 'area']] = df_3.set_index('itemname')[['market', 'sector', 'area']]
        temp = temp.drop(['open', 'high', 'low', 'close', 'volume'], axis=1).reset_index()
        temp4 = df_4.loc[date_str].reset_index().drop(['open', 'high', 'low', 'close', 'volume'], axis=1)
        temp = temp.merge(
            temp4.rename(columns={'itemname': 'market'}), how='inner', left_on=['market', 'date'], right_on=['market', 'date'],
            suffixes=('_item', '_market')
        )
        df = df.append(temp, ignore_index=True)
        date = arrow.get(df1[df1.date >= date.shift(days=move_days).format('YYYY-MM-DD')].date.min())
    df = df.dropna()
    df = df.reset_index(drop=True)
    df['month'] = df['date'].dt.month.astype(str)
    return df

In [36]:
def split_fit(df, model):
    train_set = df[df.date < '2020-01-01'].set_index(['itemname', 'date'])
    test_set = df[df.date > '2020-01-01'][df.sector.isin(train_set.sector.unique())].set_index(['itemname', 'date'])
    X_train, X_test, y_train, y_test = train_set.drop(['ror'], axis=1), test_set.drop(['ror'], axis=1),\
        train_set['ror'].copy(), test_set['ror'].copy()
    cat_features = ["market", "sector", "area", "month"]
    num_features = list(set(X_train.columns) - set(cat_features))
    num_pipe = make_pipeline(StandardScaler(), Normalizer())
    pre_pipe = make_column_transformer((num_pipe, num_features), (OneHotEncoder(), cat_features))
    full_pipe = make_pipeline(pre_pipe, model)
    if isinstance(model, TPOTRegressor):
        X_train = pre_pipe.fit_transform(X_train).toarray()
        X_test = pre_pipe.transform(X_test).toarray()
        model.fit(X_train, y_train.to_numpy())
        return model, X_test, y_test.to_numpy()
    full_pipe.fit(X_train, y_train)
    return full_pipe, X_test, y_test

In [13]:
df = make_df(ror_months=1)
pipe, X_test, y_test = split_fit(df, model=LGBMRegressor(random_state=42))
predictions = pipe.predict(X_test)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
display(ror_months, rmse)

  after removing the cwd from sys.path.


9

0.25881082134696054

In [39]:
for ror_months in [1, 3, 6, 9]:
    df = make_df(ror_months=ror_months)
    pipe, X_test, y_test = split_fit(df, model=LGBMRegressor(random_state=42))
    predictions = pipe.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    display(ror_months, rmse)

  after removing the cwd from sys.path.


1

0.21134151153515138

  after removing the cwd from sys.path.


3

0.41201900256276747

  after removing the cwd from sys.path.


6

0.7616103734583922

  after removing the cwd from sys.path.


9

0.8589417582403703

In [154]:
regressor_config_dict=dict()
regressor_config_dict['lightgbm.LGBMRegressor'] = {
    'boosting_type': ['gbdt', 'dart'],
    'learning_rate': [0.01, 0.05, 0.1],
}
regressor_config_dict['xgboost.XGBRegressor'] = {
    'n_estimators': [100],
    'max_depth': range(1, 11),
    'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.],
    'subsample': np.arange(0.05, 1.01, 0.05),
    'min_child_weight': range(1, 21),
    'nthread': [1],
    'objective': ['reg:squarederror']
}

In [31]:
df

Unnamed: 0,itemname,date,PER,PBR,PCR,PSR,ROE,ROA,OPROA,OPROE,...,20dmv/120dmv_market,20dm/300dm_market,20dmv/300dmv_market,50dm/120dm_market,50dmv/120dmv_market,50dm/300dm_market,50dmv/300dmv_market,120dm/300dm_market,120dmv/300dmv_market,month
0,이화전기,2017-01-02,-27.895,1.963,-33.125,2.585,-7.49,-5.01,0.978,1.357,...,0.834743,1.018601,0.738856,0.991538,0.850734,1.007705,0.753011,1.016305,0.885131,1
1,코콤,2017-01-02,23.908,1.779,20.607,1.516,7.89,6.34,7.470,9.362,...,0.834743,1.018601,0.738856,0.991538,0.850734,1.007705,0.753011,1.016305,0.885131,1
2,플랜티넷,2017-01-02,24.178,0.833,14.205,2.275,3.75,3.73,5.060,5.497,...,0.834743,1.018601,0.738856,0.991538,0.850734,1.007705,0.753011,1.016305,0.885131,1
3,라이브플렉스,2017-01-02,3.114,0.987,3.025,1.972,38.07,26.79,2.091,3.068,...,0.834743,1.018601,0.738856,0.991538,0.850734,1.007705,0.753011,1.016305,0.885131,1
4,마이크로컨텍솔,2017-01-02,-348.000,1.425,55.680,2.125,-0.41,-0.54,-1.342,-1.463,...,0.834743,1.018601,0.738856,0.991538,0.850734,1.007705,0.753011,1.016305,0.885131,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304487,한네트,2020-08-18,0.000,1.144,6.622,1.546,3.19,1.73,0.000,0.000,...,0.971428,1.113518,1.259182,1.091686,0.956106,1.068877,1.239323,0.979107,1.296218,8
304488,에이디칩스,2020-08-18,0.000,0.962,-1.780,1.171,-46.50,-30.37,0.000,0.000,...,0.971428,1.113518,1.259182,1.091686,0.956106,1.068877,1.239323,0.979107,1.296218,8
304489,케이씨에스,2020-08-18,0.000,2.338,15.208,1.224,14.10,8.49,0.000,0.000,...,0.971428,1.113518,1.259182,1.091686,0.956106,1.068877,1.239323,0.979107,1.296218,8
304490,흥구석유,2020-08-18,0.000,1.203,10.613,0.620,11.25,10.44,0.000,0.000,...,0.971428,1.113518,1.259182,1.091686,0.956106,1.068877,1.239323,0.979107,1.296218,8


In [12]:
for ror_months in range(3, 10, 3):
    df = make_df(ror_months=ror_months)
    for model in [LGBMRegressor(random_state=42), XGBRegressor(random_state=42), MLPRegressor(random_state=42)]:
        pipe, X_test, y_test = split_fit(df, model=model)
        predictions = pipe.predict(X_test)
        mse = mean_squared_error(y_test, predictions)
        rmse = np.sqrt(mse)
        display(ror_months, model, rmse)

  after removing the cwd from sys.path.


3

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=42, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

0.4317755836038961

  after removing the cwd from sys.path.
  if getattr(data, 'base', None) is not None and \




3

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

0.4312989001418568

  after removing the cwd from sys.path.


3

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(100,), learning_rate='constant',
             learning_rate_init=0.001, max_fun=15000, max_iter=200,
             momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
             power_t=0.5, random_state=42, shuffle=True, solver='adam',
             tol=0.0001, validation_fraction=0.1, verbose=False,
             warm_start=False)

0.49122526437816194

  after removing the cwd from sys.path.


6

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=42, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

0.80700503933476

  after removing the cwd from sys.path.
  if getattr(data, 'base', None) is not None and \




6

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

0.818661092668568

  after removing the cwd from sys.path.


6

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(100,), learning_rate='constant',
             learning_rate_init=0.001, max_fun=15000, max_iter=200,
             momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
             power_t=0.5, random_state=42, shuffle=True, solver='adam',
             tol=0.0001, validation_fraction=0.1, verbose=False,
             warm_start=False)

0.8389800767565649

  after removing the cwd from sys.path.


9

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=42, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

0.8003529275802942

  after removing the cwd from sys.path.
  if getattr(data, 'base', None) is not None and \




9

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

0.8205824849189886

  after removing the cwd from sys.path.


9

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(100,), learning_rate='constant',
             learning_rate_init=0.001, max_fun=15000, max_iter=200,
             momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
             power_t=0.5, random_state=42, shuffle=True, solver='adam',
             tol=0.0001, validation_fraction=0.1, verbose=False,
             warm_start=False)

0.8430186137580383

In [30]:
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit()
for train_index, test_index in tscv.split(df.set_index('date')):
    print("TRAIN:", train_index, "TEST:", test_index)

TRAIN: [    0     1     2 ... 50749 50750 50751] TEST: [ 50752  50753  50754 ... 101497 101498 101499]
TRAIN: [     0      1      2 ... 101497 101498 101499] TEST: [101500 101501 101502 ... 152245 152246 152247]
TRAIN: [     0      1      2 ... 152245 152246 152247] TEST: [152248 152249 152250 ... 202993 202994 202995]
TRAIN: [     0      1      2 ... 202993 202994 202995] TEST: [202996 202997 202998 ... 253741 253742 253743]
TRAIN: [     0      1      2 ... 253741 253742 253743] TEST: [253744 253745 253746 ... 304489 304490 304491]


In [32]:
TPOTRegressor?

In [67]:
model = TPOTRegressor(
    generations=3, population_size=10, verbosity=3, random_state=42, cv=TimeSeriesSplit(n_splits=3), n_jobs=-1, memory='auto'
)
df = make_df(ror_months=3)
model, X_test, y_test = split_fit(df, model=model)
predictions = model.predict(X_test)
display(np.sqrt(-model.score(X_test, y_test)))
display(ror_months, np.sqrt(mean_squared_error(y_test, predictions)))

  after removing the cwd from sys.path.


30 operators have been imported by TPOT.


Version 0.11.6.post1 of tpot is outdated. Version 0.11.6.post2 was released 4 days ago.


HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=40.0, style=ProgressStyle(des…

Skipped pipeline #1 due to time out. Continuing to the next pipeline.
Skipped pipeline #7 due to time out. Continuing to the next pipeline.
Skipped pipeline #9 due to time out. Continuing to the next pipeline.
Skipped pipeline #13 due to time out. Continuing to the next pipeline.
_pre_test decorator: _random_mutation_operator: num_test=0 Found array with 0 feature(s) (shape=(50, 0)) while a minimum of 1 is required..
Skipped pipeline #20 due to time out. Continuing to the next pipeline.
Skipped pipeline #22 due to time out. Continuing to the next pipeline.
Skipped pipeline #24 due to time out. Continuing to the next pipeline.
Skipped pipeline #26 due to time out. Continuing to the next pipeline.
Skipped pipeline #28 due to time out. Continuing to the next pipeline.

Generation 1 - Current Pareto front scores:

-1	-0.07141961166659098	RandomForestRegressor(input_matrix, RandomForestRegressor__bootstrap=True, RandomForestRegressor__max_features=0.45, RandomForestRegressor__min_samples_le

0.3992670920731064

1

0.3992670920731064

In [68]:
model.fitted_pipeline_

Pipeline(memory=Memory(location=C:\Users\gsr27\AppData\Local\Temp\tmpg_k61gvb\joblib),
         steps=[('stackingestimator',
                 StackingEstimator(estimator=DecisionTreeRegressor(ccp_alpha=0.0,
                                                                   criterion='mse',
                                                                   max_depth=6,
                                                                   max_features=None,
                                                                   max_leaf_nodes=None,
                                                                   min_impurity_decrease=0.0,
                                                                   min_impurity_split=None,
                                                                   min_samples_leaf=13,
                                                                   min_samples_split=10,
                                                                   min_weight_frac...
      

In [69]:
model.pareto_front_fitted_pipelines_

{'ElasticNetCV(input_matrix, ElasticNetCV__l1_ratio=0.05, ElasticNetCV__tol=0.1)': Pipeline(memory=Memory(location=C:\Users\gsr27\AppData\Local\Temp\tmpg_k61gvb\joblib),
          steps=[('elasticnetcv',
                  ElasticNetCV(alphas=None, copy_X=True, cv=None, eps=0.001,
                               fit_intercept=True, l1_ratio=0.05, max_iter=1000,
                               n_alphas=100, n_jobs=None, normalize=False,
                               positive=False, precompute='auto',
                               random_state=42, selection='cyclic', tol=0.1,
                               verbose=0))],
          verbose=False),
 'RandomForestRegressor(DecisionTreeRegressor(input_matrix, DecisionTreeRegressor__max_depth=6, DecisionTreeRegressor__min_samples_leaf=13, DecisionTreeRegressor__min_samples_split=10), RandomForestRegressor__bootstrap=True, RandomForestRegressor__max_features=0.05, RandomForestRegressor__min_samples_leaf=7, RandomForestRegressor__min_samples_spli

In [70]:
model.evaluated_individuals_

{'RandomForestRegressor(input_matrix, RandomForestRegressor__bootstrap=True, RandomForestRegressor__max_features=0.7500000000000001, RandomForestRegressor__min_samples_leaf=11, RandomForestRegressor__min_samples_split=9, RandomForestRegressor__n_estimators=100)': {'generation': 0,
  'mutation_count': 0,
  'crossover_count': 0,
  'predecessor': ('ROOT',),
  'operator_count': 1,
  'internal_cv_score': -inf},
 'RandomForestRegressor(ElasticNetCV(input_matrix, ElasticNetCV__l1_ratio=0.75, ElasticNetCV__tol=0.01), RandomForestRegressor__bootstrap=True, RandomForestRegressor__max_features=0.4, RandomForestRegressor__min_samples_leaf=16, RandomForestRegressor__min_samples_split=14, RandomForestRegressor__n_estimators=100)': {'generation': 0,
  'mutation_count': 0,
  'crossover_count': 0,
  'predecessor': ('ROOT',),
  'operator_count': 2,
  'internal_cv_score': -0.07256558824675415},
 'AdaBoostRegressor(input_matrix, AdaBoostRegressor__learning_rate=1.0, AdaBoostRegressor__loss=exponential, Ad

In [71]:
model.export('tpot_pipeline_20201205_3m_1.py')

In [72]:
ror_months = 3
df = make_df(ror_months=ror_months)
date = '2020-11-20'
temp = df_2[df_2.date == date].drop(['date', 'open', 'high', 'low', 'close', 'volume'], axis=1)
pdf = df_1[df_1.date == date].merge(temp, how='inner', left_on='itemname', right_on='itemname').set_index('itemname')
pdf[['market', 'sector', 'area']] = df_3.set_index('itemname')[['market', 'sector', 'area']]
temp4 = df_4.loc[date].reset_index(drop=True).drop(['open', 'high', 'low', 'close', 'volume'], axis=1).rename(
    columns={'itemname': 'market'})
pdf = pdf.reset_index().merge(temp4, how='inner', left_on='market', right_on='market', suffixes=('_item', '_market'))
pdf['month'] = pdf['date'].dt.month.astype(str)
pdf = pdf[pdf.sector.isin(df.sector.unique())].dropna().set_index(['itemname', 'date'])
predictions = model.predict(pdf)
pdf['pred_ror'] = predictions
pdf['pred_ror_rank'] = pdf['pred_ror'].rank(ascending=False)
pdf = pdf.sort_values('pred_ror_rank')
display(ror_months, model, pdf.head(30))

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [78]:
train_set = df[df.date < '2020-01-01'].set_index(['itemname', 'date'])
test_set = df[df.date > '2020-01-01'][df.sector.isin(train_set.sector.unique())].set_index(['itemname', 'date'])
X_train, X_test, y_train, y_test = train_set.drop(['ror'], axis=1), test_set.drop(['ror'], axis=1),\
    train_set['ror'].copy(), test_set['ror'].copy()
cat_features = ["market", "sector", "area", "month"]
num_features = list(set(X_train.columns) - set(cat_features))
num_pipe = make_pipeline(StandardScaler(), Normalizer())
pre_pipe = make_column_transformer((num_pipe, num_features), (OneHotEncoder(), cat_features))
pre_pipe.fit(X_train)
pdf = pdf[pdf.sector.isin(X_train.sector.unique())]
pdf_trans = pre_pipe.transform(pdf)
predictions = model.predict(pdf_trans.toarray())
pdf['pred_ror'] = predictions
pdf['pred_ror_rank'] = pdf['pred_ror'].rank(ascending=False)
pdf = pdf.sort_values('pred_ror_rank')
display(ror_months, model, pdf.head(30))

  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


3

TPOTRegressor(config_dict=None, crossover_rate=0.1,
              cv=TimeSeriesSplit(max_train_size=None, n_splits=3),
              disable_update_check=False, early_stop=None, generations=3,
              log_file=None, max_eval_time_mins=5, max_time_mins=None,
              memory='auto', mutation_rate=0.9, n_jobs=-1, offspring_size=None,
              periodic_checkpoint_folder=None, population_size=10,
              random_state=42, scoring=None, subsample=1.0, template=None,
              use_dask=False, verbosity=3, warm_start=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,PER,PBR,PCR,PSR,ROE,ROA,OPROA,OPROE,EV_EBITDA,GP_A,...,20dmv/300dmv_market,50dm/120dm_market,50dmv/120dmv_market,50dm/300dm_market,50dmv/300dmv_market,120dm/300dm_market,120dmv/300dmv_market,month,pred_ror,pred_ror_rank
itemname,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
동양,2020-11-20,12.05,0.305,25.104,1.0,3.0,2.35,2.009,2.514,46.69,0.084,...,1.363742,1.038941,1.155395,1.190983,1.416447,1.146343,1.225942,11,1.191247,1.0
성신양회,2020-11-20,2.246,0.441,7.767,0.551,20.17,7.66,9.247,24.042,8.39,0.199,...,1.363742,1.038941,1.155395,1.190983,1.416447,1.146343,1.225942,11,1.190868,2.0
세이브존I&C,2020-11-20,-245.0,0.261,10.929,1.191,-0.11,-0.09,1.567,1.933,3.85,0.114,...,1.363742,1.038941,1.155395,1.190983,1.416447,1.146343,1.225942,11,1.185098,3.0
부산산업,2020-11-20,37.916,2.249,24.59,1.627,5.99,4.24,4.655,7.398,12.25,0.168,...,1.363742,1.038941,1.155395,1.190983,1.416447,1.146343,1.225942,11,1.181707,4.0
아세아시멘트,2020-11-20,4.915,0.32,6.78,0.66,6.56,2.81,5.583,13.037,5.59,0.109,...,1.363742,1.038941,1.155395,1.190983,1.416447,1.146343,1.225942,11,1.149629,5.0
한일현대시멘트,2020-11-20,0.0,2.125,7.7,1.131,21.69,7.45,0.0,0.0,10.29,0.151,...,1.363742,1.038941,1.155395,1.190983,1.416447,1.146343,1.225942,11,1.137207,6.0
텔코웨어,2020-11-20,15.228,0.82,113.208,5.553,7.14,6.64,5.19,5.545,29.09,0.138,...,1.363742,1.038941,1.155395,1.190983,1.416447,1.146343,1.225942,11,1.127534,7.0
SH에너지화학,2020-11-20,-34.375,0.994,-14.224,1.384,-2.82,-2.19,-3.041,-3.956,28.03,0.041,...,1.363742,1.038941,1.155395,1.190983,1.416447,1.146343,1.225942,11,1.123362,8.0
하이스틸,2020-11-20,17.12,0.251,15.734,0.279,1.47,0.77,1.515,2.866,13.03,0.052,...,1.363742,1.038941,1.155395,1.190983,1.416447,1.146343,1.225942,11,1.122063,9.0
SBS미디어홀딩스,2020-11-20,-223.75,0.423,-99.444,3.653,-0.22,-0.27,-1.217,-1.365,27.4,0.042,...,1.363742,1.038941,1.155395,1.190983,1.416447,1.146343,1.225942,11,1.121989,10.0


In [116]:
model = TPOTRegressor(
    generations=3, population_size=10, verbosity=3, random_state=42, cv=TimeSeriesSplit(n_splits=3), n_jobs=-1, memory='auto'
)
df = make_df(ror_months=9)
model, X_test, y_test = split_fit(df, model=model)
predictions = model.predict(X_test)
display(np.sqrt(-model.score(X_test, y_test)))
display(ror_months, np.sqrt(mean_squared_error(y_test, predictions)))

  after removing the cwd from sys.path.


30 operators have been imported by TPOT.


Version 0.11.6.post1 of tpot is outdated. Version 0.11.6.post2 was released 4 days ago.


HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=40.0, style=ProgressStyle(des…

Skipped pipeline #1 due to time out. Continuing to the next pipeline.
Skipped pipeline #7 due to time out. Continuing to the next pipeline.
Skipped pipeline #9 due to time out. Continuing to the next pipeline.
Skipped pipeline #13 due to time out. Continuing to the next pipeline.
_pre_test decorator: _random_mutation_operator: num_test=0 Found array with 0 feature(s) (shape=(50, 0)) while a minimum of 1 is required..
Skipped pipeline #18 due to time out. Continuing to the next pipeline.
Skipped pipeline #21 due to time out. Continuing to the next pipeline.
Skipped pipeline #23 due to time out. Continuing to the next pipeline.
Skipped pipeline #25 due to time out. Continuing to the next pipeline.
Skipped pipeline #27 due to time out. Continuing to the next pipeline.
Skipped pipeline #29 due to time out. Continuing to the next pipeline.

Generation 1 - Current Pareto front scores:

-1	-0.25696149058112155	RandomForestRegressor(input_matrix, RandomForestRegressor__bootstrap=True, RandomFo

0.8652586105369425

3

0.8652586105369425

In [117]:
model.fitted_pipeline_

Pipeline(memory=Memory(location=C:\Users\gsr27\AppData\Local\Temp\tmpxng60pkj\joblib),
         steps=[('stackingestimator',
                 StackingEstimator(estimator=GradientBoostingRegressor(alpha=0.85,
                                                                       ccp_alpha=0.0,
                                                                       criterion='friedman_mse',
                                                                       init=None,
                                                                       learning_rate=0.5,
                                                                       loss='lad',
                                                                       max_depth=1,
                                                                       max_features=0.5,
                                                                       max_leaf_nodes=None,
                                                                       min_impurity_decre

In [118]:
model.pareto_front_fitted_pipelines_

{'ElasticNetCV(input_matrix, ElasticNetCV__l1_ratio=0.05, ElasticNetCV__tol=0.1)': Pipeline(memory=Memory(location=C:\Users\gsr27\AppData\Local\Temp\tmpxng60pkj\joblib),
          steps=[('elasticnetcv',
                  ElasticNetCV(alphas=None, copy_X=True, cv=None, eps=0.001,
                               fit_intercept=True, l1_ratio=0.05, max_iter=1000,
                               n_alphas=100, n_jobs=None, normalize=False,
                               positive=False, precompute='auto',
                               random_state=42, selection='cyclic', tol=0.1,
                               verbose=0))],
          verbose=False),
 'LinearSVR(Nystroem(input_matrix, Nystroem__gamma=0.30000000000000004, Nystroem__kernel=polynomial, Nystroem__n_components=2), LinearSVR__C=1.0, LinearSVR__dual=True, LinearSVR__epsilon=0.1, LinearSVR__loss=epsilon_insensitive, LinearSVR__tol=0.1)': Pipeline(memory=Memory(location=C:\Users\gsr27\AppData\Local\Temp\tmpxng60pkj\joblib),
          s

In [119]:
model.evaluated_individuals_

{'RandomForestRegressor(input_matrix, RandomForestRegressor__bootstrap=True, RandomForestRegressor__max_features=0.7500000000000001, RandomForestRegressor__min_samples_leaf=11, RandomForestRegressor__min_samples_split=9, RandomForestRegressor__n_estimators=100)': {'generation': 0,
  'mutation_count': 0,
  'crossover_count': 0,
  'predecessor': ('ROOT',),
  'operator_count': 1,
  'internal_cv_score': -inf},
 'RandomForestRegressor(ElasticNetCV(input_matrix, ElasticNetCV__l1_ratio=0.75, ElasticNetCV__tol=0.01), RandomForestRegressor__bootstrap=True, RandomForestRegressor__max_features=0.4, RandomForestRegressor__min_samples_leaf=16, RandomForestRegressor__min_samples_split=14, RandomForestRegressor__n_estimators=100)': {'generation': 0,
  'mutation_count': 0,
  'crossover_count': 0,
  'predecessor': ('ROOT',),
  'operator_count': 2,
  'internal_cv_score': -0.2539304188346559},
 'AdaBoostRegressor(input_matrix, AdaBoostRegressor__learning_rate=1.0, AdaBoostRegressor__loss=exponential, Ada

In [120]:
model.export('tpot_pipeline_20201205_9m_1.py')

In [122]:
ror_months = 9
df = make_df(ror_months=ror_months)
date = '2020-11-20'
temp = df_2[df_2.date == date].drop(['date', 'open', 'high', 'low', 'close', 'volume'], axis=1)
pdf = df_1[df_1.date == date].merge(temp, how='inner', left_on='itemname', right_on='itemname').set_index('itemname')
pdf[['market', 'sector', 'area']] = df_3.set_index('itemname')[['market', 'sector', 'area']]
temp4 = df_4.loc[date].reset_index(drop=True).drop(['open', 'high', 'low', 'close', 'volume'], axis=1).rename(
    columns={'itemname': 'market'})
pdf = pdf.reset_index().merge(temp4, how='inner', left_on='market', right_on='market', suffixes=('_item', '_market'))
pdf['month'] = pdf['date'].dt.month.astype(str)

train_set = df[df.date < '2020-01-01'].set_index(['itemname', 'date'])
test_set = df[df.date > '2020-01-01'][df.sector.isin(train_set.sector.unique())].set_index(['itemname', 'date'])
X_train, X_test, y_train, y_test = train_set.drop(['ror'], axis=1), test_set.drop(['ror'], axis=1),\
    train_set['ror'].copy(), test_set['ror'].copy()
pdf = pdf[pdf.sector.isin(X_train.sector.unique())].dropna().set_index(['itemname', 'date'])

cat_features = ["market", "sector", "area", "month"]
num_features = list(set(X_train.columns) - set(cat_features))
num_pipe = make_pipeline(StandardScaler(), Normalizer())
pre_pipe = make_column_transformer((num_pipe, num_features), (OneHotEncoder(), cat_features))
pre_pipe.fit(X_train)
pdf_trans = pre_pipe.transform(pdf)
predictions = model.predict(pdf_trans.toarray())
pdf['pred_ror'] = predictions
pdf['pred_ror_rank'] = pdf['pred_ror'].rank(ascending=False)
pdf = pdf.sort_values('pred_ror_rank')
display(ror_months, model, pdf.head(30))

  del sys.path[0]


9

TPOTRegressor(config_dict=None, crossover_rate=0.1,
              cv=TimeSeriesSplit(max_train_size=None, n_splits=3),
              disable_update_check=False, early_stop=None, generations=3,
              log_file=None, max_eval_time_mins=5, max_time_mins=None,
              memory='auto', mutation_rate=0.9, n_jobs=-1, offspring_size=None,
              periodic_checkpoint_folder=None, population_size=10,
              random_state=42, scoring=None, subsample=1.0, template=None,
              use_dask=False, verbosity=3, warm_start=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,PER,PBR,PCR,PSR,ROE,ROA,OPROA,OPROE,EV_EBITDA,GP_A,...,20dmv/300dmv_market,50dm/120dm_market,50dmv/120dmv_market,50dm/300dm_market,50dmv/300dmv_market,120dm/300dm_market,120dmv/300dmv_market,month,pred_ror,pred_ror_rank
itemname,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
메카로,2020-11-20,-61.48,0.849,28.897,3.488,-1.41,-2.87,-0.511,-0.573,11.61,0.151,...,1.292609,1.04052,1.009644,1.11655,1.198215,1.073069,1.18677,11,1.034708,1.0
삼호개발,2020-11-20,5.421,0.517,12.677,0.583,10.05,6.4,4.56,7.341,4.41,0.086,...,1.363742,1.038941,1.155395,1.190983,1.416447,1.146343,1.225942,11,1.034633,2.0
CSA 코스믹,2020-11-20,14.517,3.31,34.527,2.208,30.19,15.64,12.523,25.185,101.27,0.648,...,1.292609,1.04052,1.009644,1.11655,1.198215,1.073069,1.18677,11,1.034533,3.0
클리오,2020-11-20,24.581,1.873,23.373,2.571,8.29,5.12,5.295,8.196,11.99,0.471,...,1.292609,1.04052,1.009644,1.11655,1.198215,1.073069,1.18677,11,1.033548,4.0
KPX생명과학,2020-11-20,0.0,5.554,73.279,8.911,2.0,1.69,0.0,0.0,23.3,0.042,...,1.292609,1.04052,1.009644,1.11655,1.198215,1.073069,1.18677,11,1.033135,5.0
대웅제약,2020-11-20,116.304,1.997,43.496,1.416,1.82,1.02,1.766,3.548,23.24,0.329,...,1.363742,1.038941,1.155395,1.190983,1.416447,1.146343,1.225942,11,1.032892,6.0
한미약품,2020-11-20,-31.442,5.088,87.403,7.296,-0.168,-0.063,-0.065,-0.155,27.33,0.263,...,1.363742,1.038941,1.155395,1.190983,1.416447,1.146343,1.225942,11,1.03272,7.0
모토닉,2020-11-20,33.929,0.875,24.204,2.917,2.72,2.54,3.66,3.911,15.13,0.064,...,1.363742,1.038941,1.155395,1.190983,1.416447,1.146343,1.225942,11,1.031934,8.0
덕산테코피아,2020-11-20,24.609,1.882,20.455,5.264,10.78,9.11,7.199,7.807,18.2,0.168,...,1.292609,1.04052,1.009644,1.11655,1.198215,1.073069,1.18677,11,1.031822,9.0
잉크테크,2020-11-20,11.605,1.493,17.984,1.587,13.18,4.76,6.484,17.391,49.5,0.126,...,1.292609,1.04052,1.009644,1.11655,1.198215,1.073069,1.18677,11,1.031771,10.0


In [124]:
pipe, X_test, y_test = split_fit(df, model=LGBMRegressor(random_state=42))
predictions = pipe.predict(pdf)
pdf['pred_ror'] = predictions
pdf['pred_ror_rank'] = pdf['pred_ror'].rank(ascending=False)
pdf = pdf.sort_values('pred_ror_rank')
display(ror_months, model, pdf.head(30))

  after removing the cwd from sys.path.


9

TPOTRegressor(config_dict=None, crossover_rate=0.1,
              cv=TimeSeriesSplit(max_train_size=None, n_splits=3),
              disable_update_check=False, early_stop=None, generations=3,
              log_file=None, max_eval_time_mins=5, max_time_mins=None,
              memory='auto', mutation_rate=0.9, n_jobs=-1, offspring_size=None,
              periodic_checkpoint_folder=None, population_size=10,
              random_state=42, scoring=None, subsample=1.0, template=None,
              use_dask=False, verbosity=3, warm_start=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,PER,PBR,PCR,PSR,ROE,ROA,OPROA,OPROE,EV_EBITDA,GP_A,...,20dmv/300dmv_market,50dm/120dm_market,50dmv/120dmv_market,50dm/300dm_market,50dmv/300dmv_market,120dm/300dm_market,120dmv/300dmv_market,month,pred_ror,pred_ror_rank
itemname,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
광림,2020-11-20,-4.551,1.309,60.484,1.387,-27.77,-9.81,-0.494,-1.327,45.72,0.056,...,1.292609,1.04052,1.009644,1.11655,1.198215,1.073069,1.18677,11,2.400649,1.0
휴마시스,2020-11-20,10.57,8.766,39.181,16.032,102.58,68.38,89.069,124.294,213.69,0.972,...,1.292609,1.04052,1.009644,1.11655,1.198215,1.073069,1.18677,11,1.955893,2.0
씨젠,2020-11-20,9.111,11.736,16.882,8.08,157.86,93.82,104.924,181.889,24.67,1.27,...,1.292609,1.04052,1.009644,1.11655,1.198215,1.073069,1.18677,11,1.850951,3.0
바디텍메드,2020-11-20,11.326,5.961,20.235,7.389,60.1,49.38,56.647,72.593,11.45,0.789,...,1.292609,1.04052,1.009644,1.11655,1.198215,1.073069,1.18677,11,1.793592,4.0
대림제지,2020-11-20,-3.576,0.505,6.916,0.732,-13.87,-8.99,-7.323,-10.952,6.34,-0.112,...,1.292609,1.04052,1.009644,1.11655,1.198215,1.073069,1.18677,11,1.774036,5.0
파나진,2020-11-20,0.0,7.099,29.57,18.004,19.2,15.28,0.0,0.0,103.97,0.234,...,1.292609,1.04052,1.009644,1.11655,1.198215,1.073069,1.18677,11,1.77349,6.0
한스바이오메드,2020-11-20,231.25,1.671,33.549,2.327,0.76,0.68,6.553,10.76,16.2,0.328,...,1.292609,1.04052,1.009644,1.11655,1.198215,1.073069,1.18677,11,1.59319,7.0
국일제지,2020-11-20,-47.5,10.881,-1646.667,12.506,-23.66,-10.48,1.308,2.787,253.91,0.078,...,1.292609,1.04052,1.009644,1.11655,1.198215,1.073069,1.18677,11,1.585656,8.0
네패스,2020-11-20,-21.32,3.773,28.886,4.268,-17.44,-5.61,-0.041,-0.141,6.16,0.054,...,1.292609,1.04052,1.009644,1.11655,1.198215,1.073069,1.18677,11,1.56155,9.0
모나리자,2020-11-20,0.0,3.105,30.385,1.715,3.75,3.14,0.0,0.0,18.95,0.32,...,1.363742,1.038941,1.155395,1.190983,1.416447,1.146343,1.225942,11,1.541521,10.0
