In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2


In [3]:
import pandas as pd
import numpy as np
import datetime as dt
import itertools
import random
import os
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")
import catboost
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

# Seed 고정
seed_everything(4382)
np.random.seed(777)

In [121]:
# 데이터 로드
krs_Df = pd.read_csv('/content/drive/MyDrive/dacon/krs/train.csv')
sss = pd.read_csv('/content/drive/MyDrive/dacon/krs/sample_submission.csv')

# 함수 정의
def make_month(df):
    dt = df['일자'].astype('str')
    month_data = pd.to_datetime(dt)
    md = month_data.dt.month
    return md

def make_year(df):
    dt = df['일자'].astype('str')
    year_data = pd.to_datetime(dt)
    yd = year_data.dt.year
    return yd

def make_day(df):
    dt = df['일자'].astype('str')
    year_data = pd.to_datetime(dt)
    dd = year_data.dt.day
    return dd

def make_weekday(df):
    dt = df['일자'].astype('str')
    year_data = pd.to_datetime(dt)
    wd = year_data.dt.weekday
    return wd

def make_gap (df):
    df['gap_minmax'] = df['고가']-df['저가']
    df['gap_day'] = df['시가']-df['종가']

def Feature_Engineering(df,code_name):

    df = df[df['종목코드'] == code_name]
    make_gap (df)

    #df['month'] = make_month(df)
    #df['year'] = make_year(df)
    #df['day'] = make_day(df)
    #df['weekday'] = make_weekday(df)

    df['increase'] = 0
    df['increase2'] = 0

    df.loc[(df['gap_day'] < 0), 'increase'] = -1
    df.loc[(df['gap_day'] > 0), 'increase'] = 1

    df['전날거래량'] = df['거래량'].shift(1)
    df['전날거래량'].fillna(method='bfill',inplace=True)

    df['거래량차'] = df['거래량'] - df['전날거래량']
    df.loc[(df['거래량차'] < 0), 'increase2'] = -1
    df.loc[(df['거래량차'] > 0), 'increase2'] = 1

    df['target1'] = df['시가'].shift(-1)
    df['target1'].fillna(method='bfill',inplace=True)

    df['target2'] = df['고가'].shift(-1)
    df['target2'].fillna(method='bfill',inplace=True)

    df['target3'] = df['저가'].shift(-1)
    df['target3'].fillna(method='bfill',inplace=True)

    df['target4'] = df['종가'].shift(-1)
    df['target4'].fillna(method='bfill',inplace=True)

    df['target5'] = df['increase2'].shift(-1)
    df['target5'].fillna(method='bfill',inplace=True)

    df = df.drop(columns=['전날거래량','거래량','거래량차'])

    print("== finish ==")
    return df.reset_index(drop=True)

def make_shift(df, n_days):

    columns = [ '시가', '고가', '저가', '종가', 'gap_minmax',
                'gap_day','increase','increase2']
    for col in columns:
        for s in range(1,n_days+1):
            df['shift_{}_{}'.format(col,s)] = df[col].shift(s)

def Add_predicted_data_to_train(df):

    make_gap(df)

    df['increase'] = 0

    df.loc[(df['gap_day'] < 0), 'increase'] = -1
    df.loc[(df['gap_day'] > 0), 'increase'] = 1

    df['target1'] = df['시가'].shift(-1)
    df['target1'].fillna(method='bfill',inplace=True)

    df['target2'] = df['고가'].shift(-1)
    df['target2'].fillna(method='bfill',inplace=True)

    df['target3'] = df['저가'].shift(-1)
    df['target3'].fillna(method='bfill',inplace=True)

    df['target4'] = df['종가'].shift(-1)
    df['target4'].fillna(method='bfill',inplace=True)

    df['target5'] = df['increase2'].shift(-1)
    df['target5'].fillna(method='bfill',inplace=True)

    return df.reset_index(drop=True)

In [125]:
krs_Df = pd.read_csv('/content/drive/MyDrive/dacon/krs/train.csv')
stock_2000 = krs_Df['종목코드'].unique()

for code in tqdm(stock_2000[0:2]):
    tmp_df = Feature_Engineering(krs_Df,code)
    make_shift(tmp_df,7)

    train = tmp_df.iloc[7:-1].drop(columns=['일자','종목코드','종목명'])
    valid = tmp_df.iloc[-1:].drop(columns=['일자','종목코드','종목명'])

    train_x = train.drop(columns=['target1','target2','target3','target4','target5'])
    train_y = train[['target1','target2','target3','target4','target5']]
    valid_x = valid.drop(columns=['target1','target2','target3','target4','target5'])

    target1 = catboost.CatBoostRegressor(random_state=138, verbose=500, iterations=500)
    target1.fit(train_x, train_y['target1'])

    target2 = catboost.CatBoostRegressor(random_state=138, verbose=500, iterations=500)
    target2.fit(train_x, train_y['target2'])

    target3 = catboost.CatBoostRegressor(random_state=138, verbose=500, iterations=500)
    target3.fit(train_x, train_y['target3'])

    target4 = catboost.CatBoostRegressor(random_state=138, verbose=500, iterations=500)
    target4.fit(train_x, train_y['target4'])

    target5 = catboost.CatBoostClassifier(random_state=138, verbose=500, iterations=500)
    target5.fit(train_x, train_y['target5'])

    for _ in range(15):

        target1_pred = target1.predict(valid_x)
        target2_pred = target2.predict(valid_x)
        target3_pred = target3.predict(valid_x)
        target4_pred = target4.predict(valid_x)
        target5_pred = target5.predict(valid_x)

        pred_df = pd.DataFrame({'시가' : [target1_pred[0]],
                            '고가' : [target2_pred[0]],
                            '저가' : [target3_pred[0]],
                            '종가' : [target4_pred[0]] ,
                            'increase2' : [target5_pred[0]]})

        Add_predicted_data_to_train(pred_df)
        train = pd.concat([train_x,pred_df], ignore_index = True)
        make_shift(train.iloc[-30:],7)
        valid_x = train.iloc[-1:]

  0%|          | 0/2 [00:00<?, ?it/s]

== finish ==
Learning rate set to 0.064181
0:	learn: 543.3074346	total: 10.2ms	remaining: 5.11s
499:	learn: 7.2603078	total: 5.91s	remaining: 0us
Learning rate set to 0.064181
0:	learn: 564.7011065	total: 20.8ms	remaining: 10.4s
499:	learn: 20.3862198	total: 6.1s	remaining: 0us
Learning rate set to 0.064181
0:	learn: 525.9366259	total: 9.71ms	remaining: 4.85s
499:	learn: 9.2283948	total: 4.96s	remaining: 0us
Learning rate set to 0.064181
0:	learn: 543.5557327	total: 10.3ms	remaining: 5.16s
499:	learn: 17.6599332	total: 7.03s	remaining: 0us
Learning rate set to 0.014295
0:	learn: 0.6909029	total: 10.2ms	remaining: 5.09s
499:	learn: 0.2679806	total: 5.16s	remaining: 0us


 50%|█████     | 1/2 [00:30<00:30, 30.50s/it]

== finish ==
Learning rate set to 0.064181
0:	learn: 787.3513986	total: 18.7ms	remaining: 9.34s
499:	learn: 12.1487446	total: 6.84s	remaining: 0us
Learning rate set to 0.064181
0:	learn: 825.7535216	total: 9.5ms	remaining: 4.74s
499:	learn: 24.4694067	total: 4.72s	remaining: 0us
Learning rate set to 0.064181
0:	learn: 764.1364317	total: 9.54ms	remaining: 4.76s
499:	learn: 17.6579735	total: 5.2s	remaining: 0us
Learning rate set to 0.064181
0:	learn: 793.3227720	total: 19ms	remaining: 9.47s
499:	learn: 23.9188417	total: 6.29s	remaining: 0us
Learning rate set to 0.014295
0:	learn: 0.6897013	total: 10.7ms	remaining: 5.33s
499:	learn: 0.2676641	total: 5.41s	remaining: 0us


100%|██████████| 2/2 [01:00<00:00, 30.14s/it]


In [126]:
valid_x

Unnamed: 0,시가,고가,저가,종가,gap_minmax,gap_day,increase,increase2,shift_시가_1,shift_시가_2,...,shift_increase2_3,shift_increase2_4,shift_increase2_5,shift_increase2_6,shift_increase2_7,target1,target2,target3,target4,target5
486,4918.391127,5081.997051,4781.108225,4872.928903,300.888826,45.462225,1,-1.0,,,...,,,,,,,,,,


In [123]:
tmp_df = Feature_Engineering(krs_Df,code)
make_shift(tmp_df,7)

train = tmp_df.iloc[7:-1].drop(columns=['일자','종목코드','종목명'])
valid = tmp_df.iloc[-1:].drop(columns=['일자','종목코드','종목명'])

train_x = train.drop(columns=['target1','target2','target3','target4','target5'])
train_y = train[['target1','target2','target3','target4','target5']]
valid_x = valid.drop(columns=['target1','target2','target3','target4','target5'])

== finish ==


In [124]:
train_x

Unnamed: 0,시가,고가,저가,종가,gap_minmax,gap_day,increase,increase2,shift_시가_1,shift_시가_2,...,shift_increase_5,shift_increase_6,shift_increase_7,shift_increase2_1,shift_increase2_2,shift_increase2_3,shift_increase2_4,shift_increase2_5,shift_increase2_6,shift_increase2_7


In [109]:
pred_df = pd.DataFrame({'시가' : [target1_pred[0]],
                        '고가' : [target2_pred[0]],
                        '저가' : [target3_pred[0]],
                        '종가' : [target4_pred[0]] ,
                        'increase2' : [target5_pred[0]]})

In [112]:
pd.concat([train_x,pred_df], ignore_index = True)

Unnamed: 0,시가,고가,저가,종가,gap_minmax,gap_day,increase,increase2,shift_시가_1,shift_시가_2,...,shift_increase_5,shift_increase_6,shift_increase_7,shift_increase2_1,shift_increase2_2,shift_increase2_3,shift_increase2_4,shift_increase2_5,shift_increase2_6,shift_increase2_7
0,3105.000000,3185.000000,3010.000000,3150.000000,175.0,-45.0,-1.0,-1.000000,3105.0,3155.0,...,0.0,1.0,-1.0,1.0,-1.0,1.0,1.0,1.0,-1.0,0.0
1,3145.000000,3175.000000,3060.000000,3085.000000,115.0,60.0,1.0,-1.000000,3105.0,3105.0,...,-1.0,0.0,1.0,-1.0,1.0,-1.0,1.0,1.0,1.0,-1.0
2,3085.000000,3100.000000,3030.000000,3080.000000,70.0,5.0,1.0,-1.000000,3145.0,3105.0,...,-1.0,-1.0,0.0,-1.0,-1.0,1.0,-1.0,1.0,1.0,1.0
3,3090.000000,3230.000000,3060.000000,3150.000000,170.0,-60.0,-1.0,1.000000,3085.0,3145.0,...,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,1.0,1.0
4,3130.000000,3180.000000,3085.000000,3160.000000,95.0,-30.0,-1.0,-1.000000,3090.0,3085.0,...,0.0,1.0,-1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
482,2300.000000,2315.000000,2260.000000,2285.000000,55.0,15.0,1.0,-1.000000,2295.0,2300.0,...,1.0,1.0,-1.0,-1.0,1.0,1.0,-1.0,1.0,-1.0,-1.0
483,2280.000000,2300.000000,2255.000000,2280.000000,45.0,0.0,0.0,-1.000000,2300.0,2295.0,...,-1.0,1.0,1.0,-1.0,-1.0,1.0,1.0,-1.0,1.0,-1.0
484,2260.000000,2285.000000,2235.000000,2245.000000,50.0,15.0,1.0,1.000000,2280.0,2300.0,...,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,1.0,1.0,-1.0,1.0
485,2245.000000,2915.000000,2230.000000,2910.000000,685.0,-665.0,-1.0,1.000000,2260.0,2280.0,...,1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,1.0,1.0,-1.0


In [None]:
Add_predicted_data_to_train(pred_df,7)

In [90]:
target5 = catboost.CatBoostClassifier(random_state=138, verbose=500, iterations=500)
target5.fit(train_x, train_y['target5'])
target5_pred = target5.predict(valid_x)

Learning rate set to 0.014295
0:	learn: 0.6909029	total: 22ms	remaining: 11s
499:	learn: 0.2679806	total: 8.51s	remaining: 0us


In [79]:
valid_y['target5'] - target5_pred

484    0.0
485    2.0
486    2.0
487    0.0
488   -2.0
489   -2.0
490    0.0
491    2.0
492    2.0
493    NaN
Name: target5, dtype: float64

In [45]:
train = tmp_df.iloc[7:-10].drop(columns=['일자','종목코드','종목명'])
valid = tmp_df[-10:].drop(columns=['일자','종목코드','종목명'])

train_x = train.drop(columns=['target1','target2','target3','target4','target5'])
train_y = train[['target1','target2','target3','target4','target5']]

valid_x = valid.drop(columns=['target1','target2','target3','target4','target5'])
valid_y = valid[['target1','target2','target3','target4','target5']]

In [91]:
target1 = catboost.CatBoostRegressor(random_state=138, verbose=500, iterations=500)
target1.fit(train_x, train_y['target1'])
target1_pred = target1.predict(valid_x)

Learning rate set to 0.064181
0:	learn: 543.3074346	total: 9.9ms	remaining: 4.94s
499:	learn: 7.2603078	total: 6.11s	remaining: 0us


In [58]:
valid_y['target2'] - target2_pred

484     34.109799
485    -16.209091
486     25.435661
487     17.896371
488     12.665185
489     -2.364791
490    -20.776123
491    635.479096
492    860.552658
493           NaN
Name: target2, dtype: float64

In [69]:
valid_y['target4'] - (target1_pred+target2_pred+target3_pred)/3

484     31.686769
485     30.277140
486     22.760527
487     29.971178
488      7.716622
489      3.978082
490    -31.748332
491    657.973981
492    450.266598
493           NaN
Name: target4, dtype: float64

In [56]:
valid_y['target1'] - target1_pred

484      7.297258
485     18.596416
486     26.396298
487     15.725052
488     16.878738
489     -7.490531
490    -17.029350
491     -7.102837
492    229.875356
493           NaN
Name: target1, dtype: float64

In [92]:
target2 = catboost.CatBoostRegressor(random_state=138, verbose=500, iterations=500)
target2.fit(train_x, train_y['target2'])
target2_pred = target2.predict(valid_x)

Learning rate set to 0.064181
0:	learn: 564.7011065	total: 9.83ms	remaining: 4.9s
499:	learn: 20.3862198	total: 7.12s	remaining: 0us


In [93]:
target3 = catboost.CatBoostRegressor(random_state=138, verbose=500, iterations=500)
target3.fit(train_x, train_y['target3'])
target3_pred = target3.predict(valid_x)

Learning rate set to 0.064181
0:	learn: 525.9366259	total: 11.4ms	remaining: 5.7s
499:	learn: 9.2283948	total: 5.21s	remaining: 0us


In [94]:
target4 = catboost.CatBoostRegressor(random_state=138, verbose=500, iterations=500)
target4.fit(train_x, train_y['target4'])
target4_pred = target4.predict(valid_x)

Learning rate set to 0.064181
0:	learn: 543.5557327	total: 9.53ms	remaining: 4.76s
499:	learn: 17.6599332	total: 7.01s	remaining: 0us


In [60]:
valid_y['target3'] - target3_pred

484    23.653252
485    38.444096
486    31.449621
487    -8.707887
488    13.605942
489    16.789568
490   -12.439523
491     5.545684
492    50.371781
493          NaN
Name: target3, dtype: float64

In [39]:
target5_pred

array([-1.])

In [44]:
target4_pred

array([2891.11339323])

In [51]:
valid_y['target4'] - target4_pred

484     29.703088
485     28.405786
486     29.537307
487     46.779499
488     17.748423
489     19.507464
490     -2.114246
491    657.755753
492    528.408131
493           NaN
Name: target4, dtype: float64

In [47]:
valid_y['target4'] - target4_pred

484     41.116610
485     29.407805
486     22.678336
487     27.141238
488     10.035243
489     30.360386
490    -12.898493
491    662.840427
492    498.752584
493           NaN
Name: target4, dtype: float64

In [52]:
valid_y['target4']

484    2235.0
485    2275.0
486    2290.0
487    2315.0
488    2285.0
489    2280.0
490    2245.0
491    2910.0
492    3015.0
493       NaN
Name: target4, dtype: float64

In [54]:
target4_pred

array([2205.29691239, 2246.59421396, 2260.46269322, 2268.22050145,
       2267.25157672, 2260.49253595, 2247.1142465 , 2252.24424705,
       2486.59186947, 2805.50754315])