In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
%matplotlib inline
plt.rcParams['figure.figsize'] = (24, 12)
plt.rcParams['lines.linewidth'] = 3

In [4]:
data_list = ['000270.KS', '000660.KS', '005380.KS',
             '005490.KS', '005930.KS', '006400.KS',
             '035720.KS', '051910.KS', '207940.KS']

data_name = ['기아', 'SK하이닉스', '현대차',
             '포스코', '삼성전자', '삼성SDI',
             '카카오', 'LG화학', '삼성바이오로직스']

In [23]:
for i in range(len(data_list)):
  if i == 0:
    name = data_list[0]
    data = pd.read_csv('/content/drive/MyDrive/ML/' + name + '.csv', index_col = 'Date') #/content/drive/MyDrive/ML/000270.KS.csv
    data = pd.DataFrame(data[data.columns[0]]) # Open price
    data.columns = [data_name[i]]
    data.index = pd.to_datetime(data.index)

    data_calendar = pd.date_range(start = '2017-02-28', end = '2022-02-28', freq = 'D') # Include COVID-19 period
    data_dummy = pd.DataFrame([0 for i in range(len(data_calendar))])
    data_dummy.index = data_calendar

    data = data_dummy.join(data[data_name[0]])
    data = data.drop([0], axis = 1)

  else:
    name = data_list[i]
    tmp = pd.read_csv('/content/drive/MyDrive/ML/' + name + '.csv', index_col = 'Date')
    tmp = pd.DataFrame(tmp[tmp.columns[4]])
    tmp.columns = [data_name[i]]
    tmp.index = pd.to_datetime(tmp.index)

    data = data.join(tmp[data_name[i]])

In [24]:
data

Unnamed: 0,기아,SK하이닉스,현대차,포스코,삼성전자,삼성SDI,카카오,LG화학,삼성바이오로직스
2017-02-28,38350.0,43463.667969,131274.531250,233702.828125,33358.113281,126753.085938,17008.529297,263705.59375,164500.0000
2017-03-01,,,,,,,,,
2017-03-02,38400.0,44394.367188,130396.453125,240297.593750,34468.906250,126753.085938,16988.611328,267898.84375,161500.0000
2017-03-03,37050.0,43835.945313,124688.882813,237000.171875,34382.121094,122822.757813,17088.193359,262773.78125,162000.0000
2017-03-04,,,,,,,,,
...,...,...,...,...,...,...,...,...,...
2022-02-24,77300.0,122500.000000,173000.000000,278000.000000,71500.000000,516000.000000,90000.000000,549000.00000,752017.3125
2022-02-25,74800.0,123000.000000,174000.000000,275500.000000,71900.000000,539000.000000,94400.000000,554000.00000,760922.8125
2022-02-26,,,,,,,,,
2022-02-27,,,,,,,,,


In [25]:
data = data.dropna(how='all')
data

Unnamed: 0,기아,SK하이닉스,현대차,포스코,삼성전자,삼성SDI,카카오,LG화학,삼성바이오로직스
2017-02-28,38350.0,43463.667969,131274.531250,233702.828125,33358.113281,126753.085938,17008.529297,263705.593750,164500.0000
2017-03-02,38400.0,44394.367188,130396.453125,240297.593750,34468.906250,126753.085938,16988.611328,267898.843750,161500.0000
2017-03-03,37050.0,43835.945313,124688.882813,237000.171875,34382.121094,122822.757813,17088.193359,262773.781250,162000.0000
2017-03-06,36600.0,45930.023438,127323.132813,237824.484375,34781.304688,122822.757813,16689.867188,256716.953125,164500.0000
2017-03-07,36400.0,45418.136719,126445.046875,237412.375000,34885.441406,122822.757813,16669.951172,258114.703125,168000.0000
...,...,...,...,...,...,...,...,...,...
2022-02-22,78700.0,131500.000000,183500.000000,281500.000000,74300.000000,555000.000000,91400.000000,629000.000000,766859.8125
2022-02-23,78700.0,131500.000000,183500.000000,281500.000000,74300.000000,555000.000000,91400.000000,629000.000000,760922.8125
2022-02-24,77300.0,122500.000000,173000.000000,278000.000000,71500.000000,516000.000000,90000.000000,549000.000000,752017.3125
2022-02-25,74800.0,123000.000000,174000.000000,275500.000000,71900.000000,539000.000000,94400.000000,554000.000000,760922.8125


In [26]:
data.isnull().sum()

기아          0
SK하이닉스      0
현대차         0
포스코         0
삼성전자        0
삼성SDI       0
카카오         0
LG화학        0
삼성바이오로직스    0
dtype: int64

In [27]:
# Function: Calculate returns

def cal_return(data, log = True):
  '''
  type(data) = pd.DataFrame
  '''
  tmp = data.copy()
  if log == True: # Log return
    result = np.log(tmp) - np.log(tmp.shift(1))
  else: # Return
    result = tmp.pct_change()
  return result

In [28]:
ret = cal_return(data)
ret = ret['2017-03-01':]
ret

Unnamed: 0,기아,SK하이닉스,현대차,포스코,삼성전자,삼성SDI,카카오,LG화학,삼성바이오로직스
2017-03-02,0.001303,0.021187,-0.006711,0.027828,0.032757,0.000000,-0.001172,0.015776,-0.018405
2017-03-03,-0.035789,-0.012658,-0.044758,-0.013817,-0.002521,-0.031499,0.005845,-0.019316,0.003091
2017-03-06,-0.012220,0.046665,0.020907,0.003472,0.011543,0.000000,-0.023586,-0.023319,0.015314
2017-03-07,-0.005479,-0.011207,-0.006920,-0.001734,0.002990,0.000000,-0.001194,0.005430,0.021053
2017-03-08,0.008208,-0.016529,0.003466,0.006921,0.000000,0.039221,-0.008398,-0.014546,0.011834
...,...,...,...,...,...,...,...,...,...
2022-02-22,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.010376
2022-02-23,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.007772
2022-02-24,-0.017949,-0.070896,-0.058923,-0.012511,-0.038414,-0.072861,-0.015436,-0.136033,-0.011773
2022-02-25,-0.032876,0.004073,0.005764,-0.009033,0.005579,0.043609,0.047731,0.009066,0.011773


In [29]:
y = pd.DataFrame()
y.index = pd.to_datetime(ret.index)

for name in data_name:
  col = np.where(ret[name] > 0, 1, np.where(ret[name] == 0, 0, -1))
  y[name] = col

y

Unnamed: 0,기아,SK하이닉스,현대차,포스코,삼성전자,삼성SDI,카카오,LG화학,삼성바이오로직스
2017-03-02,1,1,-1,1,1,0,-1,1,-1
2017-03-03,-1,-1,-1,-1,-1,-1,1,-1,1
2017-03-06,-1,1,1,1,1,0,-1,-1,1
2017-03-07,-1,-1,-1,-1,1,0,-1,1,1
2017-03-08,1,-1,1,1,0,1,-1,-1,1
...,...,...,...,...,...,...,...,...,...
2022-02-22,0,0,0,0,0,0,0,0,1
2022-02-23,0,0,0,0,0,0,0,0,-1
2022-02-24,-1,-1,-1,-1,-1,-1,-1,-1,-1
2022-02-25,-1,1,1,-1,1,1,1,1,1
