# ```stockPrice.ipynb```
- KRX KOSPI200의 구성 종목들에 대한 파일
- 아래 파일의 데이터를 수집, 전처리한 파일
  - ```components_list.csv```
  - ```stockPrice.parquet```

# import

In [1]:
import os
import sys
import time
import pickle
import warnings
import urllib.request
from glob import glob

import numpy as np
import pandas as pd

import requests
from bs4 import BeautifulSoup as bs

import FinanceDataReader as fdr
from tqdm import tqdm

# Do not truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

# The project helper module is stored in ../import, so that directory must be
# on sys.path before the import below can be resolved.
sys.path.append("../import")
import module as m

# Shared locations published by the helper module: the data directory and the
# target paths for the two files this notebook produces.
data_path = m.data_path
fp_stock = str(m.fp["stockPrice"])
fp_components = str(m.fp["components"])

data_path : ../data/
fp
{'components': '../data/components_list.csv',
 'esgRating': '../data/esgRating.parquet',
 'finaStat': '../data/finaStat.parquet',
 'indexPrice': '../data/indexPrice.parquet',
 'stockPrice': '../data/stockPrice.parquet'}


# ```components_list.csv```
  - 개요
    - KRX의 KOSPI200지수 구성 종목 리스트
      - components : 구성 요소를 의미
    - 분석의 대상을 추려 내기 위함.
  - 설명
    - 2010-06-30~2022-06-30 기간
    - 매년 6월과 12월의 마지막 거래일 기준 데이터
  - Data Collection Method
    - 직접 다운로드
    - Data Source
      - [KRX (만료된 URL)](http://index.krx.co.kr/contents/MKD/03/0304/03040101/MKD03040101.jsp?idxCd=1028&upmidCd=0102#a110dc6b3a1678330158473e0d0ffbf0=3)
      - [KRX Market Data System](http://data.krx.co.kr/contents/MDC/MDI/mdiLoader/index.cmd?menuId=MDC0201010106)

## Data Load

In [2]:
# Collect the semi-annual KOSPI200 constituent snapshots.
# glob() returns paths in arbitrary, OS-dependent order. Sorting puts the
# date-suffixed file names (..._YYYYMMDD.csv) in chronological order, which the
# later drop_duplicates(keep="last") step relies on to keep the newest record
# of each ticker.
list_file = sorted(
    glob(f"{data_path}KRX_KOSPI200_components/KRX_KOSPI200_components*.csv")
)
print(list_file)

['../data/KRX_KOSPI200_components\\KRX_KOSPI200_components_20100630.csv', '../data/KRX_KOSPI200_components\\KRX_KOSPI200_components_20101230.csv', '../data/KRX_KOSPI200_components\\KRX_KOSPI200_components_20110630.csv', '../data/KRX_KOSPI200_components\\KRX_KOSPI200_components_20111230.csv', '../data/KRX_KOSPI200_components\\KRX_KOSPI200_components_20120629.csv', '../data/KRX_KOSPI200_components\\KRX_KOSPI200_components_20121228.csv', '../data/KRX_KOSPI200_components\\KRX_KOSPI200_components_20130628.csv', '../data/KRX_KOSPI200_components\\KRX_KOSPI200_components_20131230.csv', '../data/KRX_KOSPI200_components\\KRX_KOSPI200_components_20140630.csv', '../data/KRX_KOSPI200_components\\KRX_KOSPI200_components_20141230.csv', '../data/KRX_KOSPI200_components\\KRX_KOSPI200_components_20150630.csv', '../data/KRX_KOSPI200_components\\KRX_KOSPI200_components_20151230.csv', '../data/KRX_KOSPI200_components\\KRX_KOSPI200_components_20160630.csv', '../data/KRX_KOSPI200_components\\KRX_KOSPI200_com

## df_components로 병합

In [3]:
# Read every snapshot file (CP949-encoded CSV) and stack the frames into a
# single DataFrame, keeping the order of list_file.
df_components = pd.concat(
    [pd.read_csv(csv_path, encoding="cp949") for csv_path in list_file]
)
# Display the merged frame.
df_components

Unnamed: 0,종목코드,종목명,종가,대비,등락률,상장시가총액
0,5930,삼성전자,1058000,-3000,-0.28,155842699.0
1,5380,현대차,213000,0,0.00,46918890.0
2,5490,POSCO,380000,-5500,-1.43,33130997.0
3,12330,현대모비스,292000,1500,0.52,28424408.0
4,270,기아차,66700,500,0.76,26934445.0
...,...,...,...,...,...,...
196,34120,SBS,20000,400,2.04,365052.0
197,64960,S&T모티브,27050,-850,-3.05,395556.0
198,33920,무학,13600,200,1.49,387600.0
199,97230,한진중공업,1655,15,0.91,175517.0


## 전처리
- df_components 전처리

In [4]:
# One row per ticker: when a ticker occurs in several snapshots keep its last
# occurrence, retain only the code/name pair, and order the rows by code.
df_components = (
    df_components.drop_duplicates(subset=["종목코드"], keep="last")
    .loc[:, ["종목코드", "종목명"]]
    .sort_values("종목코드")
)
# Convert the ticker codes with the project helper m.six_digit (the displayed
# result shows zero-padded six-character codes such as 000020).
df_components["종목코드"] = df_components["종목코드"].apply(m.six_digit)
# Display the result.
df_components

Unnamed: 0,종목코드,종목명
179,000020,동화약품
27,000030,우리은행
189,000050,경방
162,000070,삼양홀딩스
128,000080,하이트진로
...,...,...
56,271560,오리온
70,282330,BGF리테일
156,285130,SK케미칼
100,294870,HDC현대산업개발


## 영속화
- df_components
- components_list.csv

In [5]:
# Persist the cleaned constituent list through the project helper m.DfPrst.
# fp_components is the path taken from m.fp["components"].
m.DfPrst(df_components, fp_components)

['../data/components_list.csv']


# ```stockPrice.parquet```
  - 개요
    - KRX KOSPI200 구성종목 주식가격 데이터
  - Data Collection Method
    - [FinanceDataReader](https://github.com/financedata-org/FinanceDataReader)

## Data Load

In [6]:
# Reload the persisted constituent list through the project loader m.DataLoad
# (the loader also prints a summary of the frame; see the cell output).
df_components = m.DataLoad(fp_components)

Mem. usage decreased to  0.00 Mb (24.3% reduction)
df["종목코드"].apply(six_digit)


[1m┌▣ [4mdf.shape[0m ---- ---- ---- ----
(299, 2)


[1m┌▣ [4mdf.info()[0m ---- ---- ---- ----
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   종목코드    299 non-null    object
 1   종목명     299 non-null    object
dtypes: object(2)
memory usage: 4.8+ KB
None


[1m┌▣ [4mdf.head()[0m ---- ---- ---- ----


Unnamed: 0,종목코드,종목명
0,20,동화약품
1,30,우리은행
2,50,경방
3,70,삼양홀딩스
4,80,하이트진로




[1m┌▣ [4mdf.columns.to_list()[0m ---- ---- ---- ----
['종목코드', '종목명']


## 코스피200 종목의 주가정보 가져오기

In [7]:
# Fetch the listing of every stock on KRX through FinanceDataReader.
df_krx = fdr.StockListing("KRX")
df_krx.head(2)

Unnamed: 0,Code,ISU_CD,Name,Market,Dept,Close,ChangeCode,Changes,ChagesRatio,Open,High,Low,Volume,Amount,Marcap,Stocks,MarketId
0,5930,KR7005930003,삼성전자,KOSPI,,63800,1,300,0.47,63900,64000,63000,15194598,967336146677,380872126690000,5969782550,STK
1,373220,KR7373220003,LG에너지솔루션,KOSPI,,535000,1,2000,0.38,533000,541000,530000,311193,166315331500,125190000000000,234000000,STK


In [8]:
# From the full KRX listing keep only the rows whose Market is KOSPI.
is_kospi = df_krx["Market"] == "KOSPI"
df_kospi = df_krx[is_kospi]
df_kospi.head(2)

Unnamed: 0,Code,ISU_CD,Name,Market,Dept,Close,ChangeCode,Changes,ChagesRatio,Open,High,Low,Volume,Amount,Marcap,Stocks,MarketId
0,5930,KR7005930003,삼성전자,KOSPI,,63800,1,300,0.47,63900,64000,63000,15194598,967336146677,380872126690000,5969782550,STK
1,373220,KR7373220003,LG에너지솔루션,KOSPI,,535000,1,2000,0.38,533000,541000,530000,311193,166315331500,125190000000000,234000000,STK


In [9]:
# Restrict the KOSPI listing to tickers present in the constituent list, then
# order the rows by ticker code. Constituents missing from the current listing
# drop out here.
in_components = df_kospi["Code"].isin(df_components["종목코드"].values)
df_kospi200 = df_kospi[in_components].sort_values("Code", ascending=True)
df_kospi200.head(2)

Unnamed: 0,Code,ISU_CD,Name,Market,Dept,Close,ChangeCode,Changes,ChagesRatio,Open,High,Low,Volume,Amount,Marcap,Stocks,MarketId
735,20,KR7000020008,동화약품,KOSPI,,9400,2,-80,-0.84,9470,9500,9390,52676,496658090,262555818000,27931470,STK
653,50,KR7000050005,경방,KOSPI,,10950,1,70,0.64,10920,10980,10830,4765,52025920,300197206500,27415270,STK


In [10]:
# Ticker codes and company names of the matched companies, taken from the same
# frame so that position i in both lists refers to the same row.
list_code = df_kospi200["Code"].tolist()
list_name = df_kospi200["Name"].tolist()

# Report the two lengths (code count first, then name count).
print(len(list_code))
print(len(list_name))

273
273


In [11]:
# Download the daily prices of every matched ticker from 2011 onward and tag
# each frame with its ticker code and company name.
# Code and name are paired with zip() and attached in the same pass as the
# download, instead of three separate index loops over parallel lists.
df = []
for code, name in tqdm(zip(list_code, list_name), total=len(list_code)):
    stock = fdr.DataReader(code, "2011")
    stock["Code"] = code
    stock["Name"] = name
    df.append(stock)

# Stack the per-ticker frames and turn the date index into a regular column.
df_stock_raw = pd.concat(df).reset_index()

# (Optional) The download is slow, so cache the raw result on disk.
with open("df_stock_raw.pickle", "wb") as f:
    pickle.dump(df_stock_raw, f)

# Work on a copy so the raw download stays untouched in memory.
df_stock = df_stock_raw.copy()

# Preview a few random rows of the merged data.
df_stock.sample(5)

100%|██████████| 273/273 [01:10<00:00,  3.85it/s]
100%|██████████| 273/273 [00:00<00:00, 5183.17it/s]
100%|██████████| 273/273 [00:00<00:00, 4442.46it/s]


Unnamed: 0,Date,Open,High,Low,Close,Volume,Change,Code,Name
113193,2022-11-28,14950,15100,14700,15050,44309,-0.006601,1800,오리온홀딩스
140741,2013-10-01,413500,428000,410000,418500,1855,0.012092,2960,한국쉘석유
256111,2022-05-31,9500,9500,9380,9480,10628,0.004237,5740,크라운해태홀딩스
708677,2018-11-22,19791,20015,19289,19290,343971,-0.014257,97230,HJ중공업
639502,2021-12-29,83200,83200,80800,81300,373801,-0.049123,71050,한국금융지주


In [13]:
# (Optional) Restore the cached raw download instead of fetching it again.
# NOTE: pickle.load executes arbitrary code from the file; only load a cache
# that this notebook wrote itself.
with open("df_stock_raw.pickle", mode="rb") as backup_file:
    df_stock = pickle.load(backup_file)
# Display the restored frame.
df_stock

Unnamed: 0,Date,Open,High,Low,Close,Volume,Change,Code,Name
0,2011-01-03,5320,5370,5220,5280,78725,-0.001890,000020,동화약품
1,2011-01-04,5250,5330,5230,5300,58101,0.003788,000020,동화약품
2,2011-01-05,5300,5350,5220,5240,78870,-0.011321,000020,동화약품
3,2011-01-06,5240,5250,5070,5110,174303,-0.024809,000020,동화약품
4,2011-01-07,5110,5150,5080,5090,36832,-0.003914,000020,동화약품
...,...,...,...,...,...,...,...,...,...
786966,2023-01-30,69500,70500,68400,69500,76977,0.004335,298040,효성중공업
786967,2023-01-31,69400,72100,68500,71800,113207,0.033094,298040,효성중공업
786968,2023-02-01,70900,71400,68300,70100,139363,-0.023677,298040,효성중공업
786969,2023-02-02,70700,70700,68600,69500,114559,-0.008559,298040,효성중공업


## 전처리

In [14]:
# Map the FinanceDataReader column names to the Korean names used downstream.
# Renaming by name (rather than overwriting df_stock.columns positionally)
# keeps every label attached to the right data even if the source column order
# changes. errors="raise" fails loudly when an expected column is missing.
dict_colName = {
    "Date": "연_월_일",
    "Open": "시가",
    "High": "고가",
    "Low": "저가",
    "Close": "종가",
    "Volume": "거래량",
    "Change": "등락률",
    "Code": "종목코드",
    "Name": "종목명",
}
df_stock = df_stock.rename(columns=dict_colName, errors="raise")
# Column order: identifiers first, then the date, then the price/volume fields.
list_colOrder = ["종목코드", "종목명", "연_월_일", "시가", "고가", "저가", "종가", "거래량", "등락률"]
df_stock = df_stock[list_colOrder]
df_stock.head(2)

Unnamed: 0,종목코드,종목명,연_월_일,시가,고가,저가,종가,거래량,등락률
0,20,동화약품,2011-01-03,5320,5370,5220,5280,78725,-0.00189
1,20,동화약품,2011-01-04,5250,5330,5230,5300,58101,0.003788


In [15]:
# Derive calendar columns from 연_월_일 via the project helper. With
# inplace=True the helper is expected to add them to df_stock itself (the next
# cell reads df_stock["연"]) — confirm the exact contract in module.py.
m.DerivedCol_Date(df_stock, col_YMD="연_월_일", inplace=True)

inplace : True


Unnamed: 0,종목코드,종목명,연_월_일,시가,고가,저가,종가,거래량,등락률,연,분기,월,연_분기,연_월,분기_월,연_분기_월,일,월_일
0,000020,동화약품,2011-01-03,5320,5370,5220,5280,78725,-0.001890,2011,1,1,2011-1,2011-1,1-1,2011-1-1,3,1-3
1,000020,동화약품,2011-01-04,5250,5330,5230,5300,58101,0.003788,2011,1,1,2011-1,2011-1,1-1,2011-1-1,4,1-4
2,000020,동화약품,2011-01-05,5300,5350,5220,5240,78870,-0.011321,2011,1,1,2011-1,2011-1,1-1,2011-1-1,5,1-5
3,000020,동화약품,2011-01-06,5240,5250,5070,5110,174303,-0.024809,2011,1,1,2011-1,2011-1,1-1,2011-1-1,6,1-6
4,000020,동화약품,2011-01-07,5110,5150,5080,5090,36832,-0.003914,2011,1,1,2011-1,2011-1,1-1,2011-1-1,7,1-7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
786966,298040,효성중공업,2023-01-30,69500,70500,68400,69500,76977,0.004335,2023,1,1,2023-1,2023-1,1-1,2023-1-1,30,1-30
786967,298040,효성중공업,2023-01-31,69400,72100,68500,71800,113207,0.033094,2023,1,1,2023-1,2023-1,1-1,2023-1-1,31,1-31
786968,298040,효성중공업,2023-02-01,70900,71400,68300,70100,139363,-0.023677,2023,1,2,2023-1,2023-2,1-2,2023-1-2,1,2-1
786969,298040,효성중공업,2023-02-02,70700,70700,68600,69500,114559,-0.008559,2023,1,2,2023-1,2023-2,1-2,2023-1-2,2,2-2


In [17]:
# Keep only the rows whose year lies in 2010-2018, both ends included.
in_period = df_stock["연"].between(2010, 2018)
df_stock = df_stock[in_period]
df_stock

Unnamed: 0,종목코드,종목명,연_월_일,시가,고가,저가,종가,거래량,등락률,연,분기,월,연_분기,연_월,분기_월,연_분기_월,일,월_일
0,000020,동화약품,2011-01-03,5320,5370,5220,5280,78725,-0.001890,2011,1,1,2011-1,2011-1,1-1,2011-1-1,3,1-3
1,000020,동화약품,2011-01-04,5250,5330,5230,5300,58101,0.003788,2011,1,1,2011-1,2011-1,1-1,2011-1-1,4,1-4
2,000020,동화약품,2011-01-05,5300,5350,5220,5240,78870,-0.011321,2011,1,1,2011-1,2011-1,1-1,2011-1-1,5,1-5
3,000020,동화약품,2011-01-06,5240,5250,5070,5110,174303,-0.024809,2011,1,1,2011-1,2011-1,1-1,2011-1-1,6,1-6
4,000020,동화약품,2011-01-07,5110,5150,5080,5090,36832,-0.003914,2011,1,1,2011-1,2011-1,1-1,2011-1-1,7,1-7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
785955,298040,효성중공업,2018-12-21,46700,46750,43350,44100,142861,-0.049569,2018,4,12,2018-4,2018-12,4-12,2018-4-12,21,12-21
785956,298040,효성중공업,2018-12-24,44100,44400,43250,43950,23168,-0.003401,2018,4,12,2018-4,2018-12,4-12,2018-4-12,24,12-24
785957,298040,효성중공업,2018-12-26,43050,43750,41300,42350,46627,-0.036405,2018,4,12,2018-4,2018-12,4-12,2018-4-12,26,12-26
785958,298040,효성중공업,2018-12-27,42350,42850,41550,42300,26867,-0.001181,2018,4,12,2018-4,2018-12,4-12,2018-4-12,27,12-27


### MinMaxScaling
- 일반적인 MinMaxScaling은 컬럼의 Min과 Max를 기준으로 스케일링되지만
- 이 분석의 경우에는 적절하지 못하므로 개별 종목의 Min과 Max를 기준으로 스케일링을 진행함.
- 액면분할과 액면병합을 고려하지 않아 한계점이 존재함.

In [18]:
# Min-max scale 시가, 종가 and 거래량 within each (종목코드, 종목명) group using the
# project helper; the displayed result gains 시가_mmscl, 종가_mmscl and
# 거래량_mmscl columns.
df_stock = m.DerivedCol_Groupby_MinMaxScaler(df_stock, ["종목코드", "종목명"],["시가", "종가", "거래량"])
df_stock

Unnamed: 0,종목코드,종목명,연_월_일,시가,고가,저가,종가,거래량,등락률,연,분기,월,연_분기,연_월,분기_월,연_분기_월,일,월_일,시가_mmscl,종가_mmscl,거래량_mmscl
0,000020,동화약품,2011-01-03,5320,5370,5220,5280,78725,-0.001890,2011,1,1,2011-1,2011-1,1-1,2011-1-1,3,1-3,0.141775,0.133260,0.015814
1,000020,동화약품,2011-01-04,5250,5330,5230,5300,58101,0.003788,2011,1,1,2011-1,2011-1,1-1,2011-1-1,4,1-4,0.134199,0.135463,0.011268
2,000020,동화약품,2011-01-05,5300,5350,5220,5240,78870,-0.011321,2011,1,1,2011-1,2011-1,1-1,2011-1-1,5,1-5,0.139610,0.128855,0.015846
3,000020,동화약품,2011-01-06,5240,5250,5070,5110,174303,-0.024809,2011,1,1,2011-1,2011-1,1-1,2011-1-1,6,1-6,0.133117,0.114537,0.036882
4,000020,동화약품,2011-01-07,5110,5150,5080,5090,36832,-0.003914,2011,1,1,2011-1,2011-1,1-1,2011-1-1,7,1-7,0.119048,0.112335,0.006580
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
785955,298040,효성중공업,2018-12-21,46700,46750,43350,44100,142861,-0.049569,2018,4,12,2018-4,2018-12,4-12,2018-4-12,21,12-21,0.283843,0.174468,0.086669
785956,298040,효성중공업,2018-12-24,44100,44400,43250,43950,23168,-0.003401,2018,4,12,2018-4,2018-12,4-12,2018-4-12,24,12-24,0.170306,0.168085,0.005966
785957,298040,효성중공업,2018-12-26,43050,43750,41300,42350,46627,-0.036405,2018,4,12,2018-4,2018-12,4-12,2018-4-12,26,12-26,0.124454,0.100000,0.021784
785958,298040,효성중공업,2018-12-27,42350,42850,41550,42300,26867,-0.001181,2018,4,12,2018-4,2018-12,4-12,2018-4-12,27,12-27,0.093886,0.097872,0.008460


## 영속화

In [20]:
# Persist the processed price data through the project helper m.DfPrst.
# fp_stock is the path taken from m.fp["stockPrice"].
m.DfPrst(df_stock, fp_stock)

['../data/stockPrice.parquet']
