# 한국 주식 데이터를 크롤링합니다.

## 예전에 <심이삭>이 만들어 두었던 것을 참조합니다.
## 링크: https://github.com/gilgarad/stock_predict/blob/master/src/data_crawler/stock_info.py

In [1]:
import pandas as pd
from datetime import datetime
import numpy as np
from os.path import join, exists, dirname
from os import makedirs

# 크롤링 구현 순서:

# 1. 주가시스템에 있는 회사 리스트 가지고 오기
## 설명: 등록된 회사 리스트와 종목 코드를 수집합니다.

In [8]:
# Download the KRX listed-company table; read_html returns a list of tables and the first one is used.
code_df = pd.read_html(
    'http://kind.krx.co.kr/corpgeneral/corpList.do?method=download&searchType=13',
    header=0,
)[0]
# Stock codes are six digits long, so zero-pad the parsed values back to that width.
code_df['종목코드'] = code_df['종목코드'].map('{:06d}'.format)
# Only the company name and stock code are needed; drop every other column.
code_df = code_df.loc[:, ['회사명', '종목코드']]
# Rename the Korean column headers to English ones.
company_info = code_df.rename(columns={'회사명': 'name', '종목코드': 'code'})

In [9]:
# Notebook cell expression: shows the company_info table (columns 'name' and 'code') as the cell output.
company_info

Unnamed: 0,name,code
0,GS글로벌,001250
1,HSD엔진,082740
2,KG케미칼,001390
3,LG이노텍,011070
4,OCI,010060
5,SK네트웍스,001740
6,SK이노베이션,096770
7,STX,011810
8,WISCOM,024070
9,갤럭시아에스엠,011420


# 2. 데이터 수집하기
## 내용: 위에서 수집한 종목코드를 이용해 네이버 금융에서 주가 데이터를 수집합니다.

In [18]:
def get_stock_data(company_name, by_year=None, recent_n_data=None):
    """Crawl the daily price table of one company from Naver Finance.

    The stock code is looked up in the module-level ``company_info`` table
    (columns ``name`` and ``code``), then the paginated daily-price pages are
    fetched one by one with ``pandas.read_html``.

    Args:
        company_name: Company name exactly as stored in ``company_info['name']``.
        by_year: Optional iterable of years to keep (e.g. ``[2018, 2019]``).
            Ignored when ``recent_n_data`` is given.
        recent_n_data: Optional two-item sequence ``[yyyymmdd, n]``. Collection
            starts at the first row whose date contains that day and keeps at
            most ``n`` rows from there on, in the order the site lists them
            (assumed newest first, as the original notes describe this mode as
            "the most recent n days from the target date").

    Returns:
        DataFrame with the columns ``date``, ``final_price``,
        ``compare_to_prior``, ``start_price``, ``highest_price``,
        ``lowest_price``, ``num_of_traded`` and ``name``. It is empty when no
        matching rows were found.

    Raises:
        ValueError: If ``company_name`` is not present in ``company_info``.
    """
    started_at = datetime.now()
    print('Crawling Company Name:', company_name)

    # 1. Resolve the stock code and build the base URL to crawl.
    #    A boolean mask is used instead of DataFrame.query so that names
    #    containing quote characters cannot break the lookup.
    codes = company_info.loc[company_info['name'] == company_name, 'code']
    if codes.empty:
        raise ValueError('Unknown company name: {}'.format(company_name))
    code = str(codes.iloc[0])
    url = 'http://finance.naver.com/item/sise_day.nhn?code={code}'.format(code=code)
    print("요청 URL = {}".format(url))

    # 2. (optional) Restrict collection to given years, or to the n rows
    #    starting at a target date.
    by_year = [] if by_year is None else list(by_year)
    recent_n_data = [] if recent_n_data is None else list(recent_n_data)

    target_date = None
    num_seq = None
    if len(recent_n_data) == 2:
        raw_date = str(recent_n_data[0])
        target_date = raw_date[:4] + '.' + raw_date[4:6] + '.' + raw_date[6:8]  # yyyy.mm.dd
        num_seq = int(recent_n_data[1])

    min_year = None
    if by_year and target_date is None:
        min_year = min(int(yr) for yr in by_year)
        print('Collecting years... ', by_year)

    # 3. The site only exposes the data page by page, so walk the pages up
    #    to a fixed maximum.
    max_page = 1000
    pages = []          # per-page frames, concatenated once at the end
    collected_rows = 0  # rows gathered so far in target-date mode
    seen_dates = set()  # every date fetched so far, for end-of-data detection

    for page in range(1, max_page):
        pg_url = '{url}&page={page}'.format(url=url, page=page)
        d = pd.read_html(pg_url, header=0)[0].dropna()

        # 3-1. Stop when a page has no data rows, or when its first row was
        #      already fetched (the original end-of-data check relies on the
        #      site serving an already-seen page once the last one is passed).
        #      Rows are addressed by position because dropna() leaves gaps in
        #      the index labels.
        if d.empty:
            print('Break page:', page)
            break
        page_dates = d['날짜'].tolist()
        if page_dates[0] in seen_dates:
            # Max page reached
            print('Break page:', page)
            break
        seen_dates.update(page_dates)

        # 3-2. Target-date mode: skip pages until the target date shows up,
        #      then keep that row and everything listed after it.
        if target_date is not None:
            if not pages:
                hit_mask = d['날짜'].astype(str).str.contains(target_date, regex=False)
                if not hit_mask.any():
                    continue
                first_hit = int(hit_mask.to_numpy().nonzero()[0][0])
                d = d.iloc[first_hit:]

            pages.append(d)
            collected_rows += len(d)
            if collected_rows >= num_seq:
                break
            continue

        pages.append(d)

        # 3-3. Year mode: once the last row of a page predates the earliest
        #      requested year, nothing further is needed. The page is kept
        #      first because its earlier rows may still belong to that year;
        #      step 5 removes the surplus rows.
        if min_year is not None and int(str(d['날짜'].iloc[-1])[:4]) < min_year:
            print('Found minimum year, break page:', page)
            break

    # 4. Light post-processing of the collected rows.
    column_map = {'날짜': 'date', '종가': 'final_price', '전일비': 'compare_to_prior', '시가': 'start_price',
                  '고가': 'highest_price', '저가': 'lowest_price', '거래량': 'num_of_traded'}
    if pages:
        df = pd.concat(pages, ignore_index=True)
    else:
        # Nothing collected: return an empty frame that still has the expected columns.
        df = pd.DataFrame(columns=list(column_map))
    if num_seq is not None:
        df = df.head(num_seq).copy()
    df['name'] = company_name
    df = df.rename(columns=column_map)

    # 5. Keep only rows whose year is one of the requested years.
    if min_year is not None:
        wanted_years = {str(int(yr)) for yr in by_year}
        df = df[df['date'].astype(str).str[:4].isin(wanted_years)]

    print('Elapsed time:', datetime.now() - started_at)

    return df

In [19]:
# Crawl every company listed in company_info and gather the results in all_data.
# Frames are accumulated in a list and concatenated once, instead of re-copying
# the growing frame for every company.
all_data = pd.DataFrame()
collected_frames = []
collect_start_time = datetime.now()
try:
    for idx, company_name in enumerate(company_info['name']):
        print('[%i]' % idx)
        collected_frames.append(get_stock_data(company_name=company_name))
finally:
    # Also runs when the crawl is interrupted or fails, so whatever was
    # collected up to that point is still available in all_data.
    if collected_frames:
        all_data = pd.concat(collected_frames)
print('Crawling data finished. Total elapsed time:', datetime.now() - collect_start_time)

[0]
Crawling Company Name: GS글로벌
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=001250
Break page: 577
Elapsed time: 0:00:59.333009
[1]
Crawling Company Name: HSD엔진
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=082740
Break page: 207
Elapsed time: 0:00:15.257557
[2]
Crawling Company Name: KG케미칼
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=001390
Break page: 577
Elapsed time: 0:00:48.882805
[3]
Crawling Company Name: LG이노텍
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=011070
Break page: 269
Elapsed time: 0:00:26.519974
[4]
Crawling Company Name: OCI
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=010060
Break page: 577
Elapsed time: 0:00:57.889990
[5]
Crawling Company Name: SK네트웍스
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=001740
Break page: 577
Elapsed time: 0:00:55.899363
[6]
Crawling Company Name: SK이노베이션
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=096770
Break page: 293
Elapsed time: 0:00:23.968411
[7]
C

Break page: 67
Elapsed time: 0:00:07.008079
[58]
Crawling Company Name: 제이에스코퍼레이션
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=194370
Break page: 81
Elapsed time: 0:00:09.137375
[59]
Crawling Company Name: 제일약품
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=271980
Break page: 46
Elapsed time: 0:00:04.971345
[60]
Crawling Company Name: 제일연마
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=001560
Break page: 333
Elapsed time: 0:00:40.974248
[61]
Crawling Company Name: 제일파마홀딩스
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=002620
Break page: 577
Elapsed time: 0:01:21.674658
[62]
Crawling Company Name: 제주항공
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=089590
Break page: 87
Elapsed time: 0:00:10.835987
[63]
Crawling Company Name: 조일알미늄
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=018470
Break page: 574
Elapsed time: 0:01:13.492238
[64]
Crawling Company Name: 조흥
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=002600
Bre

Break page: 519
Elapsed time: 0:01:12.375696
[116]
Crawling Company Name: 뉴프렉스
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=085670
Break page: 331
Elapsed time: 0:00:49.244193
[117]
Crawling Company Name: 대동기어
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=008830
Break page: 519
Elapsed time: 0:01:12.591507
[118]
Crawling Company Name: 대봉엘에스
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=078140
Break page: 332
Elapsed time: 0:00:39.700730
[119]
Crawling Company Name: 대유위니아
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=071460
Break page: 71
Elapsed time: 0:00:07.913114
[120]
Crawling Company Name: 대한광통신
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=010170
Break page: 519
Elapsed time: 0:01:11.778326
[121]
Crawling Company Name: 대한뉴팜
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=054670
Break page: 427
Elapsed time: 0:00:58.673658
[122]
Crawling Company Name: 대한약품
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=0239

Break page: 250
Elapsed time: 0:00:45.597568
[174]
Crawling Company Name: 에이치엔티
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=176440
Break page: 61
Elapsed time: 0:00:06.995950
[175]
Crawling Company Name: 에이티넘인베스트
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=021080
Break page: 519
Elapsed time: 0:01:22.047153
[176]
Crawling Company Name: 에이피티씨
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=089970
Break page: 19
Elapsed time: 0:00:02.170315
[177]
Crawling Company Name: 에코마이스터
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=064510
Break page: 30
Elapsed time: 0:00:03.160913
[178]
Crawling Company Name: 에코마케팅
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=230360
Break page: 69
Elapsed time: 0:00:09.365820
[179]
Crawling Company Name: 에프에스티
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=036810
Break page: 475
Elapsed time: 0:01:04.281838
[180]
Crawling Company Name: 엑셈
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=20

Break page: 60
Elapsed time: 0:00:06.837986
[232]
Crawling Company Name: 프로스테믹스
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=203690
Break page: 115
Elapsed time: 0:00:14.048056
[233]
Crawling Company Name: 피에스엠씨
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=024850
Break page: 454
Elapsed time: 0:01:00.380757
[234]
Crawling Company Name: 피에스케이
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=319660
Break page: 2
Elapsed time: 0:00:00.277272
[235]
Crawling Company Name: 하나금융10호스팩
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=270520
Break page: 43
Elapsed time: 0:00:04.659610
[236]
Crawling Company Name: 하이셈
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=200470
Break page: 109
Elapsed time: 0:00:12.456548
[237]
Crawling Company Name: 한국맥널티
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=222980
Break page: 84
Elapsed time: 0:00:09.717301
[238]
Crawling Company Name: 한국전자금융
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=

KeyboardInterrupt: 

# 3. 수집된 모든 데이터를 pandas를 이용해 CSV 파일로 저장합니다.

In [20]:
# Write everything collected so far to a single CSV file, then confirm on stdout.
all_data.to_csv(path_or_buf='/data1/stock_data/2019_05_14_stock_data.csv')
print('Data saved!!')

Data saved!!


# Extras: #1. 주가 데이터에 여러 feature를 더하는 함수들입니다.

In [21]:
class Features:
    """Technical-indicator helpers for a daily price DataFrame.

    Every method adds its result columns to the frame it receives (in place)
    and returns that same frame. The frames are expected to carry the columns
    named in each method (``final_price``, and for the stochastic oscillator
    also ``lowest_price`` and ``highest_price``). Rows are presumably in
    ascending time order, as the original note on ``fnStoch`` states.
    """

    @staticmethod
    def fnMACD(m_Df, m_NumFast=12, m_NumSlow=26, m_NumSignal=9):
        """Add EMAFast, EMASlow, MACD, MACDSignal and MACDDiff columns.

        Args:
            m_Df: Frame with a ``final_price`` column.
            m_NumFast: Span of the fast exponential moving average.
            m_NumSlow: Span of the slow exponential moving average.
            m_NumSignal: Span of the EMA applied to the MACD line.

        Returns:
            ``m_Df`` with the new columns added.
        """
        close = m_Df['final_price']
        m_Df['EMAFast'] = close.ewm(span=m_NumFast, min_periods=m_NumFast - 1).mean()
        m_Df['EMASlow'] = close.ewm(span=m_NumSlow, min_periods=m_NumSlow - 1).mean()
        m_Df['MACD'] = m_Df['EMAFast'] - m_Df['EMASlow']
        m_Df['MACDSignal'] = m_Df['MACD'].ewm(span=m_NumSignal, min_periods=m_NumSignal - 1).mean()
        m_Df['MACDDiff'] = m_Df['MACD'] - m_Df['MACDSignal']
        return m_Df

    @staticmethod
    def fnBolingerBand(m_DF, n=20, k=2):
        """Add the rolling mean and the upper/lower Bollinger bands.

        Args:
            m_DF: Frame with a ``final_price`` column.
            n: Rolling window length.
            k: Number of rolling standard deviations between mean and band.

        Returns:
            ``m_DF`` with ``20d_ma``, ``Bol_upper`` and ``Bol_lower`` added.
            The mean column keeps the name ``20d_ma`` whatever ``n`` is.
        """
        # pd.rolling_mean / pd.rolling_std no longer exist in pandas; the
        # Series.rolling API computes the same values.
        price = m_DF['final_price']
        moving_avg = price.rolling(window=n).mean()
        band_width = k * price.rolling(window=n, min_periods=n).std()

        m_DF['20d_ma'] = moving_avg
        m_DF['Bol_upper'] = moving_avg + band_width
        m_DF['Bol_lower'] = moving_avg - band_width

        return m_DF

    @staticmethod
    def fnRSI(m_Df, m_N=14):
        """Add an ``RSI`` column: average gain / (average gain + average loss) * 100.

        Args:
            m_Df: Frame with a ``final_price`` column.
            m_N: Rolling window length for the average gain and loss.

        Returns:
            ``m_Df`` with the ``RSI`` column added.
        """
        delta = m_Df['final_price'].diff(1)
        # Gains and losses as non-negative series; the first row (NaN diff) counts as 0.
        gains = delta.where(delta > 0, 0.0)
        losses = (-delta).where(delta < 0, 0.0)

        avg_gain = gains.rolling(window=m_N, min_periods=m_N).mean()
        avg_loss = losses.rolling(window=m_N, min_periods=m_N).mean()

        # Everything stays on m_Df's own index, so the assignment aligns row
        # by row even when the frame does not use a default 0..n-1 index.
        m_Df['RSI'] = avg_gain.div(avg_loss + avg_gain) * 100
        return m_Df

    @staticmethod
    def fnStoch(m_Df, n=14):  # prices: closing prices in ascending time order, n: look-back period
        """Add the stochastic oscillator columns Sto_K, Sto_D and Sto_SlowD.

        Args:
            m_Df: Frame with ``final_price``, ``lowest_price`` and
                ``highest_price`` columns.
            n: Look-back window for the lowest low and highest high.

        Returns:
            ``m_Df`` with the three columns added. ``Sto_K`` is 0 for the
            first ``n - 1`` rows, where the window is not yet full.
        """
        lowest = m_Df['lowest_price'].rolling(window=n).min()
        highest = m_Df['highest_price'].rolling(window=n).max()

        sto_k = (m_Df['final_price'] - lowest) / (highest - lowest)
        # Rows before the first full window are set to 0 (positional, so any index works).
        sto_k.iloc[:max(n - 1, 0)] = 0.0
        m_Df['Sto_K'] = sto_k

        m_Df['Sto_D'] = m_Df['Sto_K'].rolling(window=3).mean()
        m_Df['Sto_SlowD'] = m_Df['Sto_D'].rolling(window=3).mean()

        return m_Df

    @staticmethod
    def fnMA(m_Df, m_N=None, m_ColumnName='final_price'):
        """Add simple moving averages and their pairwise differences.

        For every window ``w`` in ``m_N`` a column ``MA<w>`` is added, and for
        every pair ``(a, b)`` with ``a`` listed before ``b`` a column
        ``SignalMA<a>_<b>`` holding ``MA<a> - MA<b>``.

        Args:
            m_Df: Frame to extend.
            m_N: Sequence of window lengths; nothing is added when omitted.
            m_ColumnName: Column the averages are computed from.

        Returns:
            ``m_Df`` with the new columns added.

        Raises:
            ValueError: If ``m_ColumnName`` is not a column of ``m_Df``.
        """
        if m_ColumnName not in m_Df.columns:
            # The original raised a plain string, which Python rejects with a TypeError.
            raise ValueError("You didn't input a Column Name")

        windows = [] if m_N is None else list(m_N)
        averages = []
        for window in windows:
            moving_avg = m_Df[m_ColumnName].rolling(window=window, center=False).mean()
            m_Df['MA' + str(window)] = moving_avg
            averages.append(moving_avg)

        for first, first_avg in enumerate(averages):
            for second in range(first + 1, len(averages)):
                signal_name = 'SignalMA' + str(windows[first]) + '_' + str(windows[second])
                m_Df[signal_name] = first_avg - averages[second]

        return m_Df

    @staticmethod
    def change_prior_to(m_Df):
        """Recompute the day-over-day change and add it as a percentage.

        Overwrites ``compare_to_prior`` with the difference to the previous
        row's ``final_price`` and adds ``percent``, the change relative to the
        previous price, rounded to two decimals.

        Returns:
            ``m_Df`` with both columns set.
        """
        m_Df['compare_to_prior'] = m_Df['final_price'].diff(1)
        # final_price - compare_to_prior is the previous row's price.
        previous_price = m_Df['final_price'] - m_Df['compare_to_prior']
        m_Df['percent'] = (m_Df['final_price'] * 100 / previous_price - 100).round(2)

        return m_Df

# Extras: 일련의 과정을 class화 합니다.

In [35]:
class StockCollector:
    """Collects daily price data for KRX-listed companies from Naver Finance.

    On construction the company list (name and six-digit stock code) is
    downloaded and kept in ``self.company_info``. The remaining methods crawl
    the price tables, save them to disk and add indicator columns.
    """

    # Headers of the crawled daily-price table mapped to the English column names used downstream.
    _COLUMN_MAP = {'날짜': 'date', '종가': 'final_price', '전일비': 'compare_to_prior', '시가': 'start_price',
                   '고가': 'highest_price', '저가': 'lowest_price', '거래량': 'num_of_traded'}
    # Pages 1 .. _MAX_PAGE - 1 are tried at most for one company.
    _MAX_PAGE = 1000

    def __init__(self):
        self.company_info = self._retrieve_company_info()

    def _retrieve_company_info(self):
        """Download the listed-company table and return its name/code columns.

        Returns:
            DataFrame with the columns ``name`` and ``code`` (code as a
            zero-padded six-character string).
        """
        code_df = pd.read_html('http://kind.krx.co.kr/corpgeneral/corpList.do?method=download&searchType=13', header=0)[0]
        # Stock codes are six digits long, so zero-pad the parsed values back to that width.
        code_df['종목코드'] = code_df['종목코드'].map('{:06d}'.format)
        # Only the company name and stock code are needed; drop every other column.
        code_df = code_df[['회사명', '종목코드']]
        # Rename the Korean column headers to English ones.
        company_info = code_df.rename(columns={'회사명': 'name', '종목코드': 'code'})
        return company_info

    def get_stock_data_by_company(self, company_name, by_year=None, recent_n_data=None):
        """Crawl the daily price table of one company.

        Args:
            company_name: Company name exactly as stored in
                ``self.company_info['name']``.
            by_year: Optional iterable of years to keep (e.g. ``[2018, 2019]``).
                Ignored when ``recent_n_data`` is given.
            recent_n_data: Optional two-item sequence ``[yyyymmdd, n]``.
                Collection starts at the first row whose date contains that
                day and keeps at most ``n`` rows from there on, in the order
                the site lists them (assumed newest first).

        Returns:
            DataFrame with the columns ``date``, ``final_price``,
            ``compare_to_prior``, ``start_price``, ``highest_price``,
            ``lowest_price``, ``num_of_traded`` and ``name``. It is empty when
            no matching rows were found.

        Raises:
            ValueError: If ``company_name`` is not in ``self.company_info``.
        """
        started_at = datetime.now()
        print('Crawling Company Name:', company_name)

        # 1. Resolve the stock code and build the base URL to crawl.
        #    A boolean mask is used instead of DataFrame.query so that names
        #    containing quote characters cannot break the lookup.
        info = self.company_info
        codes = info.loc[info['name'] == company_name, 'code']
        if codes.empty:
            raise ValueError('Unknown company name: {}'.format(company_name))
        url = 'http://finance.naver.com/item/sise_day.nhn?code={code}'.format(code=str(codes.iloc[0]))
        print("요청 URL = {}".format(url))

        # 2. (optional) Restrict collection to given years, or to the n rows
        #    starting at a target date.
        by_year = [] if by_year is None else list(by_year)
        recent_n_data = [] if recent_n_data is None else list(recent_n_data)

        target_date = None
        num_seq = None
        if len(recent_n_data) == 2:
            raw_date = str(recent_n_data[0])
            target_date = raw_date[:4] + '.' + raw_date[4:6] + '.' + raw_date[6:8]  # yyyy.mm.dd
            num_seq = int(recent_n_data[1])

        min_year = None
        if by_year and target_date is None:
            min_year = min(int(yr) for yr in by_year)
            print('Collecting years... ', by_year)

        # 3. Fetch the pages.
        pages = self._collect_pages(url, target_date, num_seq, min_year)

        # 4. Light post-processing of the collected rows.
        if pages:
            df = pd.concat(pages, ignore_index=True)
        else:
            # Nothing collected: return an empty frame that still has the expected columns.
            df = pd.DataFrame(columns=list(self._COLUMN_MAP))
        if num_seq is not None:
            df = df.head(num_seq).copy()
        df['name'] = company_name
        df = df.rename(columns=self._COLUMN_MAP)

        # 5. Keep only rows whose year is one of the requested years.
        if min_year is not None:
            wanted_years = {str(int(yr)) for yr in by_year}
            df = df[df['date'].astype(str).str[:4].isin(wanted_years)]

        print('Elapsed time:', datetime.now() - started_at)

        return df

    def _collect_pages(self, url, target_date, num_seq, min_year):
        """Fetch the paginated price table and return the per-page frames to keep.

        Args:
            url: Base URL without the ``page`` parameter.
            target_date: ``yyyy.mm.dd`` string to start from, or None.
            num_seq: Row count to collect in target-date mode, or None.
            min_year: Earliest requested year in year mode, or None.
        """
        pages = []
        collected_rows = 0  # rows gathered so far in target-date mode
        seen_dates = set()  # every date fetched so far, for end-of-data detection

        for page in range(1, self._MAX_PAGE):
            pg_url = '{url}&page={page}'.format(url=url, page=page)
            d = pd.read_html(pg_url, header=0)[0].dropna()

            # 3-1. Stop when a page has no data rows, or when its first row
            #      was already fetched (the original end-of-data check relies
            #      on the site serving an already-seen page once the last one
            #      is passed). Rows are addressed by position because
            #      dropna() leaves gaps in the index labels.
            if d.empty:
                print('Break page:', page)
                break
            page_dates = d['날짜'].tolist()
            if page_dates[0] in seen_dates:
                # Max page reached
                print('Break page:', page)
                break
            seen_dates.update(page_dates)

            # 3-2. Target-date mode: skip pages until the target date shows
            #      up, then keep that row and everything listed after it.
            if target_date is not None:
                if not pages:
                    hit_mask = d['날짜'].astype(str).str.contains(target_date, regex=False)
                    if not hit_mask.any():
                        continue
                    first_hit = int(hit_mask.to_numpy().nonzero()[0][0])
                    d = d.iloc[first_hit:]

                pages.append(d)
                collected_rows += len(d)
                if collected_rows >= num_seq:
                    break
                continue

            pages.append(d)

            # 3-3. Year mode: once the last row of a page predates the
            #      earliest requested year, nothing further is needed. The
            #      page is kept first because its earlier rows may still
            #      belong to that year; the caller filters by year afterwards.
            if min_year is not None and int(str(d['날짜'].iloc[-1])[:4]) < min_year:
                print('Found minimum year, break page:', page)
                break

        return pages

    def start_data_collection(self, save_path, autosave=True, save_by_company=True,
                              start_index=1151, skip_indices=(1176,)):
        """Crawl the companies in ``self.company_info`` and save them as one CSV.

        Args:
            save_path: Directory for ``<yyyymmdd>_stock_data.csv``; created
                when it does not exist.
            autosave: When True, the data collected so far is also written
                after every company whose index is a multiple of 50.
            save_by_company: When True, the CSV is split into one file per
                company afterwards (see ``split_data_by_company``).
            start_index: Companies with a smaller index are skipped. The
                default reproduces the value that used to be hard-coded here.
            skip_indices: Further indices to skip. The default likewise
                reproduces the previously hard-coded value.
        """
        collect_start_time = datetime.now()
        today_date = datetime.today().strftime('%Y%m%d')
        save_full_path = join(save_path, today_date + '_stock_data.csv')
        if not exists(save_path):
            makedirs(save_path)

        skipped = set(skip_indices)
        frames = []
        # Iterate the instance's own company list, not a module-level variable.
        for idx, company_name in enumerate(self.company_info['name']):
            if idx < start_index or idx in skipped:
                continue
            print('[%i]' % idx)
            frames.append(self.get_stock_data_by_company(company_name=company_name))

            if autosave and idx % 50 == 0:
                pd.concat(frames).to_csv(save_full_path)

        all_data = pd.concat(frames) if frames else pd.DataFrame()
        all_data.to_csv(save_full_path)
        print('Crawling data finished. Total elapsed time:', datetime.now() - collect_start_time)
        if save_by_company:
            self.split_data_by_company(input_path=save_full_path, output_path=save_path)

    def split_data_by_company(self, input_path, output_path):
        """Split a combined CSV into one ``.npy`` file per company.

        Args:
            input_path: CSV written by ``start_data_collection``; its first
                column is read as the index.
            output_path: Directory for the per-company files; created when it
                does not exist. Each file is named after the company and
                holds that company's rows without the ``name`` column.
        """
        df = pd.read_csv(input_path, index_col=0)

        if not exists(output_path):
            makedirs(output_path)

        company_names = df['name'].unique()
        for idx, company_name in enumerate(company_names):
            print(idx, company_name)
            rows = df.loc[df['name'] == company_name]
            rows = rows.loc[:, rows.columns != 'name']
            np.save(join(output_path, company_name), rows)

        print('All data saved by company name. Length of company:', len(company_names))

    def add_features(self, input_path=None, output_path=None, df=None):
        """Add the ``Features`` indicator columns company by company.

        Args:
            input_path: CSV to read when ``df`` is not given.
            output_path: Optional CSV path for the result; missing parent
                directories are created.
            df: Frame with at least ``name`` and ``date`` plus the price
                columns the ``Features`` methods read.

        Returns:
            The concatenated per-company frames with a fresh 0..n-1 index.
            Companies whose feature computation raised are reported on
            stdout and left out.

        Raises:
            ValueError: If neither ``df`` nor ``input_path`` is given.
        """
        if df is None:
            if input_path is None:
                raise ValueError('Either df or input_path must be provided')
            df = pd.read_csv(input_path)

        enriched = []
        for idx, name in enumerate(df['name'].unique()):
            print(idx, name)
            try:
                df_temp = df[df['name'] == name].sort_values(by=['date']).reset_index(drop=True)
                df_temp = Features.fnMACD(df_temp)
                df_temp = Features.fnBolingerBand(df_temp)
                df_temp = Features.fnRSI(df_temp)
                df_temp = Features.fnStoch(df_temp)
                df_temp = Features.change_prior_to(df_temp)
                df_temp = Features.fnMA(df_temp, m_N=[5, 20, 60, 120, 240])
            except Exception as e:
                # Deliberate best effort: report the company and carry on with the rest.
                print('Error occurred while adding features at index %i %s' % (idx, name))
                print(e)
                continue
            enriched.append(df_temp)

        df_new = pd.concat(enriched) if enriched else pd.DataFrame()
        df = df_new.reset_index(drop=True)

        if output_path is not None:
            out_dir = dirname(output_path)
            # dirname() is '' for a bare file name; there is nothing to create then.
            if out_dir and not exists(out_dir):
                makedirs(out_dir)
            df.to_csv(output_path)

        return df

In [None]:
# Build the collector (its constructor downloads the company list), then run the
# crawl into /data1/stock_data with periodic autosave and no per-company split.
stock_collector = StockCollector()
stock_collector.start_data_collection('/data1/stock_data', autosave=True, save_by_company=False)

[1151]
Crawling Company Name: 동원F&B
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=049770
Break page: 458
Elapsed time: 0:01:16.048271
[1152]
Crawling Company Name: 두산밥캣
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=241560
Break page: 62
Elapsed time: 0:00:12.028806
[1153]
Crawling Company Name: 두산중공업
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=034020
Break page: 460
Elapsed time: 0:01:11.628296
[1154]
Crawling Company Name: 드림텍
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=192650
Break page: 6
Elapsed time: 0:00:00.651931
[1155]
Crawling Company Name: 디아이동일
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=001530
Break page: 577
Elapsed time: 0:01:41.036808
[1156]
Crawling Company Name: 롯데제과
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=280360
Break page: 39
Elapsed time: 0:00:03.853182
[1157]
Crawling Company Name: 만호제강
요청 URL = http://finance.naver.com/item/sise_day.nhn?code=001080
Break page: 577
Elapsed time: 0:01:20.0