In [1]:
import time
import sqlite3
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import FinanceDataReader as fdr
from datetime import date, timedelta, datetime

In [2]:
end_date_dt = date.today()
start_date_dt = end_date_dt - timedelta(days=60)
end_date = end_date_dt.strftime("%Y-%m-%d")
start_date = start_date_dt.strftime("%Y-%m-%d")

start_date, end_date

('2020-07-07', '2020-09-05')

In [3]:
# query financial statement data
conn = sqlite3.connect('../../data/data_v3.1.db')
cur = conn.cursor()

if end_date_dt.month > 5:
    sql = "select * from fs where year=" + str(int(end_date[:4]) - 1) # 작년도의 재무상태표, 손익계산서 데이터를 가져온다. 
else:
    sql = "select * from fs where year=" + str(int(end_date[:4]) - 2) # 재작년도의 재무상태표, 손익계산서 데이터를 가져온다. 
    
cur.execute(sql)
rows = cur.fetchall()
print("Querried rows:", len(rows))

# insert querried data into dataframe
df_company = pd.DataFrame(columns=['날짜', '기업코드', '종목코드', '회사명', '유동자산', '유동부채', '자산총계'])

companies = []
for row in rows:
    company = {}
    company['날짜'] = end_date
    company['기업코드'] = row[1] # corp code
    company['종목코드'] = row[2] # stock code
    company['회사명'] = row[3] # corp name
    company[row[4]] = row[5] # amount
    company['IFRS'] = row[6] # IFRS
    company['CFS'] = row[7] # CFS
    
    companies.append(company)
df_company = df_company.append(companies)

Querried rows: 47709


In [4]:
aggregation_functions = {'종목코드': 'first', '회사명': 'first', '유동자산': 'sum', '유동부채': 'sum', '자산총계': 'sum', '날짜': 'first'}

In [5]:
# 연결 재무제표 기업
df_cfs = df_company[df_company.CFS == 1]
df_cfs = df_cfs.groupby(['기업코드', 'IFRS', 'CFS']).aggregate(aggregation_functions).reset_index()
df_cfs = df_cfs[df_cfs['자산총계'] > 0]
len(df_cfs)

1755

In [6]:
# 개별 재무제표 기업
df_fs = df_company[df_company.CFS == 0]
df_fs = df_fs.groupby(['기업코드', 'IFRS', 'CFS']).aggregate(aggregation_functions).reset_index()
df_fs = df_fs[df_fs['자산총계'] > 0]
len(df_fs)

2252

In [7]:
# CFS가 있는 기업은 FS에서 삭제
cfs_stock_codes = df_cfs['종목코드'].tolist()
df_fs['CFS_EXISTS'] = df_fs.apply(lambda x: x['종목코드'] in cfs_stock_codes, axis=1)
df_fs = df_fs[df_fs['CFS_EXISTS'] == False]
df_fs = df_fs.drop(columns=['CFS_EXISTS'])

# 합체
df_company = pd.concat([df_cfs, df_fs]).reset_index().drop(columns=['index'])
print("Number of companies:", len(df_company))

Number of companies: 2277


In [8]:
df_company['유동비율'] = df_company['유동자산'] / df_company['유동부채']

In [9]:
df_company

Unnamed: 0,기업코드,IFRS,CFS,종목코드,회사명,유동자산,유동부채,자산총계,날짜,유동비율
0,00100258,1.0,1.0,030270,에스마크,2.208000e+10,1.976000e+10,4.587000e+10,2020-09-05,1.117409
1,00100601,1.0,1.0,114190,강원,3.054000e+10,3.084000e+10,5.660206e+10,2020-09-05,0.990272
2,00100939,1.0,1.0,000860,강남제비스코,1.787900e+11,5.991000e+10,6.722703e+11,2020-09-05,2.984310
3,00101044,1.0,1.0,003060,에이프로젠제약,2.135500e+11,1.081000e+10,3.652830e+11,2020-09-05,19.754857
4,00101220,1.0,1.0,001390,KG케미칼,1.441670e+12,1.587760e+12,4.151542e+12,2020-09-05,0.907990
...,...,...,...,...,...,...,...,...,...,...
2272,01396931,1.0,0.0,337840,유엑스엔,2.000000e+09,6.000000e+08,2.900000e+09,2020-09-05,3.333333
2273,01412725,1.0,0.0,336260,두산퓨얼셀,4.084000e+11,2.253000e+11,4.958000e+11,2020-09-05,1.812694
2274,01413371,1.0,0.0,343090,단디바이오,6.300000e+09,5.000000e+08,6.800000e+09,2020-09-05,12.600000
2275,01418260,1.0,0.0,354230,폭스소프트,5.200000e+09,4.400000e+09,6.100000e+09,2020-09-05,1.181818


In [None]:
def get_price_and_stocks(stock_code):
    url = "https://finance.naver.com/item/main.nhn?code=" + stock_code
    response = requests.get(url)
    
    retry = 0
    while response.status_code != 200:
        print(response.status_code, "wait...")
        time.sleep(10)
        response = requests.get(url)
        
        retry += 1
        if retry > 10:
            print("반복 요청 제한")
            return 0, 0

    html = BeautifulSoup(response.text, 'html.parser')

    #print(html)
    error_desc = html.find("p", {"class": "error_desc"})
    if error_desc:
        print("종목 정보 없음")
        return 0, 0

    # 관리종목 여부 확인
    description = html.find("div", {"class": "description"})
    spans = description.find_all("span")
    for span in spans:
        if span.text == '관리종목':
            print("관리종목")
            return 0, 0

    # 주식 수
    aside = html.find("div", {"id": "aside"})
    table = aside.find("div", {"class": "first"})
    trs = table.find_all("tr")

    num_stocks = trs[2].find("td").text
    num_stocks = int(num_stocks.replace(",",""))

    # 가격
    content = html.find("div", {"id": "content"})
    div_today = content.find("div", {"class": "today"})
    price = div_today.find("span").text
    price = int(price.replace(",",""))

    return price, num_stocks

In [None]:
start_time = time.time()

# 과거 60일 가격 정보로 모멘텀 데이터 채우기
df_company['주가'] = np.NaN
df_company['주식수'] = np.NaN
df_company['시가총액'] = np.NaN
df_company['start_price'] = np.NaN
df_company['end_price'] = np.NaN
df_company['yield'] = np.NaN

for i, row in enumerate(df_company.iterrows()):
    if i < 529:
        continue
    
    company = row[1] # company = (corp code, ifrs, cfs, stock code, corp name, ...)
    stock_code = company[3]
    
    print(company[4], stock_code)
    
    # 시가총액
    start_price_time = time.time()
    price, num_stocks = get_price_and_stocks(stock_code)
    market_cap_time = time.time() - start_price_time
    
    market_cap = price * num_stocks
    
    df_company.at[row[0], '주가'] = price
    df_company.at[row[0], '주식수'] = num_stocks
    df_company.at[row[0], '시가총액'] = market_cap
    
    if price == 0: # 네이버 금융에 가격정보가 없거나 현재 관리종목인 경우
        print(i, "no price NAVER:", company[4], stock_code, market_cap_time)
        continue
    else:
        start_price_time = time.time()
        df_price = fdr.DataReader(stock_code, start_date, end_date).reset_index()
        price_time = time.time() - start_price_time
        
        if len(df_price) == 0:
            print(i, "no price FDR:", company[4])
            continue
            
        # 마지막 가격 날짜 확인
        df_price_end_date = df_price.iloc[len(df_price) - 1].Date
        df_price_end_date = datetime.strptime(str(df_price_end_date)[:10], '%Y-%m-%d').date()

        if (end_date_dt - df_price_end_date).days >= 7: # 최근 일주일 내 거래되지 않았으면 제거
            print(i, "최근 거래 없음:", company[4], df_price_end_date)
            continue

        start_price = df_price.iloc[0].Open
        end_price = df_price.iloc[len(df_price) - 1].Close
        stock_yield = end_price / start_price 
        
        df_company.at[row[0], 'start_price'] = start_price
        df_company.at[row[0], 'end_price'] = end_price
        df_company.at[row[0], 'yield'] = stock_yield
        
        print(i, company[4], stock_code, market_cap, stock_yield, market_cap_time, price_time)
        
        time.sleep(0.5)
        
print(time.time() - start_time)

현대공업 170030
529 현대공업 170030 77160200000 1.143181818181818 0.3274390697479248 0.1484081745147705
현대미포조선 010620
530 현대미포조선 010620 1208250007250 0.9512578616352201 0.2682631015777588 0.2711038589477539
HDC 012630
531 HDC 012630 657158931000 1.2672811059907834 0.2812778949737549 1.5264320373535156
HMM 011200
532 HMM 011200 2077979414280 1.2796780684104627 0.3194010257720947 0.2713460922241211
현대엘리베이터 017800
533 현대엘리베이터 017800 1564720840800 0.8015717092337917 0.29540109634399414 0.5192379951477051
현대자동차 005380
534 현대자동차 005380 36857762257500 1.674757281553398 0.4503161907196045 0.292905330657959
SK하이닉스 000660
535 SK하이닉스 000660 57293786125500 0.9014891179839634 0.35817718505859375 0.8869051933288574
현대모비스 012330
536 현대모비스 012330 21862579620000 1.119221411192214 0.3380570411682129 0.3266260623931885
현대종합상사 011760
537 현대종합상사 011760 183882627400 0.9754385964912281 0.29023075103759766 0.28148412704467773
한국조선해양 009540
538 한국조선해양 009540 6093565287600 0.9695945945945946 0.2709782123565674 0.421808

In [None]:
len(df_company)

In [None]:
df_company.to_csv("predict.csv", index=False)