In [32]:
import time
import sqlite3
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import FinanceDataReader as fdr
from datetime import date, timedelta, datetime

In [33]:
# Analysis window: the 60 calendar days ending today, kept both as date
# objects (*_dt) and as "YYYY-MM-DD" strings.
end_date_dt = date.today()
start_date_dt = end_date_dt - timedelta(days=60)
start_date, end_date = (
    day.strftime("%Y-%m-%d") for day in (start_date_dt, end_date_dt)
)

start_date, end_date

('2020-07-08', '2020-09-06')

In [34]:
# Query financial statement data.
# NOTE(review): the connection is left open, as in the original cell, in case
# later cells reuse `conn`/`cur` -- close it once it is no longer needed.
conn = sqlite3.connect('../../data/data_v3.1.db')
cur = conn.cursor()

# From June onwards use last year's statements (balance sheet / income
# statement); earlier in the year fall back to the year before last.
# (presumably because the previous year's filings are not yet available
# before then -- confirm)
fiscal_year = int(end_date[:4]) - (1 if end_date_dt.month > 5 else 2)

# Bind the year as a query parameter instead of splicing it into the SQL text.
sql = "select * from fs where year=?"
cur.execute(sql, (fiscal_year,))
rows = cur.fetchall()
print("Querried rows:", len(rows))

# Turn the queried rows into one record per (company, account) pair.
# Each row contributes a single account column: row[4] is used as the column
# name and row[5] as its amount.
base_columns = ['날짜', '기업코드', '종목코드', '회사명', '유동자산', '유동부채', '자산총계']

companies = []
for row in rows:
    companies.append({
        '날짜': end_date,
        '기업코드': row[1],   # corp code
        '종목코드': row[2],   # stock code
        '회사명': row[3],     # corp name
        row[4]: row[5],       # account name -> amount
        'IFRS': row[6],       # IFRS
        'CFS': row[7],        # CFS
    })

# DataFrame.append was removed in pandas 2.0, so build the frame in one go.
# The reindex keeps the base columns first (and present even when a given
# account never appears), followed by any other columns in order of appearance.
df_company = pd.DataFrame(companies)
extra_columns = [col for col in df_company.columns if col not in base_columns]
df_company = df_company.reindex(columns=base_columns + extra_columns)

Querried rows: 47709


In [35]:
# How each column is collapsed when the per-account rows of one company are
# grouped: identifying fields take the first value, amounts are summed.
# Key order is kept as-is because it determines the output column order.
aggregation_functions = {
    '종목코드': 'first',
    '회사명': 'first',
    '유동자산': 'sum',
    '유동부채': 'sum',
    '자산총계': 'sum',
    '날짜': 'first',
}

In [36]:
# Companies with consolidated financial statements (CFS == 1):
# collapse the per-account rows into one row per company, then keep only
# companies whose summed total assets are positive.
df_cfs = (
    df_company.loc[df_company['CFS'] == 1]
    .groupby(['기업코드', 'IFRS', 'CFS'])
    .agg(aggregation_functions)
    .reset_index()
    .loc[lambda frame: frame['자산총계'] > 0]
)
len(df_cfs)

1755

In [37]:
# Companies with separate (non-consolidated) financial statements (CFS == 0):
# collapse the per-account rows into one row per company, then keep only
# companies whose summed total assets are positive.
df_fs = (
    df_company.loc[df_company['CFS'] == 0]
    .groupby(['기업코드', 'IFRS', 'CFS'])
    .agg(aggregation_functions)
    .reset_index()
    .loc[lambda frame: frame['자산총계'] > 0]
)
len(df_fs)

2252

In [38]:
# Drop separate-statement (FS) rows for companies that already have
# consolidated (CFS) figures. A vectorized isin() replaces the row-wise
# apply + temporary column, which was O(n*m), broke on an empty frame and
# wrote into a filtered slice (SettingWithCopyWarning).
cfs_stock_codes = df_cfs['종목코드'].tolist()
df_fs = df_fs[~df_fs['종목코드'].isin(cfs_stock_codes)]

# Merge both sets into one frame with a fresh 0..n-1 index.
df_company = pd.concat([df_cfs, df_fs]).reset_index(drop=True)
print("Number of companies:", len(df_company))

Number of companies: 2277


In [39]:
# Current ratio = current assets / current liabilities
df_company['유동비율'] = df_company['유동자산'].div(df_company['유동부채'])

In [40]:
# Display the merged company table, now including the 유동비율 column.
df_company

Unnamed: 0,기업코드,IFRS,CFS,종목코드,회사명,유동자산,유동부채,자산총계,날짜,유동비율
0,00100258,1.0,1.0,030270,에스마크,2.208000e+10,1.976000e+10,4.587000e+10,2020-09-06,1.117409
1,00100601,1.0,1.0,114190,강원,3.054000e+10,3.084000e+10,5.660206e+10,2020-09-06,0.990272
2,00100939,1.0,1.0,000860,강남제비스코,1.787900e+11,5.991000e+10,6.722703e+11,2020-09-06,2.984310
3,00101044,1.0,1.0,003060,에이프로젠제약,2.135500e+11,1.081000e+10,3.652830e+11,2020-09-06,19.754857
4,00101220,1.0,1.0,001390,KG케미칼,1.441670e+12,1.587760e+12,4.151542e+12,2020-09-06,0.907990
...,...,...,...,...,...,...,...,...,...,...
2272,01396931,1.0,0.0,337840,유엑스엔,2.000000e+09,6.000000e+08,2.900000e+09,2020-09-06,3.333333
2273,01412725,1.0,0.0,336260,두산퓨얼셀,4.084000e+11,2.253000e+11,4.958000e+11,2020-09-06,1.812694
2274,01413371,1.0,0.0,343090,단디바이오,6.300000e+09,5.000000e+08,6.800000e+09,2020-09-06,12.600000
2275,01418260,1.0,0.0,354230,폭스소프트,5.200000e+09,4.400000e+09,6.100000e+09,2020-09-06,1.181818


In [41]:
def get_price_and_stocks(stock_code):
    """Scrape the current price and listed-share count of a stock from Naver Finance.

    Parameters
    ----------
    stock_code : str
        Stock code; appended to the Naver Finance item-page URL.

    Returns
    -------
    tuple
        ``(price, num_stocks)`` as ints when the page could be parsed.
        ``num_stocks`` stays 0 if no '상장주식수' row is found in the table.
        ``(np.nan, np.nan)`` when the page cannot be fetched within the retry
        budget, the page is Naver's error page, the stock is not listed on
        코스닥/코스피, it is flagged as '관리종목', or an expected page element
        is missing.
    """
    url = "https://finance.naver.com/item/main.nhn?code=" + stock_code
    max_retries = 12
    missing = (np.nan, np.nan)  # np.NaN was removed in NumPy 2.0

    # One initial request plus up to `max_retries` retries, 10 s apart.
    # Network errors and timeouts count as failed attempts instead of
    # aborting the whole crawl.
    response = None
    for attempt in range(max_retries + 1):
        if attempt > 0:
            time.sleep(10)
        try:
            response = requests.get(url, timeout=30)
        except requests.RequestException as exc:
            print(type(exc).__name__, "대기...")
            continue
        if response.status_code == 200:
            break
        print(response.status_code, "대기...")
    else:
        # Every attempt failed.
        print("반복 요청 제한")
        return missing

    html = BeautifulSoup(response.text, 'html.parser')

    # Naver serves an error page for codes it does not know.
    if html.find("p", {"class": "error_desc"}) is not None:
        print("종목 정보 없음")
        return missing

    # Market check: only 코스닥 / 코스피 listings are accepted.
    description = html.find("div", {"class": "description"})
    market_imgs = description.find_all("img") if description is not None else []
    if not market_imgs:
        print("페이지 파싱 실패")
        return missing
    market_name = market_imgs[0].get('alt')
    if market_name not in ['코스닥', '코스피']:
        print("코스닥, 코스피 종목 아님:", market_name)
        return missing

    # Skip stocks flagged as '관리종목'.
    if any(span.text == '관리종목' for span in description.find_all("span")):
        print("관리종목")
        return missing

    # Number of listed shares, read from the first table in the side panel.
    aside = html.find("div", {"id": "aside"})
    table = aside.find("div", {"class": "first"}) if aside is not None else None
    if table is None:
        print("페이지 파싱 실패")
        return missing

    num_stocks = 0
    for tr in table.find_all("tr"):
        th, td = tr.find("th"), tr.find("td")
        if th is not None and td is not None and th.text == '상장주식수':
            num_stocks = int(td.text.replace(",", ""))

    # Current price: first <span> inside the "today" box.
    content = html.find("div", {"id": "content"})
    div_today = content.find("div", {"class": "today"}) if content is not None else None
    price_span = div_today.find("span") if div_today is not None else None
    if price_span is None:
        print("페이지 파싱 실패")
        return missing
    price = int(price_span.text.replace(",", ""))

    return price, num_stocks

In [30]:
#price, num_stocks = get_price_and_stocks("088260")

In [43]:
# Prepare the columns that the crawl loop fills in with momentum data
# (price information for the past 60 days). They start out as NaN so rows
# that are skipped stay empty.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
for new_column in ['주가', '주식수', '시가총액', 'start_price', 'end_price', 'yield']:
    df_company[new_column] = np.nan

In [45]:
# For every company: scrape price / share count from Naver Finance, then
# compute the return over the [start_date, end_date] window from
# FinanceDataReader prices. Results are written back into df_company.
start_time = time.time()
for i, (row_label, company) in enumerate(df_company.iterrows()):
    # Use column labels rather than positions so the lookup does not depend
    # on column order (positional Series[int] access is deprecated in pandas).
    stock_code = company['종목코드']

    print("--------", i, company['회사명'], stock_code)

    # Market cap = current price * listed shares
    start_price_time = time.time()
    price, num_stocks = get_price_and_stocks(stock_code)
    market_cap_time = time.time() - start_price_time

    market_cap = price * num_stocks

    df_company.at[row_label, '주가'] = price
    df_company.at[row_label, '주식수'] = num_stocks
    df_company.at[row_label, '시가총액'] = market_cap

    # No price on Naver Finance, or the stock is currently a '관리종목'.
    # NaN never compares equal to itself, so `price == np.NaN` was always
    # False and this skip never happened; pd.isna() is the correct test.
    if pd.isna(price):
        continue

    start_price_time = time.time()
    df_price = fdr.DataReader(stock_code, start_date, end_date).reset_index()
    price_time = time.time() - start_price_time

    if len(df_price) == 0:
        print("no price FDR")
        continue

    # Date of the most recent price row
    df_price_end_date = df_price.iloc[-1].Date
    df_price_end_date = datetime.strptime(str(df_price_end_date)[:10], '%Y-%m-%d').date()

    # Skip stocks that have not traded within the last week.
    if (end_date_dt - df_price_end_date).days >= 7:
        print("최근 거래 없음 / 마지막 거래일:", df_price_end_date)
        continue

    # Return over the window: last close / first open.
    # A first open of 0 yields inf here.
    start_price = df_price.iloc[0].Open
    end_price = df_price.iloc[-1].Close
    stock_yield = end_price / start_price

    df_company.at[row_label, 'start_price'] = start_price
    df_company.at[row_label, 'end_price'] = end_price
    df_company.at[row_label, 'yield'] = stock_yield

    print("시총:", market_cap, "수익률:", stock_yield, "쿼리 시간:", market_cap_time, price_time)

    # Short pause between companies.
    time.sleep(0.5)

print(time.time() - start_time)

-------- 0 에스마크 030270
종목 정보 없음
-------- 1 강원 114190
관리종목
-------- 2 강남제비스코 000860
시총: 100425000000 수익률: 1.0842105263157895 쿼리 시간: 0.3070693016052246 0.5772249698638916
-------- 3 에이프로젠제약 003060
시총: 615799558470 수익률: 0.8704225352112676 쿼리 시간: 0.35114407539367676 0.6210949420928955
-------- 4 KG케미칼 001390
시총: 296701689600 수익률: 1.7014925373134329 쿼리 시간: 0.3049018383026123 0.34419727325439453
-------- 5 경남에너지 008020
종목 정보 없음
-------- 6 경농 002100
시총: 331883775000 수익률: 1.2142857142857142 쿼리 시간: 0.2806510925292969 0.5640890598297119
-------- 7 경동나비엔 009450
시총: 675194030000 수익률: 1.1336898395721926 쿼리 시간: 0.27596592903137207 0.5185158252716064
-------- 8 경동제약 011040
시총: 380992500000 수익률: 1.580396475770925 쿼리 시간: 0.2780270576477051 0.27577805519104004
-------- 9 경방 000050
시총: 298826443000 수익률: 0.9045643153526971 쿼리 시간: 0.25777411460876465 0.3830399513244629
-------- 10 케이씨피드 025880
시총: 32261605940 수익률: 1.0265957446808511 쿼리 시간: 0.33641815185546875 0.32203221321105957
-------- 11 경인양행 012610
시총:



시총: 38784885480 수익률: inf 쿼리 시간: 0.32590675354003906 0.33045506477355957
-------- 90 대양제지 006580
시총: 83235000000 수익률: 1.1567164179104477 쿼리 시간: 0.26797986030578613 0.6776020526885986
-------- 91 대우조선해양 042660
시총: 2406769132400 수익률: 0.9373695198329853 쿼리 시간: 0.8508248329162598 0.47080492973327637
-------- 92 대웅 003090
시총: 2191952646000 수익률: 1.6427015250544663 쿼리 시간: 0.28888797760009766 0.5267469882965088
-------- 93 대원 007680
시총: 102140570400 수익률: 0.978865406006674 쿼리 시간: 0.27939391136169434 0.11000609397888184
-------- 94 대원강업 000430
시총: 197780000000 수익률: 1.0777027027027026 쿼리 시간: 0.28836512565612793 0.42466115951538086
-------- 95 미래SCI 028040
관리종목
-------- 96 대원산업 005710
시총: 101991384000 수익률: 0.9980392156862745 쿼리 시간: 0.30490899085998535 0.5079159736633301
-------- 97 대원제약 003220
시총: 472688130050 수익률: 1.3342857142857143 쿼리 시간: 0.37839293479919434 0.36126208305358887
-------- 98 대원화성 024890
시총: 71361032960 수익률: 1.039039039039039 쿼리 시간: 0.33055806159973145 0.47948288917541504
-------- 9

In [46]:
# Number of rows in df_company.
len(df_company)

2277

In [48]:
# Display df_company with the price, market-cap and yield columns.
df_company

Unnamed: 0,기업코드,IFRS,CFS,종목코드,회사명,유동자산,유동부채,자산총계,날짜,유동비율,주가,주식수,시가총액,start_price,end_price,yield
0,00100258,1.0,1.0,030270,에스마크,2.208000e+10,1.976000e+10,4.587000e+10,2020-09-06,1.117409,0.0,0.0,0.000000e+00,,,
1,00100601,1.0,1.0,114190,강원,3.054000e+10,3.084000e+10,5.660206e+10,2020-09-06,0.990272,0.0,0.0,0.000000e+00,,,
2,00100939,1.0,1.0,000860,강남제비스코,1.787900e+11,5.991000e+10,6.722703e+11,2020-09-06,2.984310,15450.0,6500000.0,1.004250e+11,14250.0,15450.0,1.084211
3,00101044,1.0,1.0,003060,에이프로젠제약,2.135500e+11,1.081000e+10,3.652830e+11,2020-09-06,19.754857,1545.0,398575766.0,6.157996e+11,1775.0,1545.0,0.870423
4,00101220,1.0,1.0,001390,KG케미칼,1.441670e+12,1.587760e+12,4.151542e+12,2020-09-06,0.907990,22800.0,13013232.0,2.967017e+11,13400.0,22800.0,1.701493
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2272,01396931,1.0,0.0,337840,유엑스엔,2.000000e+09,6.000000e+08,2.900000e+09,2020-09-06,3.333333,0.0,0.0,0.000000e+00,,,
2273,01412725,1.0,0.0,336260,두산퓨얼셀,4.084000e+11,2.253000e+11,4.958000e+11,2020-09-06,1.812694,45650.0,55493726.0,2.533289e+12,29750.0,45650.0,1.534454
2274,01413371,1.0,0.0,343090,단디바이오,6.300000e+09,5.000000e+08,6.800000e+09,2020-09-06,12.600000,0.0,0.0,0.000000e+00,,,
2275,01418260,1.0,0.0,354230,폭스소프트,5.200000e+09,4.400000e+09,6.100000e+09,2020-09-06,1.181818,0.0,0.0,0.000000e+00,,,


In [51]:
# Keep only the companies for which a price (주가) is present.
df = df_company.loc[df_company['주가'].notna()]

In [52]:
# Row count after keeping only rows with a non-null 주가.
len(df)

1956

In [57]:
# Keep only rows with a positive start price; rows whose start_price is
# NaN or 0 are dropped.
df = df.loc[df['start_price'].gt(0)]

In [58]:
# Row count after keeping only rows with start_price > 0.
len(df)

1950

In [59]:
# Keep the bottom 20% of companies by market cap.
MAX_MARKET_CAP = 0.2

keep_count = int(len(df) * MAX_MARKET_CAP)
df = df.sort_values(by='시가총액').iloc[:keep_count]

In [60]:
# Row count after keeping the lowest-market-cap fraction (MAX_MARKET_CAP).
len(df)

390

In [62]:
# Current-ratio filter: keep companies whose 유동비율 exceeds the threshold.
LIQUID_RATE = 1.5
df = df.loc[df['유동비율'].gt(LIQUID_RATE)]

In [63]:
# Row count after the 유동비율 > LIQUID_RATE filter.
len(df)

205

In [64]:
# Order by return over the window, lowest yield first.
df = df.sort_values('yield')

In [67]:
# Save the screened list under log/, named after the run's end date.
csv_path = "log/" + end_date + ".csv"
df.to_csv(csv_path, index=False)

In [66]:
# Show the first 45 rows of df.
df[:45]

Unnamed: 0,기업코드,IFRS,CFS,종목코드,회사명,유동자산,유동부채,자산총계,날짜,유동비율,주가,주식수,시가총액,start_price,end_price,yield
21,103130,1.0,1.0,9810,엔케이물산,26640000000.0,820000000.0,42708080000.0,2020-09-06,32.487805,370.0,83578428.0,30924020000.0,573.0,370.0,0.645724
1904,189538,1.0,0.0,32580,피델릭스,60700000000.0,30700000000.0,68700000000.0,2020-09-06,1.977199,1275.0,30113422.0,38394610000.0,1935.0,1275.0,0.658915
1161,493431,1.0,1.0,221610,자안,24690000000.0,8640000000.0,34679010000.0,2020-09-06,2.857639,335.0,141278275.0,47328220000.0,447.0,335.0,0.749441
1692,1182240,1.0,1.0,267790,배럴,38450000000.0,4890000000.0,58348030000.0,2020-09-06,7.862986,7680.0,7885500.0,60560640000.0,10000.0,7680.0,0.768
695,232821,1.0,1.0,36170,라이브파이낸셜,54290000000.0,10700000000.0,102245100000.0,2020-09-06,5.073832,1220.0,45121437.0,55048150000.0,1540.0,1220.0,0.792208
794,269241,1.0,1.0,44380,주연테크,39830000000.0,11390000000.0,61757550000.0,2020-09-06,3.496927,1115.0,54612089.0,60892480000.0,1395.0,1115.0,0.799283
1539,977641,1.0,1.0,149980,하이로닉,29300000000.0,4210000000.0,46866860000.0,2020-09-06,6.95962,3810.0,14099995.0,53720980000.0,4640.0,3810.0,0.821121
2029,526678,1.0,0.0,99410,동방선기,14900000000.0,3400000000.0,34100000000.0,2020-09-06,4.382353,3135.0,13541002.0,42451040000.0,3790.0,3135.0,0.827177
775,264671,1.0,1.0,39310,세중,91250000000.0,28390000000.0,124802700000.0,2020-09-06,3.21416,2710.0,18121667.0,49109720000.0,3195.0,2710.0,0.8482
1405,763358,1.0,1.0,136510,쎄미시스코,18740000000.0,11470000000.0,37911980000.0,2020-09-06,1.633827,7820.0,5637679.0,44086650000.0,9140.0,7820.0,0.85558
