In [1]:
import time
import sqlite3
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import FinanceDataReader as fdr
from datetime import date, timedelta, datetime

In [2]:
end_date_dt = date.today()
start_date_dt = end_date_dt - timedelta(days=60)
end_date = end_date_dt.strftime("%Y-%m-%d")
start_date = start_date_dt.strftime("%Y-%m-%d")

start_date, end_date

('2020-07-09', '2020-09-07')

In [3]:
# query financial statement data
conn = sqlite3.connect('../../data/data_v3.1.db')
cur = conn.cursor()

if end_date_dt.month > 5:
    sql = "select * from fs where year=" + str(int(end_date[:4]) - 1) # 작년도의 재무상태표, 손익계산서 데이터를 가져온다. 
else:
    sql = "select * from fs where year=" + str(int(end_date[:4]) - 2) # 재작년도의 재무상태표, 손익계산서 데이터를 가져온다. 
    
cur.execute(sql)
rows = cur.fetchall()
print("Querried rows:", len(rows))

# insert querried data into dataframe
df_company = pd.DataFrame(columns=['날짜', '기업코드', '종목코드', '회사명', '유동자산', '유동부채', '자산총계'])

companies = []
for row in rows:
    company = {}
    company['날짜'] = end_date
    company['기업코드'] = row[1] # corp code
    company['종목코드'] = row[2] # stock code
    company['회사명'] = row[3] # corp name
    company[row[4]] = row[5] # amount
    company['IFRS'] = row[6] # IFRS
    company['CFS'] = row[7] # CFS
    
    companies.append(company)
df_company = df_company.append(companies)

Querried rows: 47709


In [4]:
aggregation_functions = {'종목코드': 'first', '회사명': 'first', '유동자산': 'sum', '유동부채': 'sum', '자산총계': 'sum', '날짜': 'first'}

In [5]:
# 연결 재무제표 기업
df_cfs = df_company[df_company.CFS == 1]
df_cfs = df_cfs.groupby(['기업코드', 'IFRS', 'CFS']).aggregate(aggregation_functions).reset_index()
df_cfs = df_cfs[df_cfs['자산총계'] > 0]
len(df_cfs)

1755

In [6]:
# 개별 재무제표 기업
df_fs = df_company[df_company.CFS == 0]
df_fs = df_fs.groupby(['기업코드', 'IFRS', 'CFS']).aggregate(aggregation_functions).reset_index()
df_fs = df_fs[df_fs['자산총계'] > 0]
len(df_fs)

2252

In [7]:
# CFS가 있는 기업은 FS에서 삭제
cfs_stock_codes = df_cfs['종목코드'].tolist()
df_fs['CFS_EXISTS'] = df_fs.apply(lambda x: x['종목코드'] in cfs_stock_codes, axis=1)
df_fs = df_fs[df_fs['CFS_EXISTS'] == False]
df_fs = df_fs.drop(columns=['CFS_EXISTS'])

# 합체
df_company = pd.concat([df_cfs, df_fs]).reset_index().drop(columns=['index'])
print("Number of companies:", len(df_company))

Number of companies: 2277


In [8]:
df_company['유동비율'] = df_company['유동자산'] / df_company['유동부채']

In [9]:
df_company

Unnamed: 0,기업코드,IFRS,CFS,종목코드,회사명,유동자산,유동부채,자산총계,날짜,유동비율
0,00100258,1.0,1.0,030270,에스마크,2.208000e+10,1.976000e+10,4.587000e+10,2020-09-07,1.117409
1,00100601,1.0,1.0,114190,강원,3.054000e+10,3.084000e+10,5.660206e+10,2020-09-07,0.990272
2,00100939,1.0,1.0,000860,강남제비스코,1.787900e+11,5.991000e+10,6.722703e+11,2020-09-07,2.984310
3,00101044,1.0,1.0,003060,에이프로젠제약,2.135500e+11,1.081000e+10,3.652830e+11,2020-09-07,19.754857
4,00101220,1.0,1.0,001390,KG케미칼,1.441670e+12,1.587760e+12,4.151542e+12,2020-09-07,0.907990
...,...,...,...,...,...,...,...,...,...,...
2272,01396931,1.0,0.0,337840,유엑스엔,2.000000e+09,6.000000e+08,2.900000e+09,2020-09-07,3.333333
2273,01412725,1.0,0.0,336260,두산퓨얼셀,4.084000e+11,2.253000e+11,4.958000e+11,2020-09-07,1.812694
2274,01413371,1.0,0.0,343090,단디바이오,6.300000e+09,5.000000e+08,6.800000e+09,2020-09-07,12.600000
2275,01418260,1.0,0.0,354230,폭스소프트,5.200000e+09,4.400000e+09,6.100000e+09,2020-09-07,1.181818


In [10]:
def get_price_and_stocks(stock_code):
    url = "https://finance.naver.com/item/main.nhn?code=" + stock_code
    response = requests.get(url)
    
    retry = 0
    while response.status_code != 200:
        print(response.status_code, "대기...")
        time.sleep(10)
        response = requests.get(url)
        
        retry += 1
        if retry > 12:
            print("반복 요청 제한")
            return np.NaN, np.NaN

    html = BeautifulSoup(response.text, 'html.parser')
    
    # 종목 페이지가 네이버 금융에 있는지 확인
    error_desc = html.find("p", {"class": "error_desc"})
    if error_desc:
        print("종목 정보 없음")
        return np.NaN, np.NaN

    # 코스피, 코스닥, 코넥스 확인
    description = html.find("div", {"class": "description"})
    market_img = description.find_all("img")[0]
    if market_img['alt'] not in ['코스닥', '코스피']:
        print("코스닥, 코스피 종목 아님:", market_img['alt'])
        return np.NaN, np.NaN
    
    # 관리종목 여부 확인
    spans = description.find_all("span")
    for span in spans:
        if span.text == '관리종목':
            print("관리종목")
            return np.NaN, np.NaN

    # 주식 수
    aside = html.find("div", {"id": "aside"})
    table = aside.find("div", {"class": "first"})
    trs = table.find_all("tr")
    
    num_stocks = 0
    for tr in trs:
        if tr.find("th").text == '상장주식수':
            num_stocks = tr.find("td").text
            num_stocks = int(num_stocks.replace(",",""))

    # 가격
    content = html.find("div", {"id": "content"})
    div_today = content.find("div", {"class": "today"})
    price = div_today.find("span").text
    price = int(price.replace(",",""))

    return price, num_stocks

In [11]:
#price, num_stocks = get_price_and_stocks("088260")

In [12]:
# 과거 60일 가격 정보로 모멘텀 데이터 채우기
df_company['주가'] = np.NaN
df_company['주식수'] = np.NaN
df_company['시가총액'] = np.NaN
df_company['start_price'] = np.NaN
df_company['end_price'] = np.NaN
df_company['yield'] = np.NaN

In [28]:
start_time = time.time()
for i, row in enumerate(df_company.iterrows()):
    if i < 89:
        continue
    
    company = row[1] # company = (corp code, ifrs, cfs, stock code, corp name, ...)
    stock_code = company[3]
    
    print("--------", i, company[4], stock_code)
    
    # 시가총액
    start_price_time = time.time()
    price, num_stocks = get_price_and_stocks(stock_code)
    market_cap_time = time.time() - start_price_time
    
    market_cap = price * num_stocks
    
    df_company.at[row[0], '주가'] = price
    df_company.at[row[0], '주식수'] = num_stocks
    df_company.at[row[0], '시가총액'] = market_cap
    
    if np.isnan(price): # 네이버 금융에 가격정보가 없거나 현재 관리종목인 경우
        continue
    else:
        start_price_time = time.time()
        df_price = fdr.DataReader(stock_code, start_date, end_date).reset_index()
        price_time = time.time() - start_price_time
        
        if len(df_price) == 0:
            print("no price FDR")
            continue
            
        # 마지막으로 거래된 날짜 확인
        # volume이 0보다 큰 날짜들 필터링
        df_price = df_price[df_price['Volume'] > 0]
        if len(df_price) == 0:
            print("거래 정지 중")
            continue
        
        df_price_end_date = df_price.iloc[len(df_price) - 1].Date
        df_price_end_date = datetime.strptime(str(df_price_end_date)[:10], '%Y-%m-%d').date()

        if (end_date_dt - df_price_end_date).days >= 7: # 최근 일주일 내 거래되지 않았으면 제거
            print("거래 정지 중 / 마지막 거래일:", df_price_end_date)
            continue

        start_price = df_price.iloc[0].Open
        end_price = df_price.iloc[len(df_price) - 1].Close
        stock_yield = end_price / start_price 
        
        df_company.at[row[0], 'start_price'] = start_price
        df_company.at[row[0], 'end_price'] = end_price
        df_company.at[row[0], 'yield'] = stock_yield
        
        print("시총:", market_cap, "수익률:", stock_yield, "쿼리 시간:", market_cap_time, price_time)
        
        time.sleep(0.5)
        
print(time.time() - start_time)

-------- 89 KD 044180
500 대기...
500 대기...
거래 정지 중
-------- 90 대양제지 006580
시총: 84711750000 수익률: 1.1928166351606806 쿼리 시간: 0.302001953125 0.3841080665588379
-------- 91 대우조선해양 042660
시총: 2508614596800 수익률: 0.9915254237288136 쿼리 시간: 0.2996969223022461 0.2911701202392578
-------- 92 대웅 003090
시총: 2206488141000 수익률: 1.6644736842105263 쿼리 시간: 0.23290324211120605 0.266021728515625
-------- 93 대원 007680
시총: 103533396360 수익률: 0.9900110987791343 쿼리 시간: 0.27022314071655273 0.09795498847961426
-------- 94 대원강업 000430
시총: 203360000000 수익률: 1.087893864013267 쿼리 시간: 0.36826014518737793 0.3436319828033447
-------- 95 미래SCI 028040
관리종목
-------- 96 대원산업 005710
시총: 102592512000 수익률: 0.9961089494163424 쿼리 시간: 0.23495006561279297 0.421314001083374
-------- 97 대원제약 003220
시총: 471675949900 수익률: 1.3238636363636365 쿼리 시간: 0.27916693687438965 0.41163206100463867
-------- 98 대원화성 024890
시총: 71361032960 수익률: 0.9885714285714285 쿼리 시간: 0.26676511764526367 0.29749512672424316
-------- 99 디아이씨 092200
시총: 67082781525 

In [25]:
df_company

Unnamed: 0,기업코드,IFRS,CFS,종목코드,회사명,유동자산,유동부채,자산총계,날짜,유동비율,주가,주식수,시가총액,start_price,end_price,yield
0,00100258,1.0,1.0,030270,에스마크,2.208000e+10,1.976000e+10,4.587000e+10,2020-09-07,1.117409,,,,,,
1,00100601,1.0,1.0,114190,강원,3.054000e+10,3.084000e+10,5.660206e+10,2020-09-07,0.990272,,,,,,
2,00100939,1.0,1.0,000860,강남제비스코,1.787900e+11,5.991000e+10,6.722703e+11,2020-09-07,2.984310,16100.0,6500000.0,1.046500e+11,14400.0,16100.0,1.118056
3,00101044,1.0,1.0,003060,에이프로젠제약,2.135500e+11,1.081000e+10,3.652830e+11,2020-09-07,19.754857,1540.0,398575766.0,6.138067e+11,1765.0,1540.0,0.872521
4,00101220,1.0,1.0,001390,KG케미칼,1.441670e+12,1.587760e+12,4.151542e+12,2020-09-07,0.907990,23100.0,13013232.0,3.006057e+11,13250.0,23100.0,1.743396
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2272,01396931,1.0,0.0,337840,유엑스엔,2.000000e+09,6.000000e+08,2.900000e+09,2020-09-07,3.333333,,,,,,
2273,01412725,1.0,0.0,336260,두산퓨얼셀,4.084000e+11,2.253000e+11,4.958000e+11,2020-09-07,1.812694,,,,,,
2274,01413371,1.0,0.0,343090,단디바이오,6.300000e+09,5.000000e+08,6.800000e+09,2020-09-07,12.600000,,,,,,
2275,01418260,1.0,0.0,354230,폭스소프트,5.200000e+09,4.400000e+09,6.100000e+09,2020-09-07,1.181818,,,,,,


In [29]:
len(df_company)

2277

In [30]:
df = df_company[df_company['주가'].notnull()]

In [31]:
len(df)

1956

In [33]:
df = df[df['start_price'] > 0]

In [34]:
len(df)

1948

In [35]:
# 시총 하위 20%
MAX_MARKET_CAP = 0.2

df = df.sort_values(by=['시가총액'])
df = df[:int(len(df)*MAX_MARKET_CAP)]

In [36]:
len(df)

389

In [37]:
# 유동비율 필터링
LIQUID_RATE = 1.5
df = df[df['유동비율'] > LIQUID_RATE]

In [38]:
len(df)

206

In [39]:
# 수익률을 기준으로 sorting
df = df.sort_values(by=['yield'], ascending=True)

In [40]:
df.to_csv("log/" + end_date + ".csv", index=False)

In [41]:
df[:45]

Unnamed: 0,기업코드,IFRS,CFS,종목코드,회사명,유동자산,유동부채,자산총계,날짜,유동비율,주가,주식수,시가총액,start_price,end_price,yield
21,103130,1.0,1.0,9810,엔케이물산,26640000000.0,820000000.0,42708080000.0,2020-09-07,32.487805,362.0,83578428.0,30255390000.0,569.0,362.0,0.636204
1904,189538,1.0,0.0,32580,피델릭스,60700000000.0,30700000000.0,68700000000.0,2020-09-07,1.977199,1360.0,30113422.0,40954250000.0,1910.0,1360.0,0.712042
1161,493431,1.0,1.0,221610,자안,24690000000.0,8640000000.0,34679010000.0,2020-09-07,2.857639,337.0,141278275.0,47610780000.0,439.0,337.0,0.767654
1692,1182240,1.0,1.0,267790,배럴,38450000000.0,4890000000.0,58348030000.0,2020-09-07,7.862986,7780.0,7885500.0,61349190000.0,10000.0,7780.0,0.778
1539,977641,1.0,1.0,149980,하이로닉,29300000000.0,4210000000.0,46866860000.0,2020-09-07,6.95962,3850.0,14099995.0,54284980000.0,4700.0,3850.0,0.819149
2029,526678,1.0,0.0,99410,동방선기,14900000000.0,3400000000.0,34100000000.0,2020-09-07,4.382353,3120.0,13541002.0,42247930000.0,3730.0,3120.0,0.836461
1031,405719,1.0,1.0,52460,아이크래프트,45470000000.0,20650000000.0,53449840000.0,2020-09-07,2.201937,4015.0,14607936.0,58650860000.0,4785.0,4015.0,0.83908
794,269241,1.0,1.0,44380,주연테크,39830000000.0,11390000000.0,61757550000.0,2020-09-07,3.496927,1135.0,54612089.0,61984720000.0,1350.0,1135.0,0.840741
563,171867,1.0,1.0,42110,에스씨디,95560000000.0,30180000000.0,139154000000.0,2020-09-07,3.166335,1060.0,48329564.0,51229340000.0,1260.0,1060.0,0.84127
695,232821,1.0,1.0,36170,라이브파이낸셜,54290000000.0,10700000000.0,102245100000.0,2020-09-07,5.073832,1295.0,45121437.0,58432260000.0,1530.0,1295.0,0.846405


In [20]:
#df_price = fdr.DataReader('005930', start_date, end_date).reset_index()