In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import numpy as np

## 1-1. 재무제표 페이지 요청하기

In [2]:
code = '005930' # 종목 코드
fin_type = '4' # 재무제표 타입 -> 0: 주재무제표, 1: GAAP개별, 2: GAAP연결, 3: IFRS별도, 4:IFRS연결
freq_typ = 'Y' # 기간 -> Y : 년, Q : 분기
url = """http://companyinfo.stock.naver.com/v1/company/ajax/cF1001.aspx?
            cmp_cd={0}&fin_typ={1}&freq_typ={2}""".format(code, fin_type, freq_typ)

req = requests.get(url).content

In [3]:
soup = BeautifulSoup(req, 'lxml')
thead = soup.select('thead tr')[1] # 재무제표 컬럼 명 : 첫번째 값은 필요없는 정보이기 때문에 2번째 요소만 가져오기
tbody = soup.select('tbody tr') # 재무제표 값

## 2. 불필요한 데이터 전처리하기 (thead)

In [4]:
# date의 표기법을 year/month에서 year-month로 변경하기
p = re.compile("\d{4}/\d{2}") # year/month 값 추출해내기
date_list = [ date.replace('/', '-') for date in p.findall(str(thead))] # date 값으로 변환가능하도록 부분 수정

In [5]:
date_list

['2012-12',
 '2013-12',
 '2014-12',
 '2015-12',
 '2016-12',
 '2017-12',
 '2018-12',
 '2019-12']

In [6]:
# date(2012~2019)와 종목코드로 기본 데이터 프레임 생성
df = pd.DataFrame({'date' : date_list, 'code': code}, columns=['date', 'code', 'estimated']) # date 값과, code값을 컬럼으로 df 생성
df['date'] = pd.to_datetime(df['date']) # date를 datetime 형식으로 변경

## 3. 불필요한 데이터 전처리하기 (tbody) 

In [7]:
# 컬럼 값 중 숫자 데이터에 천단위 표기 ',' 제거하기
def clean_value(value): 
    if ',' in value:
        value = value.replace(',','')
    
    return value

In [8]:
for tr in tbody:
    th = tr.find('th').text # th -> 재무제표 정보 이름을 담고 있는 태그
    td_list = [ clean_value(td.text) for td in tr.find_all('td') ] # td -> 값을 담고 있는 태그로 천단위 표시 기호인 ','를 제거함 
    df[th] = pd.to_numeric(td_list) # 재무제표 정보와 값을 df 컬럼으로 추가

In [9]:
df # 결과 값

Unnamed: 0,date,code,estimated,매출액,영업이익,영업이익(발표기준),세전계속사업이익,당기순이익,당기순이익(지배),당기순이익(비지배),...,부채비율,자본유보율,EPS(원),PER(배),BPS(원),PBR(배),현금DPS(원),현금배당수익률,현금배당성향(%),발행주식수(보통주)
0,2012-12-01,5930,,2011036,290493,290493.0,299150,238453,231854,6599.0,...,49.05,13859.35,136278,11.17,776993,1.96,8000,0.53,5.2,147299337.0
1,2013-12-01,5930,,2286927,367850,367850.0,383643,304748,298212,6535.0,...,42.7,17047.55,175282,7.83,958040,1.43,14300,1.04,7.23,147299337.0
2,2014-12-01,5930,,2062060,250251,250251.0,278750,233944,230825,3119.0,...,37.09,19379.47,135673,9.78,1083205,1.23,20000,1.51,13.0,147299337.0
3,2015-12-01,5930,,2006535,264134,264134.0,259610,190601,186946,3655.0,...,35.25,21117.88,109883,11.47,1185738,1.06,21000,1.67,16.42,147299337.0
4,2016-12-01,5930,,2018667,292407,292407.0,307137,227261,224157,3104.0,...,35.87,22004.14,136760,13.18,1331779,1.35,28500,1.58,17.81,140679337.0
5,2017-12-01,5930,,2404376,543946,,556823,420223,412331,,...,35.48,,270330,9.64,1589651,1.64,34628,1.33,0.11,
6,2018-12-01,5930,,2669708,649844,,664192,499014,489611,,...,31.24,,332279,7.64,1887031,1.35,66961,2.64,0.18,
7,2019-12-01,5930,,2813213,668595,,685824,515132,506778,,...,28.24,,343930,7.38,2169842,1.17,70450,2.77,0.18,


In [33]:
for i, v in enumerate(df['영업이익(발표기준)']):
    if np.isnan(v) == True:
        df['estimated'][i] = 1
    else: 
        df['estimated'][i] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [34]:
df

Unnamed: 0,date,code,estimated,매출액,영업이익,영업이익(발표기준),세전계속사업이익,당기순이익,당기순이익(지배),당기순이익(비지배),...,부채비율,자본유보율,EPS(원),PER(배),BPS(원),PBR(배),현금DPS(원),현금배당수익률,현금배당성향(%),발행주식수(보통주)
0,2012-12-01,5930,0,2011036,290493,290493.0,299150,238453,231854,6599.0,...,49.05,13859.35,136278,11.17,776993,1.96,8000,0.53,5.2,147299337.0
1,2013-12-01,5930,0,2286927,367850,367850.0,383643,304748,298212,6535.0,...,42.7,17047.55,175282,7.83,958040,1.43,14300,1.04,7.23,147299337.0
2,2014-12-01,5930,0,2062060,250251,250251.0,278750,233944,230825,3119.0,...,37.09,19379.47,135673,9.78,1083205,1.23,20000,1.51,13.0,147299337.0
3,2015-12-01,5930,0,2006535,264134,264134.0,259610,190601,186946,3655.0,...,35.25,21117.88,109883,11.47,1185738,1.06,21000,1.67,16.42,147299337.0
4,2016-12-01,5930,0,2018667,292407,292407.0,307137,227261,224157,3104.0,...,35.87,22004.14,136760,13.18,1331779,1.35,28500,1.58,17.81,140679337.0
5,2017-12-01,5930,1,2404376,543946,,556823,420223,412331,,...,35.48,,270330,9.64,1589651,1.64,34628,1.33,0.11,
6,2018-12-01,5930,1,2669708,649844,,664192,499014,489611,,...,31.24,,332279,7.64,1887031,1.35,66961,2.64,0.18,
7,2019-12-01,5930,1,2813213,668595,,685824,515132,506778,,...,28.24,,343930,7.38,2169842,1.17,70450,2.77,0.18,
