In [None]:
!pip install dart-fss

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import pandas as pd
import os
import numpy as np
import dart_fss as dart
import requests
from bs4 import BeautifulSoup


#PATH = "/Users/jeeho/Desktop/"
PATH = "/content/drive/MyDrive/miraeasset/"
df= pd.read_excel(PATH+"train.xlsx")
df.rename(columns = {'종목코드':"CODE", '종목명':"CO_NM"},inplace=True)
df['CODE'] = df['CODE'].apply(lambda x : str(x).zfill(6))

#print(df)

# Open DART API KEY 설정
api_key='f500347222aa08270de912e213facf6b8e2cb58c'
dart.set_api_key(api_key=api_key)

# DART 에 공시된 회사 리스트 불러오기
corp_list = dart.get_corp_list()


def check_missing_corp(df):
    for stock_code in df["CODE"]:
      if not corp_list.find_by_stock_code(stock_code):
        print(stock_code,df["CO_NM"][df["CODE"]==stock_code])


def corp_info(stock_code, bsns_year):     # api 호출할때마다 필요한 정보(stock_code, bsns_year, reprt_code)를 반환
  if corp_list.find_by_stock_code(stock_code):
    corp_code = corp_list.find_by_stock_code(stock_code).corp_code
    reprt_code = 11011 # 사업보고서를 의미하는 보고서 코드
    return str(corp_code) , str(bsns_year), str(reprt_code)
  else :
    return np.NaN


def clean(str):                   # 보고서마다 표현 통일
  str = str.replace(',','')
  str = str.replace(' ','')
  str = str.replace('년','.')
  str = str.replace('개월','')
  str = str.replace('\n','\0')
  return 0 if str in '-.' else float(str)


def fill_features(stock_code, corp_code, bsns_year, reprt_code):
    try:
        #### 배당에 관한 사항 ####
        alot_matter = dart.api.info.alot_matter(corp_code, bsns_year, reprt_code, api_key=None)

        #### '(연결)현금배당성향(%)' ####
        thstrm = alot_matter['list'][6]['thstrm'] # 당기
        frmtrm = alot_matter['list'][6]['frmtrm'] # 전기
        thstrm,frmtrm = list(map(clean, [thstrm,frmtrm]))
        #### 현금배당수익률(%) ####
        payout_ratio_1_thstrm = alot_matter['list'][7]['thstrm'] # 당기
        payout_ratio_1_thstrm = clean(payout_ratio_1_thstrm)

    except:
        thstrm = np.NaN
        frmtrm = np.NaN
        payout_ratio_1_thstrm = np.NaN

    try:
        #### 대주주 지분율(%) ####
        hyslr=dart.api.info.hyslr_sttus(corp_code, bsns_year, reprt_code, api_key=None)
        result = [hyslr['list'][-1]['trmend_posesn_stock_qota_rt'], hyslr['list'][-2]['trmend_posesn_stock_qota_rt']] # result = [보통주, 우선주]
        major_hold_stock_ratio = sum(list(map(clean, result)))   # 보고서 상에 결측치 '-' 로 표시된 부분 처리
    except:
        major_hold_stock_ratio = np.NaN

    try:
        #### 소액주주 지분율(%) ####
        mrhl = dart.api.info.mrhl_sttus(corp_code, bsns_year, reprt_code, api_key=None)
        minor_hold_stock_ratio = list(map(clean, mrhl['list'][0]['hold_stock_rate'][:-1]))[0]
    except:
        minor_hold_stock_ratio = np.NaN

    try:
        #### 사외이사 비율(%) ####
        outcmpny_drctr= dart.api.info.outcmpny_drctr_nd_change_sttus(corp_code, bsns_year, reprt_code, api_key=None)
        otcmp_drctr_co, drctr_co = outcmpny_drctr['list'][0]['otcmp_drctr_co'],outcmpny_drctr['list'][0]['drctr_co']
        otcmp_drctr_co, drctr_co = list(map(clean, [otcmp_drctr_co, drctr_co]))
        otcmp_drctr_ratio = otcmp_drctr_co/drctr_co
    except:
        otcmp_drctr_ratio = np.NaN

    try:
        #### 임원 성비 ####
        exctv=dart.api.info.exctv_sttus(corp_code, bsns_year, reprt_code, api_key=None)
        if len(exctv['list'])==0:
            exctv_ratio = np.NaN
        else :
            exctv_female = 0
        for i in range(len(exctv['list'])):
            if exctv['list'][i]['sexdstn']=='여':
                exctv_female+=1
        exctv_ratio = exctv_female/len(exctv['list'])
    except:
        exctv_ratio = np.NaN

    try:
        #### 직원 ####
        emp=dart.api.info.emp_sttus(corp_code, bsns_year, reprt_code, api_key=None)

        if len(emp['list'])==0:
            emp_ratio = np.NaN
        else:
            rgllbr_co=0
            sm=0
            avrg_cnwk_sdytrn = 0
            jan_salary=0
            jan_salary_female=0
            emp_female = 0
            sector_female =0

        for sector in emp['list']:
            rgllbr_co += clean(sector['rgllbr_co']) # 정규직 직원 수
            sm += clean(sector['sm']) # 총직원수
            avrg_cnwk_sdytrn += clean(sector['avrg_cnwk_sdytrn']) # 평균 근속 연수
            jan_salary += clean(sector['jan_salary_am']) # 1인 평균 급여액
            if sector['sexdstn']=='여':
                sector_female+=1
                emp_female += clean(sector['sm'])
                jan_salary_female += clean(sector['jan_salary_am'])

        emp_ratio = emp_female/sm
        rglbr_ratio = rgllbr_co/sm

    except:
        exctv_ratio = np.NaN

    try:
        ##### 타법인 출자 현황 ####
        otr_cpr_invstmnt_sttus = dart.api.info.otr_cpr_invstmnt_sttus(corp_code, bsns_year, reprt_code, api_key=None)
        incrs_dcrs_evl_lstmn = clean(otr_cpr_invstmnt_sttus['list'][-1]['incrs_dcrs_evl_lstmn']) # 증가 감소 평가 손액
        bsis_blce_acntbk_amount = clean(otr_cpr_invstmnt_sttus['list'][-1]['bsis_blce_acntbk_amount'])  # 기초 잔액 장부 가액
        trmend_blce_acntbk_amount = clean(otr_cpr_invstmnt_sttus['list'][-1]['trmend_blce_acntbk_amount']) # 기말 잔액 장부 가액
    except:
        incrs_dcrs_evl_lstmn = np.NaN
        bsis_blce_acntbk_amount = np.NaN
        trmend_blce_acntbk_amount = np.NaN
        avrg_cnwk_sdytrn= np.NaN
        jan_salary=np.NaN
        jan_salary_female=np.NaN
        
        
    try:
        ## 기업연령
        get_corp_info = dart.api.filings.get_corp_info(corp_code)
        age = int(bsns_year)-int(get_corp_info['est_dt'][:4])
        ## 신용등급
    except:
        age = np.NaN


    try:
        per_selector = "#_per"
        pbr_selector = "#_pbr"
        dividend_yield_selector = "#_dvr"
        url = "https://finance.naver.com/item/main.naver?code="+stock_code
        html = requests.get(url).text
        soup = BeautifulSoup(html, "html5lib")
        per = soup.select(per_selector)
        pbr = soup.select(pbr_selector)
        dividend_yield = soup.select(dividend_yield_selector)


        fin_per = per[0].text if per else 0
        fin_pbr = pbr[0].text if pbr else 0
        fin_dvr = dividend_yield[0].text if dividend_yield else 0

        
    except:    
        fin_per = 0
        fin_pbr = 0
        fin_dvr = 0
        
        
    try:
        idx=list(df["CODE"]==stock_code).index(True)

        df["현금배당성향_당기"][idx]= thstrm/100
        df["현금배당성향_변화"][idx]= (thstrm-frmtrm)/100
        df["현금배당수익률"][idx]= payout_ratio_1_thstrm/100
        df["대주주지분율"][idx]= major_hold_stock_ratio/100
        df["소액주주지분율"][idx]= minor_hold_stock_ratio/100
        df["사외이사비율"][idx]= otcmp_drctr_ratio
        df["임원성비"][idx]= exctv_ratio
        df["직원수"][idx]= sm
        df["직원성비"][idx]= emp_ratio
        df["정규직비율"][idx]= rglbr_ratio
        df["평균근속연수"][idx]= avrg_cnwk_sdytrn/len(emp['list'])
        df["1인평균급여액"][idx]= jan_salary/len(emp['list'])
        df["여성평균급여액"][idx]= jan_salary_female/sector_female
        df["증가감소평가손액"][idx]=incrs_dcrs_evl_lstmn
        df["기초잔액장부가액"][idx]=bsis_blce_acntbk_amount
        df["기말잔액장부가액"][idx]=trmend_blce_acntbk_amount
        df["기업연령"][idx]=age
        df["PER"][idx]=fin_per
        df["PBR"][idx]=fin_pbr
        df["DVR"][idx]=fin_dvr

    except:
        pass

    return df


Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/fake_useragent/utils.py", line 154, in load
    for item in get_browsers(verify_ssl=verify_ssl):
  File "/usr/local/lib/python3.7/dist-packages/fake_useragent/utils.py", line 99, in get_browsers
    html = html.split('<table class="w3-table-all notranslate">')[1]
IndexError: list index out of range


Output()

Output()

Output()

In [6]:
df[["현금배당성향_당기",
   "현금배당성향_변화",
   "현금배당수익률",
   "대주주지분율",
   "소액주주지분율",
   "사외이사비율",
   "임원성비",
   "직원수",
   "직원성비",
   "정규직비율",
   "평균근속연수",
   "1인평균급여액",
   "여성평균급여액",
   "증가감소평가손액",
   "기초잔액장부가액",
   "기말잔액장부가액",
   "기업연령",
   "PER",
   "PBR",
   "DVR"]]= np.NaN

In [7]:
for i in df["CODE"]:
  try:
    corp_code, bsns_year, reprt_code = corp_info(i, 2021)
    df = fill_features(i,corp_code, bsns_year, reprt_code)
    
  except:
    pass

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [8]:
df.to_csv("filled.csv", mode='w')
SAVE_PATH = "/content/drive/MyDrive/miraeasset/0908/"
df.to_csv(SAVE_PATH + "filled.csv", index = False)