# ```esgRating.ipynb```
- 데이터를 수집, 전처리하여 ```esgRating.csv```를 만든 파일
- 데이터
  - KCGS_ESGRating
    - 2011~2018년 ESG등급
  - Data Collection Method
    - Web Scraping
  - Data Source
    - [KRX 정보데이터시스템](https://data.krx.co.kr/contents/MDC/HARD/hardController/MDCHARD050.cmd#none)
- ESG Rating
  - 등급 산출 시기 관련
    - 2023년 1월 말에 2023년 ESG등급을 확인할 수 있으며,
    - 2022년의 경우 2022년 11월에 조정된 것을 확인할 수 있었다.
      ![2023-02-01, 한국ESG기준원 등급 조회, 비고.png](../fig/md/2023-02-01%2C%20%ED%95%9C%EA%B5%ADESG%EA%B8%B0%EC%A4%80%EC%9B%90%20%EB%93%B1%EA%B8%89%20%EC%A1%B0%ED%9A%8C%2C%20%EB%B9%84%EA%B3%A0.png)

# import

In [84]:
import sys
import time
import warnings
import urllib.request
from glob import glob

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

import FinanceDataReader as fdr
from tqdm import tqdm

# Suppress every warning for the rest of the session and lift the column
# limit so displayed DataFrames show all of their columns.
warnings.filterwarnings("ignore")
pd.options.display.max_columns = None

# Extend the import search path before importing the project-local helper
# module (presumably located in ../import -- confirm).
sys.path.append("../import")
import module as m

# Paths are provided by the helper module; fp_esg is the string form of its
# "esgRating" entry and is later handed to m.DfPrst when persisting.
data_path = m.data_path
fp_esg = f"""{m.fp["esgRating"]}"""

In [85]:
# Browser-like User-Agent sent with every request to the KRX data endpoint.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'}

# Download one table per rating year (2011-2018 inclusive) and keep each as
# its own DataFrame until they are concatenated below.
df_list = []
for year in tqdm(range(2011, 2018 + 1)):
    data = {
        'bld': 'dbms/MDC/HARD/MDCHARD05001',
        'locale': 'ko_KR',
        'selTp': '1',
        'grdYy': year,
    }
    response = requests.post(
        'https://data.krx.co.kr/comm/bldAttendant/getJsonData.cmd',
        headers=headers,
        data=data,
        # requests has no default timeout; without one a stalled connection
        # would block the notebook indefinitely.
        timeout=30,
    )
    # Fail here on an HTTP error instead of on a confusing JSON/KeyError below.
    response.raise_for_status()
    df_list.append(pd.DataFrame(response.json()['block1']))

# Stack the yearly tables, drop the 'RN' column and renumber the rows.
df_esg = pd.concat(df_list).drop(columns = 'RN').reset_index(drop = True)
df_esg.sample(10)

100%|██████████| 8/8 [00:00<00:00,  9.19it/s]


Unnamed: 0,ISU_NM,GRD1,GRD2,GRD3,GRD4,YY
2142,SG세계물산,B이하,C,B이하,B이하,2014
638,태양금속공업,B이하,B,B이하,B이하,2011
3009,만호제강,B이하,B,B이하,B이하,2015
1239,동원시스템즈,B이하,C,B이하,B이하,2012
4910,선도전기,B이하,B,B이하,B이하,2018
1999,유니온스틸,B이하,B,B이하,B이하,2013
1574,동국제강,B+,B+,B이하,B+,2013
1815,미래산업,B이하,B,B이하,B이하,2013
3186,휠라코리아,B이하,B,B이하,B이하,2015
3962,현대약품,B이하,B,B+,B이하,2016


## 전처리

In [86]:
# Relabel the six scraped columns by position, store the year as an unsigned
# integer, and order the rows from the most recent year to the oldest.
# NOTE(review): the labels are assigned purely by column position
# (ISU_NM, GRD1..GRD4, YY in the sample output) -- confirm that GRD2/GRD3/GRD4
# really correspond to E/S/G in the KRX response.
df_esg = (
    df_esg.set_axis(['종목명', 'ESG종합', 'E', 'S', 'G', '연'], axis="columns")
    .astype({"연": "uint"})
    .sort_values(by="연", ascending=False)
    .reset_index(drop=True)
)
df_esg

Unnamed: 0,종목명,ESG종합,E,S,G,연
0,서울도시가스,B이하,B,B이하,B이하,2018
1,티비에이치글로벌,B이하,B,B이하,B이하,2018
2,평화산업,B이하,B,B이하,B+,2018
3,페이퍼코리아,B이하,C,B이하,B이하,2018
4,퍼시스,B이하,B,B이하,B이하,2018
...,...,...,...,...,...,...
5649,성신양회,B이하,B+,B이하,B이하,2011
5650,성보화학,,B,,,2011
5651,성문전자,B이하,C,B이하,B이하,2011
5652,선창산업,B이하,B,B이하,B이하,2011


## 종목코드
- FinanceDataReader로 KRX 전체 종목 가져오기

In [87]:
# Pull the KRX listing via FinanceDataReader and keep only the name and code
# columns, relabelled to match the column names used by the ESG table.
df_krx = (
    fdr.StockListing("KRX")[['Name', 'Code']]
    .rename(columns={'Name': '종목명', 'Code': '종목코드'})
)
df_krx.head()

Unnamed: 0,종목명,종목코드
0,삼성전자,5930
1,LG에너지솔루션,373220
2,SK하이닉스,660
3,삼성바이오로직스,207940
4,삼성SDI,6400


In [88]:
# Attach the stock code to every ESG row by company name. A left join keeps
# rows whose name is absent from the KRX listing; their code stays NaN.
# NOTE(review): the join key is the name only, so a name appearing more than
# once in df_krx would duplicate ESG rows -- verify names are unique.
esg_column_order = ["종목코드", "종목명", "연", 'ESG종합', 'E', 'S', 'G']
df_esg = pd.merge(df_esg, df_krx, how="left", on="종목명")[esg_column_order]
df_esg

Unnamed: 0,종목코드,종목명,연,ESG종합,E,S,G
0,,서울도시가스,2018,B이하,B,B이하,B이하
1,,티비에이치글로벌,2018,B이하,B,B이하,B이하
2,090080,평화산업,2018,B이하,B,B이하,B+
3,001020,페이퍼코리아,2018,B이하,C,B이하,B이하
4,016800,퍼시스,2018,B이하,B,B이하,B이하
...,...,...,...,...,...,...,...
5649,004980,성신양회,2011,B이하,B+,B이하,B이하
5650,003080,성보화학,2011,,B,,
5651,014910,성문전자,2011,B이하,C,B이하,B이하
5652,,선창산업,2011,B이하,B,B이하,B이하


## (선택) 비상장종목
- 종목명이 현재 KRX 상장 목록과 일치하지 않는 종목(비상장·상장폐지·사명 변경 등)은 종목코드가 NaN임.
- 나중에 KOSPI200 데이터와 합칠 때 NaN 값이 자연스럽게 삭제될 것이라 생각하여
- 종목코드의 NaN 값을 그대로 두기로 함!

In [89]:
# Display the rows for which the join found no stock code.
df_esg.loc[df_esg["종목코드"].isna()]

Unnamed: 0,종목코드,종목명,연,ESG종합,E,S,G
0,,서울도시가스,2018,B이하,B,B이하,B이하
1,,티비에이치글로벌,2018,B이하,B,B이하,B이하
17,,태양금속공업,2018,B이하,C,B이하,B+
20,,태경화학,2018,B이하,C,B이하,B이하
23,,포스코,2018,A,A+,A,A
...,...,...,...,...,...,...,...
5630,,삼호,2011,B이하,B,B이하,B이하
5639,,세원셀론텍,2011,B이하,B,B이하,
5643,,세아베스틸,2011,B이하,B,B이하,B이하
5645,,선진지주,2011,B이하,C,B이하,


In [90]:
# df_esg_dn = df_esg.dropna(axis=0)
# df_esg_dn.info()
# df_esg_dn

## (선택) 등급의 '이하' 표기 통합
- (예시) 'B+ 이하'는 'B+'로, 'B이하'는 'B'로 변경하여 통합함

In [91]:
# Grade label (with all whitespace removed) -> consolidated grade.
# Built once at definition time instead of on every call.
_ESG_RATING_MAP = {
    "S": "S",
    "A+": "A+",
    "A": "A",
    "B+": "B+",
    "B+이하": "B+",
    "B": "B",
    "B이하": "B",
    "C": "C",
    "D": "D",
}


def esgRating(x):
    """Consolidate a raw ESG grade label into its base grade.

    The "이하" ("or below") variants are folded into the plain grade, e.g.
    "B+ 이하" -> "B+" and "B이하" -> "B". Whitespace inside or around the
    label is ignored, so "B 이하" and "B+이하" are recognised as well.

    Args:
        x: Raw grade value; normally a string, but any object is accepted.

    Returns:
        The consolidated grade string, or ``np.nan`` when ``x`` is not a
        string or is not a known grade (including "-", "", "NaN", "None").
    """
    # Missing values (None, float NaN) and other non-text inputs carry no grade.
    if not isinstance(x, str):
        return np.nan
    # Drop every whitespace character so spacing variants share one key.
    label = "".join(x.split())
    return _ESG_RATING_MAP.get(label, np.nan)


# Consolidate the grade labels of every rating column in place and store each
# result as a categorical column.
col_esg = ["ESG종합", "E", "S", "G"]
for rating_col in col_esg:
    consolidated = df_esg[rating_col].fillna(np.nan).apply(esgRating)
    df_esg[rating_col] = consolidated.astype("category")

df_esg

Unnamed: 0,종목코드,종목명,연,ESG종합,E,S,G
0,,서울도시가스,2018,B,B,B,B
1,,티비에이치글로벌,2018,B,B,B,B
2,090080,평화산업,2018,B,B,B,B+
3,001020,페이퍼코리아,2018,B,C,B,B
4,016800,퍼시스,2018,B,B,B,B
...,...,...,...,...,...,...,...
5649,004980,성신양회,2011,B,B+,B,B
5650,003080,성보화학,2011,,B,,
5651,014910,성문전자,2011,B,C,B,B
5652,,선창산업,2011,B,B,B,B


## 전처리 : 파생변수 인코딩

In [92]:
# Grade label (with all whitespace removed) -> ordinal code, kept as a string.
# Built once at definition time instead of on every call.
_ESG_RATING_ENC_MAP = {
    "S": "6",
    "A+": "5",
    "A": "4",
    "B+": "3",
    "B+이하": "3",
    "B": "2",
    "B이하": "2",
    "C": "1",
    "D": "0",
}


def esgRating_enc(x):
    """Encode an ESG grade label as an ordinal code string.

    Codes run from "6" (S) down to "0" (D). The "이하" ("or below") variants
    share the code of their base grade. Whitespace inside or around the label
    is ignored.

    Args:
        x: Grade value; normally a string, but any object is accepted.

    Returns:
        The code as a string ("0" to "6"), or ``np.nan`` when ``x`` is not a
        string or is not a known grade (including "-", "", "NaN", "None").
    """
    # Missing values (None, float NaN) and other non-text inputs have no code.
    if not isinstance(x, str):
        return np.nan
    # Drop every whitespace character so spacing variants share one key.
    label = "".join(x.split())
    return _ESG_RATING_ENC_MAP.get(label, np.nan)


# Derive one "<column>_enc" categorical column per rating column, holding the
# ordinal code of each grade.
col_esg = ["ESG종합", "E", "S", "G"]
for rating_col in col_esg:
    encoded = df_esg[rating_col].fillna(np.nan).apply(esgRating_enc)
    df_esg[f"{rating_col}_enc"] = encoded.astype("category")
df_esg

Unnamed: 0,종목코드,종목명,연,ESG종합,E,S,G,ESG종합_enc,E_enc,S_enc,G_enc
0,,서울도시가스,2018,B,B,B,B,2,2,2,2
1,,티비에이치글로벌,2018,B,B,B,B,2,2,2,2
2,090080,평화산업,2018,B,B,B,B+,2,2,2,3
3,001020,페이퍼코리아,2018,B,C,B,B,2,1,2,2
4,016800,퍼시스,2018,B,B,B,B,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...
5649,004980,성신양회,2011,B,B+,B,B,2,3,2,2
5650,003080,성보화학,2011,,B,,,,2,,
5651,014910,성문전자,2011,B,C,B,B,2,1,2,2
5652,,선창산업,2011,B,B,B,B,2,2,2,2


## 영속화

In [100]:
# Hand the final table and its target path to the project helper for
# persistence. NOTE(review): the recorded cell output lists
# '../data/esgRating.parquet', so this presumably writes a parquet file --
# confirm against the helper's implementation.
m.DfPrst(df_esg, fp_esg)

['../data/esgRating.parquet']
