# 데이터 확인

In [None]:
# Mount Google Drive so the notebook can read files from it when run on Google Colab
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Import the required libraries
import os
from os.path import join
import copy
import warnings
# Install an 'ignore' filter so warnings are not shown in the notebook output
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import sklearn

import matplotlib.pyplot as plt

ROOT_PATH = '/content/drive/MyDrive/LS_Practice' # Parent directory of the data directory, fixed up front

In [None]:
# Path of the training CSV: <ROOT_PATH>/data/Hospital/train.csv
example_file = os.path.join(ROOT_PATH, 'data', 'Hospital', 'train.csv')

#### 데이터 정보

    train.csv - 의료기관이 폐업했는지 여부를 포함하여 최근 2개년의 재무정보와 병원 기본정보 
    test.csv - 폐업 여부를 제외하고 train.csv와 동일 
    sample_submission.csv - inst_id와 open과 close를 예측하는 OC 두개의 열로 구성. OC의 값은 open 예측일 경우 1, close 예측일 경우 0.



    inst_id - 각 파일에서의 병원 고유 번호
    OC – 영업/폐업 분류, 2018년 폐업은 2017년 폐업으로 간주함
    sido – 병원의 광역 지역 정보
    sgg – 병원의 시군구 자료
    openDate – 병원 설립일
    bedCount - 병원이 갖추고 있는 병상의 수
    instkind – 병원, 의원, 요양병원, 한의원, 종합병원 등 병원의 종류
    ·        종합병원 : 입원환자 100명 이상 수용 가능
    ·        병원 : 입원 환자 30명 이상 100명 미만 수용 가능
    ·        의원 : 입원 환자 30명 미만 수용 가능
    ·        한방 병원(한의원) : 침술과 한약으로 치료하는 의료 기관.  
    revenue1 – 매출액, 2017(회계년도)년 데이터를 의미함
    salescost1 – 매출원가, 2017(회계년도)년 데이터를 의미함
    sga1 - 판매비와 관리비, 2017(회계년도)년 데이터를 의미함
    salary1 – 급여, 2017(회계년도)년 데이터를 의미함
    noi1 – 영업외수익, 2017(회계년도)년 데이터를 의미함
    noe1 – 영업외비용, 2017(회계년도)년 데이터를 의미함
    interest1 – 이자비용, 2017(회계년도)년 데이터를 의미함
    ctax1 – 법인세비용, 2017(회계년도)년 데이터를 의미함
    profit1 – 당기순이익, 2017(회계년도)년 데이터를 의미함
    liquidAsset1 – 유동자산, 2017(회계년도)년 데이터를 의미함
    quickAsset1 – 당좌자산, 2017(회계년도)년 데이터를 의미함
    receivableS1 - 미수금(단기), 2017(회계년도)년 데이터를 의미함
    inventoryAsset1 – 재고자산, 2017(회계년도)년 데이터를 의미함
    nonCAsset1 – 비유동자산, 2017(회계년도)년 데이터를 의미함
    tanAsset1 – 유형자산, 2017(회계년도)년 데이터를 의미함
    OnonCAsset1 - 기타 비유동자산, 2017(회계년도)년 데이터를 의미함
    receivableL1 – 장기미수금, 2017(회계년도)년 데이터를 의미함
    debt1 – 부채총계, 2017(회계년도)년 데이터를 의미함
    liquidLiabilities1 – 유동부채, 2017(회계년도)년 데이터를 의미함
    shortLoan1 – 단기차입금, 2017(회계년도)년 데이터를 의미함
    NCLiabilities1 – 비유동부채, 2017(회계년도)년 데이터를 의미함
    longLoan1 – 장기차입금, 2017(회계년도)년 데이터를 의미함
    netAsset1 – 순자산총계, 2017(회계년도)년 데이터를 의미함
    surplus1 – 이익잉여금, 2017(회계년도)년 데이터를 의미함
    revenue2 – 매출액, 2016(회계년도)년 데이터를 의미함
    salescost2 – 매출원가, 2016(회계년도)년 데이터를 의미함
    sga2 - 판매비와 관리비, 2016(회계년도)년 데이터를 의미함
    salary2 – 급여, 2016(회계년도)년 데이터를 의미함
    noi2 – 영업외수익, 2016(회계년도)년 데이터를 의미함
    noe2 – 영업외비용, 2016(회계년도)년 데이터를 의미함
    interest2 – 이자비용, 2016(회계년도)년 데이터를 의미함
    ctax2 – 법인세비용, 2016(회계년도)년 데이터를 의미함
    profit2 – 당기순이익, 2016(회계년도)년 데이터를 의미함
    liquidAsset2 – 유동자산, 2016(회계년도)년 데이터를 의미함
    quickAsset2 – 당좌자산, 2016(회계년도)년 데이터를 의미함
    receivableS2 - 미수금(단기), 2016(회계년도)년 데이터를 의미함
    inventoryAsset2 – 재고자산, 2016(회계년도)년 데이터를 의미함
    nonCAsset2 – 비유동자산, 2016(회계년도)년 데이터를 의미함
    tanAsset2 – 유형자산, 2016(회계년도)년 데이터를 의미함
    OnonCAsset2 - 기타 비유동자산, 2016(회계년도)년 데이터를 의미함
    receivableL2 – 장기미수금, 2016(회계년도)년 데이터를 의미함
    debt2 – 부채총계, 2016(회계년도)년 데이터를 의미함
    liquidLiabilities2 – 유동부채, 2016(회계년도)년 데이터를 의미함
    shortLoan2 – 단기차입금, 2016(회계년도)년 데이터를 의미함
    NCLiabilities2 – 비유동부채, 2016(회계년도)년 데이터를 의미함
    longLoan2 – 장기차입금, 2016(회계년도)년 데이터를 의미함
    netAsset2 – 순자산총계, 2016(회계년도)년 데이터를 의미함
    surplus2 – 이익잉여금, 2016(회계년도)년 데이터를 의미함
    employee1 – 고용한 총 직원의 수, 2017(회계년도)년 데이터를 의미함
    employee2 – 고용한 총 직원의 수, 2016(회계년도)년 데이터를 의미함
    ownerChange – 대표자의 변동 

In [None]:
data = pd.read_csv(example_file) # Load the CSV file into a DataFrame
label = data['OC'] # Keep the OC column of the data as the label

In [None]:
data.head() # Show the first 5 rows of the data

Unnamed: 0,inst_id,OC,sido,sgg,openDate,bedCount,instkind,revenue1,salescost1,sga1,salary1,noi1,noe1,interest1,ctax1,profit1,liquidAsset1,quickAsset1,receivableS1,inventoryAsset1,nonCAsset1,tanAsset1,OnonCAsset1,receivableL1,debt1,liquidLiabilities1,shortLoan1,NCLiabilities1,longLoan1,netAsset1,surplus1,revenue2,salescost2,sga2,salary2,noi2,noe2,interest2,ctax2,profit2,liquidAsset2,quickAsset2,receivableS2,inventoryAsset2,nonCAsset2,tanAsset2,OnonCAsset2,receivableL2,debt2,liquidLiabilities2,shortLoan2,NCLiabilities2,longLoan2,netAsset2,surplus2,employee1,employee2,ownerChange
0,1,open,choongnam,73,20071228,175.0,nursing_hospital,4217530000.0,0.0,3961135000.0,2033835000.0,15652441.0,15236240.0,13236240.0,31641798.0,225169678.0,1012700000.0,997671900.0,470055700.0,15027810.0,2514586000.0,2360684000.0,143449600.0,0.0,682826000.0,201323700.0,0.0,481502300.0,351000000.0,2844460000.0,1496394000.0,4297848000.0,0.0,4057422000.0,2063787000.0,16194675.0,29983350.0,15683050.0,18808074.0,207829685.0,830169500.0,816570500.0,523702600.0,13598970.0,2548115000.0,2386263000.0,145898600.0,0.0,758993700.0,222876900.0,0.0,536116900.0,390000000.0,2619290000.0,1271224000.0,62.0,64.0,same
1,3,open,gyeongnam,32,19970401,410.0,general_hospital,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,801.0,813.0,same
2,4,open,gyeonggi,89,20161228,468.0,nursing_hospital,1004522000.0,515483669.0,447219700.0,296402300.0,76156.0,30000.0,0.0,0.0,41864754.0,272442100.0,253682200.0,8095950.0,18759970.0,120481000.0,120481000.0,0.0,0.0,92414340.0,92414340.0,0.0,0.0,0.0,300508800.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,234.0,1.0,same
3,7,open,incheon,141,20000814,353.0,general_hospital,72507340000.0,0.0,70677400000.0,31786050000.0,506223059.0,1259568000.0,1196881000.0,173769780.0,902830288.0,13041540000.0,11534750000.0,0.0,1506793000.0,43179360000.0,38320780000.0,3945208000.0,0.0,42363800000.0,27991880000.0,19885200000.0,14371920000.0,7253040000.0,13857100000.0,8643659000.0,66858340000.0,0.0,64924190000.0,29711350000.0,476807804.0,1353672000.0,1277422000.0,218891720.0,838387466.0,11125720000.0,9890540000.0,0.0,1235181000.0,39583560000.0,34855760000.0,3915906000.0,0.0,37755010000.0,17018600000.0,9219427000.0,20736410000.0,15100000000.0,12954270000.0,7740829000.0,663.0,663.0,same
4,9,open,gyeongnam,32,20050901,196.0,general_hospital,49043540000.0,0.0,47656050000.0,24460780000.0,112352259.0,1419089000.0,1307249000.0,0.0,80749696.0,6317084000.0,5873265000.0,4099320000.0,443818600.0,43667330000.0,43306130000.0,222340000.0,0.0,49898110000.0,28909190000.0,17930380000.0,20988920000.0,13500000000.0,86311640.0,9025550000.0,48082800000.0,0.0,47125800000.0,23460040000.0,597748128.0,1522108000.0,1349851000.0,0.0,32642585.0,4906776000.0,4464017000.0,3365227000.0,442759100.0,46531380000.0,45629450000.0,789340700.0,0.0,51432590000.0,30072590000.0,17593750000.0,21360010000.0,14108030000.0,5561941.0,9025550000.0,206.0,197.0,same


In [None]:
data.shape # Check the shape of the data as (rows, columns)

(301, 58)

In [None]:
# Remove the target column OC from the feature table (it is already stored in `label`).
# errors='ignore' keeps the cell safe to re-run: `del data['OC']` raises KeyError on a
# second execution because the column is already gone.
data.drop(columns='OC', inplace=True, errors='ignore')

In [None]:
data.head() # Re-check the first rows of the data

Unnamed: 0,inst_id,sido,sgg,openDate,bedCount,instkind,revenue1,salescost1,sga1,salary1,noi1,noe1,interest1,ctax1,profit1,liquidAsset1,quickAsset1,receivableS1,inventoryAsset1,nonCAsset1,tanAsset1,OnonCAsset1,receivableL1,debt1,liquidLiabilities1,shortLoan1,NCLiabilities1,longLoan1,netAsset1,surplus1,revenue2,salescost2,sga2,salary2,noi2,noe2,interest2,ctax2,profit2,liquidAsset2,quickAsset2,receivableS2,inventoryAsset2,nonCAsset2,tanAsset2,OnonCAsset2,receivableL2,debt2,liquidLiabilities2,shortLoan2,NCLiabilities2,longLoan2,netAsset2,surplus2,employee1,employee2,ownerChange
0,1,choongnam,73,20071228,175.0,nursing_hospital,4217530000.0,0.0,3961135000.0,2033835000.0,15652441.0,15236240.0,13236240.0,31641798.0,225169678.0,1012700000.0,997671900.0,470055700.0,15027810.0,2514586000.0,2360684000.0,143449600.0,0.0,682826000.0,201323700.0,0.0,481502300.0,351000000.0,2844460000.0,1496394000.0,4297848000.0,0.0,4057422000.0,2063787000.0,16194675.0,29983350.0,15683050.0,18808074.0,207829685.0,830169500.0,816570500.0,523702600.0,13598970.0,2548115000.0,2386263000.0,145898600.0,0.0,758993700.0,222876900.0,0.0,536116900.0,390000000.0,2619290000.0,1271224000.0,62.0,64.0,same
1,3,gyeongnam,32,19970401,410.0,general_hospital,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,801.0,813.0,same
2,4,gyeonggi,89,20161228,468.0,nursing_hospital,1004522000.0,515483669.0,447219700.0,296402300.0,76156.0,30000.0,0.0,0.0,41864754.0,272442100.0,253682200.0,8095950.0,18759970.0,120481000.0,120481000.0,0.0,0.0,92414340.0,92414340.0,0.0,0.0,0.0,300508800.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,234.0,1.0,same
3,7,incheon,141,20000814,353.0,general_hospital,72507340000.0,0.0,70677400000.0,31786050000.0,506223059.0,1259568000.0,1196881000.0,173769780.0,902830288.0,13041540000.0,11534750000.0,0.0,1506793000.0,43179360000.0,38320780000.0,3945208000.0,0.0,42363800000.0,27991880000.0,19885200000.0,14371920000.0,7253040000.0,13857100000.0,8643659000.0,66858340000.0,0.0,64924190000.0,29711350000.0,476807804.0,1353672000.0,1277422000.0,218891720.0,838387466.0,11125720000.0,9890540000.0,0.0,1235181000.0,39583560000.0,34855760000.0,3915906000.0,0.0,37755010000.0,17018600000.0,9219427000.0,20736410000.0,15100000000.0,12954270000.0,7740829000.0,663.0,663.0,same
4,9,gyeongnam,32,20050901,196.0,general_hospital,49043540000.0,0.0,47656050000.0,24460780000.0,112352259.0,1419089000.0,1307249000.0,0.0,80749696.0,6317084000.0,5873265000.0,4099320000.0,443818600.0,43667330000.0,43306130000.0,222340000.0,0.0,49898110000.0,28909190000.0,17930380000.0,20988920000.0,13500000000.0,86311640.0,9025550000.0,48082800000.0,0.0,47125800000.0,23460040000.0,597748128.0,1522108000.0,1349851000.0,0.0,32642585.0,4906776000.0,4464017000.0,3365227000.0,442759100.0,46531380000.0,45629450000.0,789340700.0,0.0,51432590000.0,30072590000.0,17593750000.0,21360010000.0,14108030000.0,5561941.0,9025550000.0,206.0,197.0,same


In [None]:
data.describe() # Check the basic summary statistics of the data

Unnamed: 0,inst_id,sgg,openDate,bedCount,revenue1,salescost1,sga1,salary1,noi1,noe1,interest1,ctax1,profit1,liquidAsset1,quickAsset1,receivableS1,inventoryAsset1,nonCAsset1,tanAsset1,OnonCAsset1,receivableL1,debt1,liquidLiabilities1,shortLoan1,NCLiabilities1,longLoan1,netAsset1,surplus1,revenue2,salescost2,sga2,salary2,noi2,noe2,interest2,ctax2,profit2,liquidAsset2,quickAsset2,receivableS2,inventoryAsset2,nonCAsset2,tanAsset2,OnonCAsset2,receivableL2,debt2,liquidLiabilities2,shortLoan2,NCLiabilities2,longLoan2,netAsset2,surplus2,employee1,employee2
count,301.0,301.0,301.0,296.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,291.0,288.0
mean,219.056478,81.039867,20050130.0,145.709459,12881750000.0,2014903000.0,10332440000.0,5654115000.0,269615100.0,511587000.0,186896600.0,78473220.0,210170000.0,3536587000.0,3414762000.0,697586200.0,115453300.0,11406730000.0,10534080000.0,340650300.0,251680.2,8765310000.0,4352925000.0,1758237000.0,4583473000.0,2646564000.0,5938213000.0,1265935000.0,11709050000.0,1842095000.0,9203487000.0,5023230000.0,222381300.0,561766900.0,173636000.0,68322590.0,256343900.0,3222852000.0,3106104000.0,610307000.0,106366700.0,10452140000.0,9651555000.0,323292100.0,222670.0,8146026000.0,3860584000.0,1510050000.0,4471247000.0,2709979000.0,5273919000.0,978627900.0,142.546392,134.326389
std,121.234869,50.969714,88938.15,118.92389,20435430000.0,7460271000.0,14938860000.0,8083343000.0,802370100.0,1060379000.0,273865800.0,248610200.0,1134673000.0,6285995000.0,6093581000.0,1455332000.0,255170200.0,17105160000.0,15745250000.0,935938100.0,4308072.0,12861120000.0,6474262000.0,3194540000.0,7847530000.0,4697523000.0,12660350000.0,8248321000.0,18949980000.0,6744178000.0,13525670000.0,7406530000.0,527463700.0,1121856000.0,269389400.0,242851200.0,1111963000.0,6015755000.0,5823176000.0,1455249000.0,249009900.0,16495380000.0,15217920000.0,878435500.0,3811496.0,12558000000.0,6797242000.0,2953412000.0,7659580000.0,4564001000.0,10812590000.0,4688798000.0,160.191073,151.061786
min,1.0,1.0,19780120.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-4696701000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-20196240000.0,-2914970000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-4064823000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-20147800000.0,-2781507000.0,0.0,0.0
25%,112.0,37.0,20011020.0,52.75,3252112000.0,0.0,2758201000.0,1626053000.0,8217133.0,80133950.0,50664740.0,0.0,-2168279.0,719925300.0,680651200.0,0.0,4177116.0,2900012000.0,2700124000.0,968000.0,0.0,1953077000.0,511012100.0,0.0,428500000.0,0.0,1224795000.0,0.0,2731342000.0,0.0,2253260000.0,1347918000.0,6453360.0,64731680.0,31429690.0,0.0,0.0,470362300.0,452686700.0,0.0,2054789.0,2479446000.0,2338407000.0,0.0,0.0,1283220000.0,285574100.0,0.0,25578780.0,0.0,1017573000.0,0.0,53.5,53.75
50%,230.0,75.0,20071130.0,136.5,5524218000.0,210410500.0,4684074000.0,2659892000.0,43637640.0,183196500.0,109732900.0,1024900.0,81307510.0,1516732000.0,1473801000.0,215856400.0,24083070.0,6100799000.0,5549397000.0,35825260.0,0.0,4457667000.0,1754752000.0,229570400.0,2036935000.0,1136776000.0,3033907000.0,0.0,5005326000.0,173299400.0,4170070000.0,2365338000.0,43262280.0,171601500.0,94223000.0,1006880.0,101169100.0,1234392000.0,1199564000.0,116044900.0,24564310.0,5591346000.0,5124338000.0,27336000.0,0.0,3784553000.0,1454050000.0,85425490.0,1969747000.0,1100000000.0,2894970000.0,0.0,80.0,79.0
75%,321.0,123.0,20111020.0,193.0,12748390000.0,910527800.0,10778760000.0,6363400000.0,205033100.0,420333000.0,224230800.0,44353960.0,293032900.0,3379067000.0,3273841000.0,694060500.0,105226400.0,12973120000.0,12154990000.0,203820000.0,0.0,9437917000.0,5415331000.0,2139742000.0,4901485000.0,3327098000.0,5982550000.0,126160000.0,11956900000.0,842205900.0,9806518000.0,5564048000.0,161976100.0,477447300.0,199357400.0,33234030.0,319725600.0,2959041000.0,2865958000.0,627382500.0,80868500.0,11777320000.0,10972570000.0,233785000.0,0.0,8465053000.0,4364714000.0,1567967000.0,4905441000.0,3360000000.0,5370285000.0,101444700.0,181.5,170.0
max,428.0,178.0,20170610.0,656.0,151000000000.0,98503320000.0,103000000000.0,64035590000.0,9144171000.0,8686380000.0,2841475000.0,2808883000.0,7475427000.0,42808360000.0,41791460000.0,14092570000.0,2067764000.0,156000000000.0,156000000000.0,10085170000.0,73742310.0,86697250000.0,39373880000.0,19885200000.0,55017360000.0,41124900000.0,139000000000.0,134000000000.0,137000000000.0,86581590000.0,98378400000.0,61279110000.0,4035895000.0,8826977000.0,2593366000.0,2790916000.0,10270370000.0,40818690000.0,39705370000.0,16342540000.0,1812222000.0,164000000000.0,163000000000.0,8767966000.0,65242310.0,85088580000.0,68468780000.0,17593750000.0,51503880000.0,32561470000.0,125000000000.0,68527300000.0,1200.0,1200.0


In [None]:
data.info() # Print a summary of the data (column dtypes and non-null counts)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 57 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   inst_id             301 non-null    int64  
 1   sido                301 non-null    object 
 2   sgg                 301 non-null    int64  
 3   openDate            301 non-null    int64  
 4   bedCount            296 non-null    float64
 5   instkind            300 non-null    object 
 6   revenue1            293 non-null    float64
 7   salescost1          293 non-null    float64
 8   sga1                293 non-null    float64
 9   salary1             293 non-null    float64
 10  noi1                293 non-null    float64
 11  noe1                293 non-null    float64
 12  interest1           293 non-null    float64
 13  ctax1               293 non-null    float64
 14  profit1             293 non-null    float64
 15  liquidAsset1        293 non-null    float64
 16  quickAss

In [None]:
# Columns whose dtype is not numeric (reported as `object` by data.info()) are treated as
# categorical; every other column is treated as numeric. Both lists are derived from the
# dtypes so they stay in sync with the DataFrame instead of relying on hand-typed names.
# NOTE(review): inst_id, sgg and openDate are integer-typed, so they end up in num_columns
# even though they look like an identifier / region code / date — confirm they should be scaled.
cat_columns = data.select_dtypes(exclude='number').columns.tolist()
num_columns = data.select_dtypes(include='number').columns.tolist()
print('Categorical Columns: \n{}\n\n Numeric Columns: \n{}\n'.format(cat_columns, num_columns))

Categorical Columns: 
['sido', 'instkind', 'ownerChange']

 Numeric Columns: 
['inst_id', 'sgg', 'openDate', 'bedCount', 'revenue1', 'salescost1', 'sga1', 'salary1', 'noi1', 'noe1', 'interest1', 'ctax1', 'profit1', 'liquidAsset1', 'quickAsset1', 'receivableS1', 'inventoryAsset1', 'nonCAsset1', 'tanAsset1', 'OnonCAsset1', 'receivableL1', 'debt1', 'liquidLiabilities1', 'shortLoan1', 'NCLiabilities1', 'longLoan1', 'netAsset1', 'surplus1', 'revenue2', 'salescost2', 'sga2', 'salary2', 'noi2', 'noe2', 'interest2', 'ctax2', 'profit2', 'liquidAsset2', 'quickAsset2', 'receivableS2', 'inventoryAsset2', 'nonCAsset2', 'tanAsset2', 'OnonCAsset2', 'receivableL2', 'debt2', 'liquidLiabilities2', 'shortLoan2', 'NCLiabilities2', 'longLoan2', 'netAsset2', 'surplus2', 'employee1', 'employee2']



# Scaling

In [None]:
# Pull the values of the numeric columns out of the DataFrame as a NumPy array
numeric_data = data[num_columns].to_numpy()

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler() # Create the object used for min-max scaling

In [None]:
scaler.fit(numeric_data) # Learn each column's min and max; the data itself is only scaled by transform()

MinMaxScaler()

In [None]:
# Apply the fitted scaler to the numeric values and wrap the scaled array in a
# DataFrame that carries the numeric column names
scaled_data = pd.DataFrame(scaler.transform(numeric_data), columns=num_columns)

In [None]:
# First rows of the original (unscaled) numeric columns, for comparison with scaled_data
data[num_columns].head()

Unnamed: 0,inst_id,sgg,openDate,bedCount,revenue1,salescost1,sga1,salary1,noi1,noe1,interest1,ctax1,profit1,liquidAsset1,quickAsset1,receivableS1,inventoryAsset1,nonCAsset1,tanAsset1,OnonCAsset1,receivableL1,debt1,liquidLiabilities1,shortLoan1,NCLiabilities1,longLoan1,netAsset1,surplus1,revenue2,salescost2,sga2,salary2,noi2,noe2,interest2,ctax2,profit2,liquidAsset2,quickAsset2,receivableS2,inventoryAsset2,nonCAsset2,tanAsset2,OnonCAsset2,receivableL2,debt2,liquidLiabilities2,shortLoan2,NCLiabilities2,longLoan2,netAsset2,surplus2,employee1,employee2
0,1,73,20071228,175.0,4217530000.0,0.0,3961135000.0,2033835000.0,15652441.0,15236240.0,13236240.0,31641798.0,225169678.0,1012700000.0,997671900.0,470055700.0,15027810.0,2514586000.0,2360684000.0,143449600.0,0.0,682826000.0,201323700.0,0.0,481502300.0,351000000.0,2844460000.0,1496394000.0,4297848000.0,0.0,4057422000.0,2063787000.0,16194675.0,29983350.0,15683050.0,18808074.0,207829685.0,830169500.0,816570500.0,523702600.0,13598970.0,2548115000.0,2386263000.0,145898600.0,0.0,758993700.0,222876900.0,0.0,536116900.0,390000000.0,2619290000.0,1271224000.0,62.0,64.0
1,3,32,19970401,410.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,801.0,813.0
2,4,89,20161228,468.0,1004522000.0,515483669.0,447219700.0,296402300.0,76156.0,30000.0,0.0,0.0,41864754.0,272442100.0,253682200.0,8095950.0,18759970.0,120481000.0,120481000.0,0.0,0.0,92414340.0,92414340.0,0.0,0.0,0.0,300508800.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,234.0,1.0
3,7,141,20000814,353.0,72507340000.0,0.0,70677400000.0,31786050000.0,506223059.0,1259568000.0,1196881000.0,173769780.0,902830288.0,13041540000.0,11534750000.0,0.0,1506793000.0,43179360000.0,38320780000.0,3945208000.0,0.0,42363800000.0,27991880000.0,19885200000.0,14371920000.0,7253040000.0,13857100000.0,8643659000.0,66858340000.0,0.0,64924190000.0,29711350000.0,476807804.0,1353672000.0,1277422000.0,218891720.0,838387466.0,11125720000.0,9890540000.0,0.0,1235181000.0,39583560000.0,34855760000.0,3915906000.0,0.0,37755010000.0,17018600000.0,9219427000.0,20736410000.0,15100000000.0,12954270000.0,7740829000.0,663.0,663.0
4,9,32,20050901,196.0,49043540000.0,0.0,47656050000.0,24460780000.0,112352259.0,1419089000.0,1307249000.0,0.0,80749696.0,6317084000.0,5873265000.0,4099320000.0,443818600.0,43667330000.0,43306130000.0,222340000.0,0.0,49898110000.0,28909190000.0,17930380000.0,20988920000.0,13500000000.0,86311640.0,9025550000.0,48082800000.0,0.0,47125800000.0,23460040000.0,597748128.0,1522108000.0,1349851000.0,0.0,32642585.0,4906776000.0,4464017000.0,3365227000.0,442759100.0,46531380000.0,45629450000.0,789340700.0,0.0,51432590000.0,30072590000.0,17593750000.0,21360010000.0,14108030000.0,5561941.0,9025550000.0,206.0,197.0


In [None]:
# Summary statistics of the original (unscaled) numeric columns
data[num_columns].describe()

Unnamed: 0,inst_id,sgg,openDate,bedCount,revenue1,salescost1,sga1,salary1,noi1,noe1,interest1,ctax1,profit1,liquidAsset1,quickAsset1,receivableS1,inventoryAsset1,nonCAsset1,tanAsset1,OnonCAsset1,receivableL1,debt1,liquidLiabilities1,shortLoan1,NCLiabilities1,longLoan1,netAsset1,surplus1,revenue2,salescost2,sga2,salary2,noi2,noe2,interest2,ctax2,profit2,liquidAsset2,quickAsset2,receivableS2,inventoryAsset2,nonCAsset2,tanAsset2,OnonCAsset2,receivableL2,debt2,liquidLiabilities2,shortLoan2,NCLiabilities2,longLoan2,netAsset2,surplus2,employee1,employee2
count,301.0,301.0,301.0,296.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,291.0,288.0
mean,219.056478,81.039867,20050130.0,145.709459,12881750000.0,2014903000.0,10332440000.0,5654115000.0,269615100.0,511587000.0,186896600.0,78473220.0,210170000.0,3536587000.0,3414762000.0,697586200.0,115453300.0,11406730000.0,10534080000.0,340650300.0,251680.2,8765310000.0,4352925000.0,1758237000.0,4583473000.0,2646564000.0,5938213000.0,1265935000.0,11709050000.0,1842095000.0,9203487000.0,5023230000.0,222381300.0,561766900.0,173636000.0,68322590.0,256343900.0,3222852000.0,3106104000.0,610307000.0,106366700.0,10452140000.0,9651555000.0,323292100.0,222670.0,8146026000.0,3860584000.0,1510050000.0,4471247000.0,2709979000.0,5273919000.0,978627900.0,142.546392,134.326389
std,121.234869,50.969714,88938.15,118.92389,20435430000.0,7460271000.0,14938860000.0,8083343000.0,802370100.0,1060379000.0,273865800.0,248610200.0,1134673000.0,6285995000.0,6093581000.0,1455332000.0,255170200.0,17105160000.0,15745250000.0,935938100.0,4308072.0,12861120000.0,6474262000.0,3194540000.0,7847530000.0,4697523000.0,12660350000.0,8248321000.0,18949980000.0,6744178000.0,13525670000.0,7406530000.0,527463700.0,1121856000.0,269389400.0,242851200.0,1111963000.0,6015755000.0,5823176000.0,1455249000.0,249009900.0,16495380000.0,15217920000.0,878435500.0,3811496.0,12558000000.0,6797242000.0,2953412000.0,7659580000.0,4564001000.0,10812590000.0,4688798000.0,160.191073,151.061786
min,1.0,1.0,19780120.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-4696701000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-20196240000.0,-2914970000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-4064823000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-20147800000.0,-2781507000.0,0.0,0.0
25%,112.0,37.0,20011020.0,52.75,3252112000.0,0.0,2758201000.0,1626053000.0,8217133.0,80133950.0,50664740.0,0.0,-2168279.0,719925300.0,680651200.0,0.0,4177116.0,2900012000.0,2700124000.0,968000.0,0.0,1953077000.0,511012100.0,0.0,428500000.0,0.0,1224795000.0,0.0,2731342000.0,0.0,2253260000.0,1347918000.0,6453360.0,64731680.0,31429690.0,0.0,0.0,470362300.0,452686700.0,0.0,2054789.0,2479446000.0,2338407000.0,0.0,0.0,1283220000.0,285574100.0,0.0,25578780.0,0.0,1017573000.0,0.0,53.5,53.75
50%,230.0,75.0,20071130.0,136.5,5524218000.0,210410500.0,4684074000.0,2659892000.0,43637640.0,183196500.0,109732900.0,1024900.0,81307510.0,1516732000.0,1473801000.0,215856400.0,24083070.0,6100799000.0,5549397000.0,35825260.0,0.0,4457667000.0,1754752000.0,229570400.0,2036935000.0,1136776000.0,3033907000.0,0.0,5005326000.0,173299400.0,4170070000.0,2365338000.0,43262280.0,171601500.0,94223000.0,1006880.0,101169100.0,1234392000.0,1199564000.0,116044900.0,24564310.0,5591346000.0,5124338000.0,27336000.0,0.0,3784553000.0,1454050000.0,85425490.0,1969747000.0,1100000000.0,2894970000.0,0.0,80.0,79.0
75%,321.0,123.0,20111020.0,193.0,12748390000.0,910527800.0,10778760000.0,6363400000.0,205033100.0,420333000.0,224230800.0,44353960.0,293032900.0,3379067000.0,3273841000.0,694060500.0,105226400.0,12973120000.0,12154990000.0,203820000.0,0.0,9437917000.0,5415331000.0,2139742000.0,4901485000.0,3327098000.0,5982550000.0,126160000.0,11956900000.0,842205900.0,9806518000.0,5564048000.0,161976100.0,477447300.0,199357400.0,33234030.0,319725600.0,2959041000.0,2865958000.0,627382500.0,80868500.0,11777320000.0,10972570000.0,233785000.0,0.0,8465053000.0,4364714000.0,1567967000.0,4905441000.0,3360000000.0,5370285000.0,101444700.0,181.5,170.0
max,428.0,178.0,20170610.0,656.0,151000000000.0,98503320000.0,103000000000.0,64035590000.0,9144171000.0,8686380000.0,2841475000.0,2808883000.0,7475427000.0,42808360000.0,41791460000.0,14092570000.0,2067764000.0,156000000000.0,156000000000.0,10085170000.0,73742310.0,86697250000.0,39373880000.0,19885200000.0,55017360000.0,41124900000.0,139000000000.0,134000000000.0,137000000000.0,86581590000.0,98378400000.0,61279110000.0,4035895000.0,8826977000.0,2593366000.0,2790916000.0,10270370000.0,40818690000.0,39705370000.0,16342540000.0,1812222000.0,164000000000.0,163000000000.0,8767966000.0,65242310.0,85088580000.0,68468780000.0,17593750000.0,51503880000.0,32561470000.0,125000000000.0,68527300000.0,1200.0,1200.0


In [None]:
# First rows of the scaled numeric columns
scaled_data.head()

Unnamed: 0,inst_id,sgg,openDate,bedCount,revenue1,salescost1,sga1,salary1,noi1,noe1,interest1,ctax1,profit1,liquidAsset1,quickAsset1,receivableS1,inventoryAsset1,nonCAsset1,tanAsset1,OnonCAsset1,receivableL1,debt1,liquidLiabilities1,shortLoan1,NCLiabilities1,longLoan1,netAsset1,surplus1,revenue2,salescost2,sga2,salary2,noi2,noe2,interest2,ctax2,profit2,liquidAsset2,quickAsset2,receivableS2,inventoryAsset2,nonCAsset2,tanAsset2,OnonCAsset2,receivableL2,debt2,liquidLiabilities2,shortLoan2,NCLiabilities2,longLoan2,netAsset2,surplus2,employee1,employee2
0,0.0,0.40678,0.745497,0.266768,0.027931,0.0,0.038458,0.031761,0.001712,0.001754,0.004658,0.011265,0.404356,0.023657,0.023873,0.033355,0.007268,0.016119,0.015133,0.014224,0.0,0.007876,0.005113,0.0,0.008752,0.008535,0.144731,0.03222,0.031371,0.0,0.041243,0.033678,0.004013,0.003397,0.006047,0.006739,0.298053,0.020338,0.020566,0.032045,0.007504,0.015537,0.01464,0.01664,0.0,0.00892,0.003255,0.0,0.010409,0.011977,0.156855,0.056834,0.051667,0.053333
1,0.004684,0.175141,0.487286,0.625,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.6675,0.6775
2,0.007026,0.497175,0.975981,0.713415,0.006652,0.005233,0.004342,0.004629,8e-06,3e-06,0.0,0.0,0.389296,0.006364,0.00607,0.000574,0.009073,0.000772,0.000772,0.0,0.0,0.001066,0.002347,0.0,0.0,0.0,0.128751,0.02129,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.283556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.138809,0.039006,0.195,0.000833
3,0.014052,0.79096,0.565172,0.53811,0.480181,0.0,0.686188,0.496381,0.05536,0.145005,0.421218,0.061864,0.460029,0.304649,0.276007,0.0,0.728707,0.276791,0.245646,0.391189,0.0,0.488641,0.710925,1.0,0.261225,0.176366,0.213908,0.084422,0.488017,0.0,0.659944,0.484853,0.118142,0.153356,0.492573,0.07843,0.34204,0.272564,0.249098,0.0,0.681584,0.241363,0.213839,0.446615,0.0,0.443714,0.24856,0.524017,0.402618,0.463738,0.228058,0.14756,0.5525,0.5525
4,0.018735,0.175141,0.693441,0.29878,0.324792,0.0,0.46268,0.381987,0.012287,0.163369,0.46006,0.0,0.392491,0.147567,0.140537,0.290885,0.214637,0.279919,0.277603,0.022046,0.0,0.575544,0.734222,0.901695,0.381496,0.328268,0.127406,0.087211,0.350969,0.0,0.479026,0.382839,0.148108,0.172438,0.520502,0.0,0.285833,0.120209,0.112429,0.205918,0.244318,0.283728,0.279935,0.090026,0.0,0.604459,0.439216,1.0,0.414726,0.433274,0.138847,0.165576,0.171667,0.164167


In [None]:
# Summary statistics of the scaled numeric columns
scaled_data.describe()

Unnamed: 0,inst_id,sgg,openDate,bedCount,revenue1,salescost1,sga1,salary1,noi1,noe1,interest1,ctax1,profit1,liquidAsset1,quickAsset1,receivableS1,inventoryAsset1,nonCAsset1,tanAsset1,OnonCAsset1,receivableL1,debt1,liquidLiabilities1,shortLoan1,NCLiabilities1,longLoan1,netAsset1,surplus1,revenue2,salescost2,sga2,salary2,noi2,noe2,interest2,ctax2,profit2,liquidAsset2,quickAsset2,receivableS2,inventoryAsset2,nonCAsset2,tanAsset2,OnonCAsset2,receivableL2,debt2,liquidLiabilities2,shortLoan2,NCLiabilities2,longLoan2,netAsset2,surplus2,employee1,employee2
count,301.0,301.0,301.0,296.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,291.0,288.0
mean,0.510671,0.452203,0.691456,0.222118,0.08531,0.020455,0.100315,0.088296,0.029485,0.058895,0.065774,0.027938,0.403124,0.082614,0.08171,0.0495,0.055835,0.07312,0.067526,0.033777,0.003413,0.101103,0.110554,0.088419,0.08331,0.064354,0.164165,0.030537,0.085468,0.021276,0.093552,0.081973,0.055101,0.063642,0.066954,0.02448,0.301438,0.078955,0.078229,0.037345,0.058694,0.063733,0.059212,0.036872,0.003413,0.095736,0.056385,0.085829,0.086814,0.083227,0.175144,0.05273,0.118789,0.111939
std,0.283922,0.287964,0.227764,0.181286,0.135334,0.075736,0.145037,0.126232,0.087747,0.122074,0.096382,0.088509,0.093219,0.14684,0.145809,0.103269,0.123404,0.109648,0.100931,0.092803,0.058421,0.148345,0.16443,0.160649,0.142637,0.114226,0.079527,0.060244,0.138321,0.077894,0.137486,0.120865,0.130693,0.127094,0.103876,0.087015,0.077569,0.147377,0.14666,0.089047,0.137406,0.100582,0.093361,0.100187,0.058421,0.147587,0.099275,0.167867,0.148719,0.140166,0.074494,0.065753,0.133493,0.125885
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.259953,0.20339,0.591301,0.080412,0.021537,0.0,0.026779,0.025393,0.000899,0.009225,0.01783,0.0,0.385679,0.016817,0.016287,0.0,0.00202,0.01859,0.017308,9.6e-05,0.0,0.022528,0.012978,0.0,0.007788,0.0,0.134557,0.02129,0.019937,0.0,0.022904,0.021996,0.001599,0.007333,0.012119,0.0,0.283556,0.011523,0.011401,0.0,0.001134,0.015119,0.014346,0.0,0.0,0.015081,0.004171,0.0,0.000497,0.0,0.145819,0.039006,0.044583,0.044792
50%,0.5363,0.418079,0.745236,0.208079,0.036584,0.002136,0.045476,0.041538,0.004772,0.02109,0.038618,0.000365,0.392537,0.035431,0.035266,0.015317,0.011647,0.039108,0.035573,0.003552,0.0,0.051416,0.044566,0.011545,0.037023,0.027642,0.145921,0.02129,0.036535,0.002002,0.042388,0.038599,0.010719,0.019441,0.036332,0.000361,0.290613,0.030241,0.030212,0.007101,0.013555,0.034094,0.031438,0.003118,0.0,0.044478,0.021237,0.004855,0.038245,0.033782,0.158754,0.039006,0.066667,0.065833
75%,0.749415,0.689266,0.847404,0.294207,0.084426,0.009244,0.104648,0.099373,0.022422,0.04839,0.078914,0.015791,0.409931,0.078935,0.078338,0.04925,0.050889,0.083161,0.077917,0.02021,0.0,0.108861,0.137536,0.107605,0.08909,0.080902,0.164444,0.022212,0.087277,0.009727,0.099682,0.090798,0.040134,0.05409,0.076872,0.011908,0.305859,0.072492,0.072181,0.03839,0.044624,0.071813,0.067316,0.026664,0.0,0.099485,0.063748,0.089121,0.095244,0.103189,0.175808,0.040429,0.15125,0.141667
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler() # scaler object that standardizes each column to zero mean and unit variance

In [None]:
scaler.fit(numeric_data) # learn each column's mean and standard deviation; nothing is scaled until transform()

StandardScaler()

In [None]:
# Apply the statistics learned by fit() to standardize every column, and wrap
# the resulting array in a DataFrame so the column names are kept.
scaled_data = pd.DataFrame(
    scaler.transform(numeric_data),
    columns=num_columns,
)

In [None]:
# Preview the first rows of the unscaled numeric columns, for comparison with the scaled values.
data[num_columns].head()

Unnamed: 0,inst_id,sgg,openDate,bedCount,revenue1,salescost1,sga1,salary1,noi1,noe1,interest1,ctax1,profit1,liquidAsset1,quickAsset1,receivableS1,inventoryAsset1,nonCAsset1,tanAsset1,OnonCAsset1,receivableL1,debt1,liquidLiabilities1,shortLoan1,NCLiabilities1,longLoan1,netAsset1,surplus1,revenue2,salescost2,sga2,salary2,noi2,noe2,interest2,ctax2,profit2,liquidAsset2,quickAsset2,receivableS2,inventoryAsset2,nonCAsset2,tanAsset2,OnonCAsset2,receivableL2,debt2,liquidLiabilities2,shortLoan2,NCLiabilities2,longLoan2,netAsset2,surplus2,employee1,employee2
0,1,73,20071228,175.0,4217530000.0,0.0,3961135000.0,2033835000.0,15652441.0,15236240.0,13236240.0,31641798.0,225169678.0,1012700000.0,997671900.0,470055700.0,15027810.0,2514586000.0,2360684000.0,143449600.0,0.0,682826000.0,201323700.0,0.0,481502300.0,351000000.0,2844460000.0,1496394000.0,4297848000.0,0.0,4057422000.0,2063787000.0,16194675.0,29983350.0,15683050.0,18808074.0,207829685.0,830169500.0,816570500.0,523702600.0,13598970.0,2548115000.0,2386263000.0,145898600.0,0.0,758993700.0,222876900.0,0.0,536116900.0,390000000.0,2619290000.0,1271224000.0,62.0,64.0
1,3,32,19970401,410.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,801.0,813.0
2,4,89,20161228,468.0,1004522000.0,515483669.0,447219700.0,296402300.0,76156.0,30000.0,0.0,0.0,41864754.0,272442100.0,253682200.0,8095950.0,18759970.0,120481000.0,120481000.0,0.0,0.0,92414340.0,92414340.0,0.0,0.0,0.0,300508800.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,234.0,1.0
3,7,141,20000814,353.0,72507340000.0,0.0,70677400000.0,31786050000.0,506223059.0,1259568000.0,1196881000.0,173769780.0,902830288.0,13041540000.0,11534750000.0,0.0,1506793000.0,43179360000.0,38320780000.0,3945208000.0,0.0,42363800000.0,27991880000.0,19885200000.0,14371920000.0,7253040000.0,13857100000.0,8643659000.0,66858340000.0,0.0,64924190000.0,29711350000.0,476807804.0,1353672000.0,1277422000.0,218891720.0,838387466.0,11125720000.0,9890540000.0,0.0,1235181000.0,39583560000.0,34855760000.0,3915906000.0,0.0,37755010000.0,17018600000.0,9219427000.0,20736410000.0,15100000000.0,12954270000.0,7740829000.0,663.0,663.0
4,9,32,20050901,196.0,49043540000.0,0.0,47656050000.0,24460780000.0,112352259.0,1419089000.0,1307249000.0,0.0,80749696.0,6317084000.0,5873265000.0,4099320000.0,443818600.0,43667330000.0,43306130000.0,222340000.0,0.0,49898110000.0,28909190000.0,17930380000.0,20988920000.0,13500000000.0,86311640.0,9025550000.0,48082800000.0,0.0,47125800000.0,23460040000.0,597748128.0,1522108000.0,1349851000.0,0.0,32642585.0,4906776000.0,4464017000.0,3365227000.0,442759100.0,46531380000.0,45629450000.0,789340700.0,0.0,51432590000.0,30072590000.0,17593750000.0,21360010000.0,14108030000.0,5561941.0,9025550000.0,206.0,197.0


In [None]:
# Summary statistics of the unscaled numeric columns.
data[num_columns].describe()

Unnamed: 0,inst_id,sgg,openDate,bedCount,revenue1,salescost1,sga1,salary1,noi1,noe1,interest1,ctax1,profit1,liquidAsset1,quickAsset1,receivableS1,inventoryAsset1,nonCAsset1,tanAsset1,OnonCAsset1,receivableL1,debt1,liquidLiabilities1,shortLoan1,NCLiabilities1,longLoan1,netAsset1,surplus1,revenue2,salescost2,sga2,salary2,noi2,noe2,interest2,ctax2,profit2,liquidAsset2,quickAsset2,receivableS2,inventoryAsset2,nonCAsset2,tanAsset2,OnonCAsset2,receivableL2,debt2,liquidLiabilities2,shortLoan2,NCLiabilities2,longLoan2,netAsset2,surplus2,employee1,employee2
count,301.0,301.0,301.0,296.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,291.0,288.0
mean,219.056478,81.039867,20050130.0,145.709459,12881750000.0,2014903000.0,10332440000.0,5654115000.0,269615100.0,511587000.0,186896600.0,78473220.0,210170000.0,3536587000.0,3414762000.0,697586200.0,115453300.0,11406730000.0,10534080000.0,340650300.0,251680.2,8765310000.0,4352925000.0,1758237000.0,4583473000.0,2646564000.0,5938213000.0,1265935000.0,11709050000.0,1842095000.0,9203487000.0,5023230000.0,222381300.0,561766900.0,173636000.0,68322590.0,256343900.0,3222852000.0,3106104000.0,610307000.0,106366700.0,10452140000.0,9651555000.0,323292100.0,222670.0,8146026000.0,3860584000.0,1510050000.0,4471247000.0,2709979000.0,5273919000.0,978627900.0,142.546392,134.326389
std,121.234869,50.969714,88938.15,118.92389,20435430000.0,7460271000.0,14938860000.0,8083343000.0,802370100.0,1060379000.0,273865800.0,248610200.0,1134673000.0,6285995000.0,6093581000.0,1455332000.0,255170200.0,17105160000.0,15745250000.0,935938100.0,4308072.0,12861120000.0,6474262000.0,3194540000.0,7847530000.0,4697523000.0,12660350000.0,8248321000.0,18949980000.0,6744178000.0,13525670000.0,7406530000.0,527463700.0,1121856000.0,269389400.0,242851200.0,1111963000.0,6015755000.0,5823176000.0,1455249000.0,249009900.0,16495380000.0,15217920000.0,878435500.0,3811496.0,12558000000.0,6797242000.0,2953412000.0,7659580000.0,4564001000.0,10812590000.0,4688798000.0,160.191073,151.061786
min,1.0,1.0,19780120.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-4696701000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-20196240000.0,-2914970000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-4064823000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-20147800000.0,-2781507000.0,0.0,0.0
25%,112.0,37.0,20011020.0,52.75,3252112000.0,0.0,2758201000.0,1626053000.0,8217133.0,80133950.0,50664740.0,0.0,-2168279.0,719925300.0,680651200.0,0.0,4177116.0,2900012000.0,2700124000.0,968000.0,0.0,1953077000.0,511012100.0,0.0,428500000.0,0.0,1224795000.0,0.0,2731342000.0,0.0,2253260000.0,1347918000.0,6453360.0,64731680.0,31429690.0,0.0,0.0,470362300.0,452686700.0,0.0,2054789.0,2479446000.0,2338407000.0,0.0,0.0,1283220000.0,285574100.0,0.0,25578780.0,0.0,1017573000.0,0.0,53.5,53.75
50%,230.0,75.0,20071130.0,136.5,5524218000.0,210410500.0,4684074000.0,2659892000.0,43637640.0,183196500.0,109732900.0,1024900.0,81307510.0,1516732000.0,1473801000.0,215856400.0,24083070.0,6100799000.0,5549397000.0,35825260.0,0.0,4457667000.0,1754752000.0,229570400.0,2036935000.0,1136776000.0,3033907000.0,0.0,5005326000.0,173299400.0,4170070000.0,2365338000.0,43262280.0,171601500.0,94223000.0,1006880.0,101169100.0,1234392000.0,1199564000.0,116044900.0,24564310.0,5591346000.0,5124338000.0,27336000.0,0.0,3784553000.0,1454050000.0,85425490.0,1969747000.0,1100000000.0,2894970000.0,0.0,80.0,79.0
75%,321.0,123.0,20111020.0,193.0,12748390000.0,910527800.0,10778760000.0,6363400000.0,205033100.0,420333000.0,224230800.0,44353960.0,293032900.0,3379067000.0,3273841000.0,694060500.0,105226400.0,12973120000.0,12154990000.0,203820000.0,0.0,9437917000.0,5415331000.0,2139742000.0,4901485000.0,3327098000.0,5982550000.0,126160000.0,11956900000.0,842205900.0,9806518000.0,5564048000.0,161976100.0,477447300.0,199357400.0,33234030.0,319725600.0,2959041000.0,2865958000.0,627382500.0,80868500.0,11777320000.0,10972570000.0,233785000.0,0.0,8465053000.0,4364714000.0,1567967000.0,4905441000.0,3360000000.0,5370285000.0,101444700.0,181.5,170.0
max,428.0,178.0,20170610.0,656.0,151000000000.0,98503320000.0,103000000000.0,64035590000.0,9144171000.0,8686380000.0,2841475000.0,2808883000.0,7475427000.0,42808360000.0,41791460000.0,14092570000.0,2067764000.0,156000000000.0,156000000000.0,10085170000.0,73742310.0,86697250000.0,39373880000.0,19885200000.0,55017360000.0,41124900000.0,139000000000.0,134000000000.0,137000000000.0,86581590000.0,98378400000.0,61279110000.0,4035895000.0,8826977000.0,2593366000.0,2790916000.0,10270370000.0,40818690000.0,39705370000.0,16342540000.0,1812222000.0,164000000000.0,163000000000.0,8767966000.0,65242310.0,85088580000.0,68468780000.0,17593750000.0,51503880000.0,32561470000.0,125000000000.0,68527300000.0,1200.0,1200.0


In [None]:
# Preview the first rows after standard scaling.
scaled_data.head()

Unnamed: 0,inst_id,sgg,openDate,bedCount,revenue1,salescost1,sga1,salary1,noi1,noe1,interest1,ctax1,profit1,liquidAsset1,quickAsset1,receivableS1,inventoryAsset1,nonCAsset1,tanAsset1,OnonCAsset1,receivableL1,debt1,liquidLiabilities1,shortLoan1,NCLiabilities1,longLoan1,netAsset1,surplus1,revenue2,salescost2,sga2,salary2,noi2,noe2,interest2,ctax2,profit2,liquidAsset2,quickAsset2,receivableS2,inventoryAsset2,nonCAsset2,tanAsset2,OnonCAsset2,receivableL2,debt2,liquidLiabilities2,shortLoan2,NCLiabilities2,longLoan2,netAsset2,surplus2,employee1,employee2
0,-1.801624,-0.158001,0.237663,0.246714,-0.424705,-0.270546,-0.427222,-0.448635,-0.317057,-0.468889,-0.635192,-0.188695,0.013242,-0.402196,-0.39734,-0.15661,-0.394236,-0.520741,-0.519991,-0.211059,-0.058521,-0.629518,-0.642344,-0.55133,-0.523603,-0.489511,-0.244784,0.027988,-0.391762,-0.273606,-0.381118,-0.400256,-0.391571,-0.474832,-0.58734,-0.204237,-0.043704,-0.398417,-0.393849,-0.059614,-0.373184,-0.479986,-0.478234,-0.202288,-0.058521,-0.58924,-0.53609,-0.512165,-0.514632,-0.509191,-0.245933,0.06251,-0.503681,-0.466358
1,-1.785099,-0.96374,-0.8979,2.226114,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4.117507,4.500509
2,-1.776837,0.156434,1.251288,2.714647,-0.582202,-0.201331,-0.662844,-0.663943,-0.336503,-0.483254,-0.683606,-0.316188,-0.148583,-0.520161,-0.519643,-0.474579,-0.379585,-0.660944,-0.662512,-0.364589,-0.058521,-0.675504,-0.659195,-0.55133,-0.585065,-0.564359,-0.446066,-0.15374,-0.61895,-0.273606,-0.68161,-0.679377,-0.422326,-0.501605,-0.645657,-0.281816,-0.230927,-0.536652,-0.534316,-0.420101,-0.427889,-0.634724,-0.635308,-0.368661,-0.058521,-0.649782,-0.568935,-0.512165,-0.584744,-0.594788,-0.488592,-0.209073,0.571887,-0.884131
3,-1.75205,1.178346,-0.555374,1.746004,2.922747,-0.270546,4.046372,3.238344,0.295391,0.706596,3.694191,0.383973,0.611493,1.514672,1.334828,-0.480151,5.461925,1.860666,1.767786,3.857867,-0.058521,2.616877,3.657467,5.684066,1.249462,0.982296,0.626557,0.895982,2.915235,-0.273606,4.126675,3.338993,0.483184,0.707096,4.104372,0.621066,0.524333,1.315943,1.167068,-0.420101,4.540965,1.769056,1.659053,4.096784,-0.058521,2.361813,1.939099,2.614795,2.127139,2.719372,0.711531,1.444671,3.254552,3.50581
4,-1.735526,-0.96374,0.008731,0.423596,1.772591,-0.270546,2.502698,2.330576,-0.196333,0.857292,4.097879,-0.316188,-0.114255,0.443089,0.404148,2.341428,1.289049,1.889243,2.084954,-0.126625,-0.058521,3.2037,3.799395,5.071093,2.0941,2.314412,-0.463013,0.94236,1.922745,-0.273606,2.808526,2.493523,0.712863,0.857494,4.373696,-0.281816,-0.201521,0.280398,0.23359,1.896331,1.353231,2.190975,2.368225,0.531452,-0.058521,3.45283,3.862868,5.45512,2.208692,2.501653,-0.488076,1.719138,0.396794,0.415609


In [None]:
# Summary statistics after standard scaling. describe() reports the sample
# standard deviation (ddof=1) while StandardScaler divides by the population
# standard deviation (ddof=0), so the std row is close to 1 rather than exactly 1.
scaled_data.describe()

Unnamed: 0,inst_id,sgg,openDate,bedCount,revenue1,salescost1,sga1,salary1,noi1,noe1,interest1,ctax1,profit1,liquidAsset1,quickAsset1,receivableS1,inventoryAsset1,nonCAsset1,tanAsset1,OnonCAsset1,receivableL1,debt1,liquidLiabilities1,shortLoan1,NCLiabilities1,longLoan1,netAsset1,surplus1,revenue2,salescost2,sga2,salary2,noi2,noe2,interest2,ctax2,profit2,liquidAsset2,quickAsset2,receivableS2,inventoryAsset2,nonCAsset2,tanAsset2,OnonCAsset2,receivableL2,debt2,liquidLiabilities2,shortLoan2,NCLiabilities2,longLoan2,netAsset2,surplus2,employee1,employee2
count,301.0,301.0,301.0,296.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,291.0,288.0
mean,2.301592e-16,-8.218325e-17,9.617076e-15,-3.8445220000000005e-17,1.401988e-17,-1.9135240000000002e-17,-4.2438560000000006e-17,-2.3113860000000003e-17,-3.107109e-17,-4.6038260000000004e-17,2.349277e-17,-1.364097e-16,7.294127e-18,5.0016870000000006e-17,-1.2883130000000001e-17,1.231476e-16,1.114012e-16,2.903442e-17,-2.6962220000000002e-17,-7.767772e-17,1.254448e-16,4.528043e-17,-8.535076000000001e-17,-6.290001000000001e-17,6.100543e-17,5.1295710000000005e-17,3.98572e-17,-7.199398000000001e-17,5.759519e-17,-1.498612e-16,-8.942411e-17,-9.018194000000001e-17,-3.4102410000000006e-17,-3.258675e-17,-6.612079000000001e-17,1.163271e-16,-5.2479830000000004e-17,1.326205e-18,4.281748e-17,2.0082530000000002e-17,-3.021853e-17,7.068258e-17,-8.710325000000001e-17,-1.360307e-16,1.254448e-16,-5.626898e-17,-2.936597e-17,-2.462952e-16,6.725754000000001e-17,-1.34894e-16,2.041408e-17,-1.288313e-16,7.973767e-17,-1.098658e-16
std,1.001665,1.001665,1.001665,1.001693,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001711,1.001723,1.001741
min,-1.801624,-1.572957,-3.040893,-1.227308,-0.6314418,-0.2705465,-0.6928316,-0.700674,-0.3365983,-0.4832821,-0.6836061,-0.3161876,-4.331877,-0.5635762,-0.5613454,-0.4801514,-0.4532303,-0.6679998,-0.670177,-0.3645894,-0.05852057,-0.6827015,-0.6734933,-0.55133,-0.5850649,-0.5643594,-2.067807,-0.5077467,-0.6189499,-0.2736059,-0.6816102,-0.6793767,-0.4223262,-0.5016046,-0.6456566,-0.2818165,-3.89272,-0.5366519,-0.5343164,-0.4201008,-0.4278894,-0.6347245,-0.6353082,-0.3686614,-0.05852057,-0.6497823,-0.5689349,-0.5121648,-0.5847444,-0.5947885,-2.355144,-0.803312,-0.8913852,-0.8907627
25%,-0.8845208,-0.8654788,-0.4404626,-0.7829956,-0.4720287,-0.2705465,-0.5078832,-0.4991689,-0.3263397,-0.4075817,-0.4982911,-0.3161876,-0.1874562,-0.4488518,-0.4494546,-0.4801514,-0.4368324,-0.4981695,-0.4983954,-0.3635534,-0.05852057,-0.5305827,-0.5944285,-0.55133,-0.5303684,-0.5643594,-0.3729344,-0.1537405,-0.474569,-0.2736059,-0.5147337,-0.4970749,-0.4100706,-0.4438053,-0.5287869,-0.2818165,-0.2309272,-0.4583297,-0.4564446,-0.4201008,-0.4196234,-0.4841558,-0.4813839,-0.3686614,-0.05852057,-0.547424,-0.5268498,-0.5121648,-0.5813993,-0.5947885,-0.3943206,-0.2090732,-0.5568337,-0.5343287
50%,0.09041743,-0.1186965,0.2365146,-0.07757109,-0.3606538,-0.2422941,-0.3787456,-0.3710526,-0.2821193,-0.3102214,-0.2822394,-0.3120581,-0.1137622,-0.3218759,-0.3190704,-0.3315766,-0.3586884,-0.3107254,-0.317125,-0.3262465,-0.05852057,-0.3355083,-0.4019946,-0.4793437,-0.325057,-0.3219507,-0.2297941,-0.1537405,-0.3543644,-0.2478658,-0.3727748,-0.3594719,-0.3401665,-0.3483807,-0.2952931,-0.2776633,-0.1397891,-0.3311077,-0.3279657,-0.3402221,-0.3290727,-0.2951803,-0.2980016,-0.3374892,-0.05852057,-0.3479007,-0.3546513,-0.483191,-0.3271433,-0.3533595,-0.2203929,-0.2090732,-0.3911213,-0.3668876
75%,0.8422765,0.8246075,0.6858317,0.3983273,-0.006536816,-0.1482874,0.02992775,0.08789668,-0.08062681,-0.08620512,0.1365564,-0.1374748,0.07315292,-0.02510163,-0.02316566,-0.002426746,-0.04014736,0.09173111,0.1031221,-0.146446,-0.05852057,0.05238717,0.1643776,0.1196283,0.04059314,0.1451187,0.003508015,-0.1384191,0.01310153,-0.1485133,0.04466049,0.07314394,-0.1147159,-0.07528943,0.09564364,-0.144733,0.05709732,-0.04392844,-0.04131026,0.01175385,-0.1025736,0.0804741,0.08695521,-0.1020681,-0.05852057,0.02544773,0.07429374,0.0196436,0.05678339,0.1426672,0.008927612,-0.1874007,0.2435886,0.2365635
max,1.726331,1.905477,1.356919,4.298167,6.770326,12.95576,6.213733,7.234799,11.07935,7.7225,9.709574,11.00148,6.413905,6.258191,6.308663,9.219824,7.664102,8.467659,9.254523,10.42931,17.08801,6.069865,5.418513,5.684066,6.437717,8.20521,10.5281,16.11978,6.622979,12.58633,6.604293,7.608439,7.242277,7.380049,8.997643,11.23013,9.021124,6.260256,6.295857,10.82918,6.862272,9.324462,10.09408,9.629758,17.08801,6.13746,9.521322,5.45512,6.150873,6.551831,11.09178,14.43104,6.612573,7.066834


기존 데이터에서는 변수별로 서로 다른 평균과 표준 편차 값을 가지고 있었습니다.<br>
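Standard 스케일링된 데이터를 살펴보면, 평균이 0, 표준편차가 1에 가깝게 되었음을 확인할 수 있습니다. (표의 std가 정확히 1이 아니라 약 1.0017로 나오는 이유는 StandardScaler는 모표준편차(ddof=0)로 나누고, pandas의 describe()는 표본표준편차(ddof=1)를 계산하기 때문입니다.)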

# Imputation

### 1. 평균으로 imputation

In [None]:
# Preview the first rows of the full dataset before imputation.
data.head()

Unnamed: 0,inst_id,sido,sgg,openDate,bedCount,instkind,revenue1,salescost1,sga1,salary1,noi1,noe1,interest1,ctax1,profit1,liquidAsset1,quickAsset1,receivableS1,inventoryAsset1,nonCAsset1,tanAsset1,OnonCAsset1,receivableL1,debt1,liquidLiabilities1,shortLoan1,NCLiabilities1,longLoan1,netAsset1,surplus1,revenue2,salescost2,sga2,salary2,noi2,noe2,interest2,ctax2,profit2,liquidAsset2,quickAsset2,receivableS2,inventoryAsset2,nonCAsset2,tanAsset2,OnonCAsset2,receivableL2,debt2,liquidLiabilities2,shortLoan2,NCLiabilities2,longLoan2,netAsset2,surplus2,employee1,employee2,ownerChange
0,1,choongnam,73,20071228,175.0,nursing_hospital,4217530000.0,0.0,3961135000.0,2033835000.0,15652441.0,15236240.0,13236240.0,31641798.0,225169678.0,1012700000.0,997671900.0,470055700.0,15027810.0,2514586000.0,2360684000.0,143449600.0,0.0,682826000.0,201323700.0,0.0,481502300.0,351000000.0,2844460000.0,1496394000.0,4297848000.0,0.0,4057422000.0,2063787000.0,16194675.0,29983350.0,15683050.0,18808074.0,207829685.0,830169500.0,816570500.0,523702600.0,13598970.0,2548115000.0,2386263000.0,145898600.0,0.0,758993700.0,222876900.0,0.0,536116900.0,390000000.0,2619290000.0,1271224000.0,62.0,64.0,same
1,3,gyeongnam,32,19970401,410.0,general_hospital,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,801.0,813.0,same
2,4,gyeonggi,89,20161228,468.0,nursing_hospital,1004522000.0,515483669.0,447219700.0,296402300.0,76156.0,30000.0,0.0,0.0,41864754.0,272442100.0,253682200.0,8095950.0,18759970.0,120481000.0,120481000.0,0.0,0.0,92414340.0,92414340.0,0.0,0.0,0.0,300508800.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,234.0,1.0,same
3,7,incheon,141,20000814,353.0,general_hospital,72507340000.0,0.0,70677400000.0,31786050000.0,506223059.0,1259568000.0,1196881000.0,173769780.0,902830288.0,13041540000.0,11534750000.0,0.0,1506793000.0,43179360000.0,38320780000.0,3945208000.0,0.0,42363800000.0,27991880000.0,19885200000.0,14371920000.0,7253040000.0,13857100000.0,8643659000.0,66858340000.0,0.0,64924190000.0,29711350000.0,476807804.0,1353672000.0,1277422000.0,218891720.0,838387466.0,11125720000.0,9890540000.0,0.0,1235181000.0,39583560000.0,34855760000.0,3915906000.0,0.0,37755010000.0,17018600000.0,9219427000.0,20736410000.0,15100000000.0,12954270000.0,7740829000.0,663.0,663.0,same
4,9,gyeongnam,32,20050901,196.0,general_hospital,49043540000.0,0.0,47656050000.0,24460780000.0,112352259.0,1419089000.0,1307249000.0,0.0,80749696.0,6317084000.0,5873265000.0,4099320000.0,443818600.0,43667330000.0,43306130000.0,222340000.0,0.0,49898110000.0,28909190000.0,17930380000.0,20988920000.0,13500000000.0,86311640.0,9025550000.0,48082800000.0,0.0,47125800000.0,23460040000.0,597748128.0,1522108000.0,1349851000.0,0.0,32642585.0,4906776000.0,4464017000.0,3365227000.0,442759100.0,46531380000.0,45629450000.0,789340700.0,0.0,51432590000.0,30072590000.0,17593750000.0,21360010000.0,14108030000.0,5561941.0,9025550000.0,206.0,197.0,same


In [None]:
pd.isna(data) # boolean mask that is True wherever a value is missing

Unnamed: 0,inst_id,sido,sgg,openDate,bedCount,instkind,revenue1,salescost1,sga1,salary1,noi1,noe1,interest1,ctax1,profit1,liquidAsset1,quickAsset1,receivableS1,inventoryAsset1,nonCAsset1,tanAsset1,OnonCAsset1,receivableL1,debt1,liquidLiabilities1,shortLoan1,NCLiabilities1,longLoan1,netAsset1,surplus1,revenue2,salescost2,sga2,salary2,noi2,noe2,interest2,ctax2,profit2,liquidAsset2,quickAsset2,receivableS2,inventoryAsset2,nonCAsset2,tanAsset2,OnonCAsset2,receivableL2,debt2,liquidLiabilities2,shortLoan2,NCLiabilities2,longLoan2,netAsset2,surplus2,employee1,employee2,ownerChange
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
297,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
298,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,True
299,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True


In [None]:
pd.isna(data).sum() # number of missing values in each column

inst_id                0
sido                   0
sgg                    0
openDate               0
bedCount               5
instkind               1
revenue1               8
salescost1             8
sga1                   8
salary1                8
noi1                   8
noe1                   8
interest1              8
ctax1                  8
profit1                8
liquidAsset1           8
quickAsset1            8
receivableS1           8
inventoryAsset1        8
nonCAsset1             8
tanAsset1              8
OnonCAsset1            8
receivableL1           8
debt1                  8
liquidLiabilities1     8
shortLoan1             8
NCLiabilities1         8
longLoan1              8
netAsset1              8
surplus1               8
revenue2               8
salescost2             8
sga2                   8
salary2                8
noi2                   8
noe2                   8
interest2              8
ctax2                  8
profit2                8
liquidAsset2           8


In [None]:
pd.isna(data).sum().sum() # total number of missing values in the whole table

425

In [None]:
mean_df = data.copy() # work on a copy so the imputation below does not modify data

In [None]:
from sklearn.impute import SimpleImputer

# Mean imputation: fit() learns the mean of every numeric column, then
# transform() writes that mean into each missing cell of the column.
imputer = SimpleImputer(strategy='mean').fit(mean_df[num_columns])
mean_df[num_columns] = imputer.transform(mean_df[num_columns])

In [None]:
pd.isna(mean_df[num_columns]).sum() # verify that no missing values remain in the numeric columns

inst_id               0
sgg                   0
openDate              0
bedCount              0
revenue1              0
salescost1            0
sga1                  0
salary1               0
noi1                  0
noe1                  0
interest1             0
ctax1                 0
profit1               0
liquidAsset1          0
quickAsset1           0
receivableS1          0
inventoryAsset1       0
nonCAsset1            0
tanAsset1             0
OnonCAsset1           0
receivableL1          0
debt1                 0
liquidLiabilities1    0
shortLoan1            0
NCLiabilities1        0
longLoan1             0
netAsset1             0
surplus1              0
revenue2              0
salescost2            0
sga2                  0
salary2               0
noi2                  0
noe2                  0
interest2             0
ctax2                 0
profit2               0
liquidAsset2          0
quickAsset2           0
receivableS2          0
inventoryAsset2       0
nonCAsset2      

### 2. 중위수로 imputation

In [None]:
# Work on a copy so the median imputation below does not modify data.
median_df = data.copy()

In [None]:
# Median imputation: fit() learns the median of every numeric column and
# transform() fills the missing cells with it.
imputer = SimpleImputer(strategy='median').fit(median_df[num_columns])
median_df[num_columns] = imputer.transform(median_df[num_columns])
# One-step alternative (fit and transform together, called on the imputer):
# median_df[num_columns] = imputer.fit_transform(median_df[num_columns])

In [None]:
# Verify that no missing values remain in the numeric columns.
pd.isna(median_df[num_columns]).sum()

inst_id               0
sgg                   0
openDate              0
bedCount              0
revenue1              0
salescost1            0
sga1                  0
salary1               0
noi1                  0
noe1                  0
interest1             0
ctax1                 0
profit1               0
liquidAsset1          0
quickAsset1           0
receivableS1          0
inventoryAsset1       0
nonCAsset1            0
tanAsset1             0
OnonCAsset1           0
receivableL1          0
debt1                 0
liquidLiabilities1    0
shortLoan1            0
NCLiabilities1        0
longLoan1             0
netAsset1             0
surplus1              0
revenue2              0
salescost2            0
sga2                  0
salary2               0
noi2                  0
noe2                  0
interest2             0
ctax2                 0
profit2               0
liquidAsset2          0
quickAsset2           0
receivableS2          0
inventoryAsset2       0
nonCAsset2      

### 3. Iterative Impute (R 언어의 MICE 패키지)
각 변수를 차례로(round robin) 나머지 변수들로 회귀하여 결측값을 예측하는 과정을 반복하는 방식으로 결측치를 처리합니다. <br>
회귀 모델로 결측값을 예측하기 때문에 수치형 변수에만 적용할 수 있습니다.

1. 각 결측치를 해당 변수의 평균으로 임시로 채워넣는다.
2. 한 변수를 골라 임시로 채운 값을 다시 결측 상태로 되돌린 뒤, 나머지 변수들을 입력으로 하는 회귀모델을 관측값으로 학습하여 그 변수의 결측치를 예측값으로 대체한다.
3. 결측치가 있는 다른 변수에 대해서도 같은 방식을 차례로 반복한다.
4. 모든 변수를 한 번씩 처리하면 한 번의 이터레이션이 끝나며, 직전 이터레이션에서 할당했던 값과의 차이를 계산한다.
5. 이 차이가 충분히 작아지거나(수렴) 최대 반복 횟수에 도달할 때까지 2~4를 반복한다.

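MICE 알고리즘으로 결측치를 처리하는 IterativeImputer는 Scikit-Learn의 `sklearn.impute` 모듈에 있습니다. 아직 실험적(experimental) 기능이므로 `sklearn.experimental`의 `enable_iterative_imputer`를 먼저 import해야 사용할 수 있습니다.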

#### ref
- [Scikit-Learn, Iterative Imputer (MICE)](https://scikit-learn.org/stable/modules/generated/sklearn.impute.IterativeImputer.html?highlight=mice)
- [MICE 알고리즘 설명](https://ichi.pro/ko/deiteo-seteueseo-gyeol-cheuggabs-eul-daechihaneun-mice-algolijeum-217004654686142)

In [None]:
# IterativeImputer is still experimental: this enabling import has to run
# before the class can be imported from sklearn.impute.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [None]:
# Work on a copy so the iterative imputation below does not modify data.
impute_df = data.copy()

In [None]:
# random_state=0 fixes the imputer's random seed so repeated runs give the same result.
imp_mean = IterativeImputer(random_state=0)
# Fit the round-robin regression imputer and fill the numeric columns in a single step.
impute_df[num_columns] = imp_mean.fit_transform(impute_df[num_columns])

In [None]:
# Verify that no missing values remain in the numeric columns.
pd.isna(impute_df[num_columns]).sum()

inst_id               0
sgg                   0
openDate              0
bedCount              0
revenue1              0
salescost1            0
sga1                  0
salary1               0
noi1                  0
noe1                  0
interest1             0
ctax1                 0
profit1               0
liquidAsset1          0
quickAsset1           0
receivableS1          0
inventoryAsset1       0
nonCAsset1            0
tanAsset1             0
OnonCAsset1           0
receivableL1          0
debt1                 0
liquidLiabilities1    0
shortLoan1            0
NCLiabilities1        0
longLoan1             0
netAsset1             0
surplus1              0
revenue2              0
salescost2            0
sga2                  0
salary2               0
noi2                  0
noe2                  0
interest2             0
ctax2                 0
profit2               0
liquidAsset2          0
quickAsset2           0
receivableS2          0
inventoryAsset2       0
nonCAsset2      

### 4. 최빈값으로 imputation

In [None]:
# data.describe(include='all') # statistics for every column, not only the numeric ones

In [None]:
# Work on a copy so the mode imputation below does not modify data.
mode_df = data.copy()

In [None]:
# Mode imputation for the categorical columns: fit() finds the most frequent
# value of each column and transform() uses it to fill the missing cells.
imputer = SimpleImputer(strategy='most_frequent').fit(mode_df[cat_columns])
mode_df[cat_columns] = imputer.transform(mode_df[cat_columns])

In [None]:
# Verify that no missing values remain in the categorical columns.
pd.isna(mode_df[cat_columns]).sum()

sido           0
instkind       0
ownerChange    0
dtype: int64

# 라벨 인코딩

In [None]:
# Reload the raw training file and keep the target column OC as a
# one-column DataFrame.
data = pd.read_csv(example_file)
label = data['OC'].to_frame()

label.head()

Unnamed: 0,OC
0,open
1,open
2,open
3,open
4,open


In [None]:
from sklearn.preprocessing import LabelEncoder
# Encoder that maps each distinct class name to an integer code.
le = LabelEncoder()

In [None]:
# Learn the mapping from class name to integer code. LabelEncoder expects a
# 1-D array of labels, so pass the OC column itself; passing the one-column
# DataFrame only works through an implicit flatten that raises a
# DataConversionWarning (hidden in this notebook because warnings are silenced).
le.fit(label['OC'])

LabelEncoder()

In [None]:
# Label codes are assigned in the order (index) of the classes_ attribute.
# NOTE(review): the recorded output shows ' close' with a leading space, so the
# raw OC values appear to need stripping — confirm against the CSV.
le.classes_

array([' close', 'open'], dtype=object)

In [None]:
# Encode the 1-D OC column (LabelEncoder works on 1-D label arrays, not on a
# one-column DataFrame). transform() returns 0-based codes in classes_ order;
# the +1 shifts them to 1-based codes (close -> 1, open -> 2).
# NOTE(review): the dataset description uses close=0 / open=1 — drop the +1 if
# these codes are meant to be used as the prediction target.
label_encoded = le.transform(label['OC']) + 1

In [None]:
# Put the integer codes next to the original labels and order the rows by
# code so the lowest code is listed first.
le_df = pd.DataFrame(label_encoded, columns=['label_encoded'])

result = pd.concat([label, le_df], axis=1).sort_values('label_encoded')

result.head(20)

Unnamed: 0,OC,label_encoded
261,close,1
177,close,1
93,close,1
212,close,1
214,close,1
141,close,1
71,close,1
290,close,1
35,close,1
158,close,1


# 원핫 인코딩

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Request a dense array instead of a sparse matrix. The keyword was renamed
# from `sparse` to `sparse_output` in scikit-learn 1.2 and the old name was
# removed in 1.4, so try the new name first and fall back for older versions.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:  # scikit-learn < 1.2 only accepts the old keyword
    ohe = OneHotEncoder(sparse=False)

In [None]:
# Learn the set of categories in the OC column (OneHotEncoder takes 2-D input, so the DataFrame is passed as is).
ohe.fit(label)

OneHotEncoder(sparse=False)

In [None]:
# Turn every label into a row of 0/1 indicators, one column per learned category.
one_hot_encoded = ohe.transform(label)

In [None]:
# Name each one-hot column after the category it encodes, then show the
# indicator columns side by side with the original labels.
ohe_df = pd.DataFrame(data=one_hot_encoded, columns=ohe.categories_[0])
result = pd.concat((label, ohe_df), axis='columns')

result.head(10)

Unnamed: 0,OC,close,open
0,open,0.0,1.0
1,open,0.0,1.0
2,open,0.0,1.0
3,open,0.0,1.0
4,open,0.0,1.0
5,open,0.0,1.0
6,open,0.0,1.0
7,open,0.0,1.0
8,open,0.0,1.0
9,open,0.0,1.0
