### 재무상태표, 손익계산서, 현금흐름표 피처 만들기
- 1차
  - 부채비율 유동비율 자기자본비율 고정자산비율 고정비율 순운전자본비율 ln자산총계 총자산이익잉여금비율 ln매출액 매출액순이익률
- 2차(33개)
  - 부채비율, 유동비율, 자기자본비율, 고정자산비율, 고정비율, 순운전자본비율, 현금비율, 현금흐름부채비율, 완전자본잠식여부, 자본잠식여부, 자산총계, 총자산증가율, 유동자산증가율, 매출액증가율, 순이익증가율, 영업이익증가율, 현금자산비율, 자산대비영업현금흐름, ROE, ROA, 총자산영업이익율, 이익잉여금비율, 매출총이익률, 총자산이익잉여금비율, ln자산총계, ln매출액, 총자산회전율, 비유동자산회전율, 매출원가율, 판관비율, 재무활동의존도, 현금보유율

In [70]:
import pandas as pd

df = pd.read_csv("preprocessed_data.csv", dtype={'stock_code':str, 'year':str})

In [71]:
df['stock_code'] = df['stock_code'].str.strip()
df['corp_nm'] = df['corp_nm'].str.strip()

df.drop(['기말현금및현금성자산', '기초현금및현금성자산'], axis=1, inplace=True)

df['총현금흐름'] = (df['영업활동현금흐름'] + 
                    df['투자활동현금흐름'] + 
                    df['재무활동현금흐름'])

In [72]:
# 수치형 컬럼 추출
num_cols = df.select_dtypes(include=['float64', 'int64']).columns.drop('is_defaulted').tolist()

# 회사명, 거래소코드, 회계년도 기준 정렬
df = df.sort_values(['corp_nm', 'stock_code', 'year'])

# _당기, _전기 컬럼 생성
for col in num_cols:
    df[f'{col}_당기'] = df[col]
    df[f'{col}_전기'] = df.groupby(['corp_nm', 'stock_code'])[col].shift(1)

# 2014년 데이터 제거
df = df[df['year'].astype(int) > 2014].copy()

# 원래 수치형 컬럼 제거
df = df.drop(columns=num_cols)

df.columns

Index(['corp_nm', 'stock_code', 'year', 'is_defaulted', '자산총계_당기', '자산총계_전기',
       '비유동자산_당기', '비유동자산_전기', '유동자산_당기', '유동자산_전기', '자본총계_당기', '자본총계_전기',
       '자본금_당기', '자본금_전기', '이익잉여금_당기', '이익잉여금_전기', '부채총계_당기', '부채총계_전기',
       '비유동부채_당기', '비유동부채_전기', '유동부채_당기', '유동부채_전기', '현금및현금성자산_당기',
       '현금및현금성자산_전기', '영업활동현금흐름_당기', '영업활동현금흐름_전기', '투자활동현금흐름_당기',
       '투자활동현금흐름_전기', '재무활동현금흐름_당기', '재무활동현금흐름_전기', '매출액_당기', '매출액_전기',
       '매출총이익_당기', '매출총이익_전기', '판매관리비_당기', '판매관리비_전기', '영업이익_당기', '영업이익_전기',
       '매출원가_당기', '매출원가_전기', '당기순이익_당기', '당기순이익_전기', '재고자산_당기', '재고자산_전기',
       '매출채권_당기', '매출채권_전기', '매출채권및기타유동채권_당기', '매출채권및기타유동채권_전기', '총현금흐름_당기',
       '총현금흐름_전기'],
      dtype='object')

In [73]:
(df == 0).sum()

corp_nm               0
stock_code            0
year                  0
is_defaulted      19817
자산총계_당기               0
자산총계_전기               0
비유동자산_당기              0
비유동자산_전기              0
유동자산_당기               0
유동자산_전기               0
자본총계_당기               1
자본총계_전기               1
자본금_당기                0
자본금_전기                0
이익잉여금_당기              7
이익잉여금_전기              7
부채총계_당기               0
부채총계_전기               0
비유동부채_당기            240
비유동부채_전기            255
유동부채_당기               0
유동부채_전기               0
현금및현금성자산_당기           0
현금및현금성자산_전기           0
영업활동현금흐름_당기           2
영업활동현금흐름_전기           3
투자활동현금흐름_당기           8
투자활동현금흐름_전기           9
재무활동현금흐름_당기         268
재무활동현금흐름_전기         315
매출액_당기                0
매출액_전기                0
매출총이익_당기             74
매출총이익_전기             64
판매관리비_당기              0
판매관리비_전기              0
영업이익_당기               2
영업이익_전기               3
매출원가_당기            1776
매출원가_전기            1631
당기순이익_당기              2
당기순이익_전기        

In [74]:
# df[''] = df[''] / df['']
# 건전성

df['부채비율'] = (df['부채총계_당기'] / df['자본총계_당기'])
df['유동비율'] = (df['유동자산_당기'] / df['유동부채_당기'])
df['자기자본비율'] = (df['자본총계_당기'] / df['자산총계_당기'])
df['고정자산비율'] = (df['비유동자산_당기'] / df['자산총계_당기'])
df['고정비율'] = (df['비유동자산_당기'] / df['자본총계_당기'])
df['순운전자본비율'] = ((df['유동자산_당기'] - df['유동부채_당기']) / df['자산총계_당기'])

df['현금비율'] = df['현금및현금성자산_전기'] / df['유동부채_당기']
df['현금흐름부채비율'] = df['영업활동현금흐름_당기'] / df['부채총계_당기']

# df['완전자본잠식여부'] = (df['자본금_당기'] < 0).astype(int)
df['자본잠식여부'] = (df['자본총계_당기'] < 0).astype(int)

In [75]:
# 성장성

df['총자산증가율'] = (df['자산총계_당기'] - df['자산총계_전기']) / df['자산총계_전기']
df['유동자산증가율'] = (df['유동자산_당기'] - df['유동자산_전기']) / df['유동자산_전기']

df['매출액증가율'] = (df['매출액_당기'] - df['매출액_전기']) / df['매출액_전기']
df['순이익증가율'] = (df['당기순이익_당기'] - df['당기순이익_전기']) / df['당기순이익_전기']
df['영업이익증가율'] = (df['영업이익_당기'] - df['영업이익_전기']) / df['영업이익_전기']


In [76]:
# 유동성

df['현금자산비율'] = df['현금및현금성자산_당기'] / df['자산총계_당기']
df['자산대비영업현금흐름'] = df['영업활동현금흐름_당기'] / df['자산총계_당기']

In [77]:
import numpy as np

# 규모
df['ln자산총계'] = np.log(df['자산총계_당기'])
df['ln매출액'] = np.log(df['매출액_당기'])

In [78]:
# 수익성
df['ROE'] = df['당기순이익_당기'] / df['자본총계_당기']
df['ROA'] = df['당기순이익_당기'] / df['자산총계_당기']
df['총자산영업이익율'] = df['영업이익_당기'] / df['자산총계_당기']
df['이익잉여금비율'] = df['이익잉여금_당기'] / df['자산총계_당기']
df['매출총이익률'] = df['매출총이익_당기'] / df['매출액_당기']

In [79]:
# 활동성

df['총자산회전율'] = df['매출액_당기'] / df['자산총계_당기']
df['비유동자산회전율'] = df['매출액_당기'] / df['비유동자산_당기'] # 유형자산 대신
df['매출원가율'] = df['매출원가_당기'] / df['매출액_당기']
df['판관비율'] = df['판매관리비_당기'] / df['매출액_당기']
df['매출채권회전율'] = df['매출액_당기'] / df['매출채권_당기']
df['재고자산회전율'] = df['매출원가_당기'] / df['재고자산_당기']


In [80]:
# 현금흐름
df['재무활동의존도'] = df['재무활동현금흐름_당기'].abs() / df['총현금흐름_당기']
df['현금보유율'] = df['현금및현금성자산_당기'] / df['자산총계_당기']


In [81]:
df.columns

Index(['corp_nm', 'stock_code', 'year', 'is_defaulted', '자산총계_당기', '자산총계_전기',
       '비유동자산_당기', '비유동자산_전기', '유동자산_당기', '유동자산_전기', '자본총계_당기', '자본총계_전기',
       '자본금_당기', '자본금_전기', '이익잉여금_당기', '이익잉여금_전기', '부채총계_당기', '부채총계_전기',
       '비유동부채_당기', '비유동부채_전기', '유동부채_당기', '유동부채_전기', '현금및현금성자산_당기',
       '현금및현금성자산_전기', '영업활동현금흐름_당기', '영업활동현금흐름_전기', '투자활동현금흐름_당기',
       '투자활동현금흐름_전기', '재무활동현금흐름_당기', '재무활동현금흐름_전기', '매출액_당기', '매출액_전기',
       '매출총이익_당기', '매출총이익_전기', '판매관리비_당기', '판매관리비_전기', '영업이익_당기', '영업이익_전기',
       '매출원가_당기', '매출원가_전기', '당기순이익_당기', '당기순이익_전기', '재고자산_당기', '재고자산_전기',
       '매출채권_당기', '매출채권_전기', '매출채권및기타유동채권_당기', '매출채권및기타유동채권_전기', '총현금흐름_당기',
       '총현금흐름_전기', '부채비율', '유동비율', '자기자본비율', '고정자산비율', '고정비율', '순운전자본비율',
       '현금비율', '현금흐름부채비율', '자본잠식여부', '총자산증가율', '유동자산증가율', '매출액증가율', '순이익증가율',
       '영업이익증가율', '현금자산비율', '자산대비영업현금흐름', 'ln자산총계', 'ln매출액', 'ROE', 'ROA',
       '총자산영업이익율', '이익잉여금비율', '매출총이익률', '총자산회전율', '비유동자산회전율', '매출원가율', '판관비율',
       '매출채권회전율', '재고자산회전율', '재무

In [82]:
new_df = df[['corp_nm', 'stock_code', 'year','부채비율', '유동비율', '자기자본비율',
       '고정자산비율', '고정비율', '순운전자본비율', '현금비율', '현금흐름부채비율', '자본잠식여부',
       '총자산증가율', '유동자산증가율', '매출액증가율', '순이익증가율', '영업이익증가율', '현금자산비율',
       '자산대비영업현금흐름', 'ln자산총계', 'ln매출액', 'ROE', 'ROA', '총자산영업이익율', '이익잉여금비율',
       '매출총이익률', '총자산회전율', '비유동자산회전율', '매출원가율', '판관비율', '매출채권회전율', '재고자산회전율',
       '재무활동의존도', '현금보유율','is_defaulted']]
new_df

Unnamed: 0,corp_nm,stock_code,year,부채비율,유동비율,자기자본비율,고정자산비율,고정비율,순운전자본비율,현금비율,...,매출총이익률,총자산회전율,비유동자산회전율,매출원가율,판관비율,매출채권회전율,재고자산회전율,재무활동의존도,현금보유율,is_defaulted
1,(주)CMG제약,058820,2015,0.207355,2.977472,0.828257,0.534726,0.645604,0.309009,0.998145,...,0.362552,0.477840,0.893616,0.637448,0.457622,1.707400,3.159645,-0.000000,0.045345,0
2,(주)CMG제약,058820,2016,0.097641,8.621464,0.911045,0.290212,0.318548,0.627460,0.280862,...,0.403594,0.297079,1.023664,0.596406,0.351099,2.283892,3.075405,-126.704161,0.019360,0
3,(주)CMG제약,058820,2017,0.118681,7.462783,0.893910,0.258659,0.289357,0.642002,0.197396,...,0.434279,0.379084,1.465575,0.565721,0.396754,2.476979,2.972674,0.103611,0.034673,0
4,(주)CMG제약,058820,2018,0.099999,9.578557,0.909092,0.288595,0.317454,0.637134,0.447202,...,0.456032,0.435958,1.510621,0.543968,0.404536,2.620212,3.164139,0.000000,0.117717,0
5,(주)CMG제약,058820,2019,0.067079,16.180723,0.937138,0.216700,0.231236,0.734891,1.446795,...,0.493369,0.304940,1.407200,0.506631,0.448953,2.704433,3.282160,14.273772,0.096647,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21903,흥아해운(주),003280,2019,20.922971,0.343592,0.045614,0.841721,18.453027,-0.302380,0.053690,...,-0.014127,0.251912,0.299282,1.014127,0.137409,36.424001,13.088536,-52.019649,0.019556,0
21904,흥아해운(주),003280,2020,-15.486733,0.351135,-0.069143,0.819939,-11.858597,-0.332736,0.051766,...,0.058293,0.300088,0.365989,0.941707,0.107766,31.290646,10.654041,-11.105439,0.011683,0
21905,흥아해운(주),003280,2021,1.293082,2.002908,0.436094,0.783560,1.796767,0.108377,0.124145,...,0.065305,0.329547,0.420577,0.934695,0.087965,17.356375,11.194468,0.695282,0.147195,0
21906,흥아해운(주),003280,2022,1.288613,1.490015,0.436946,0.743217,1.700937,0.084447,0.722910,...,0.214225,0.498395,0.670591,0.785775,0.042568,26.562507,12.438512,1.941652,0.154061,0


In [83]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20024 entries, 1 to 21907
Data columns (total 35 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   corp_nm       20024 non-null  object 
 1   stock_code    20024 non-null  object 
 2   year          20024 non-null  object 
 3   부채비율          20024 non-null  float64
 4   유동비율          20024 non-null  float64
 5   자기자본비율        20024 non-null  float64
 6   고정자산비율        20024 non-null  float64
 7   고정비율          20024 non-null  float64
 8   순운전자본비율       20024 non-null  float64
 9   현금비율          19223 non-null  float64
 10  현금흐름부채비율      20024 non-null  float64
 11  자본잠식여부        20024 non-null  int64  
 12  총자산증가율        19223 non-null  float64
 13  유동자산증가율       19223 non-null  float64
 14  매출액증가율        19223 non-null  float64
 15  순이익증가율        19223 non-null  float64
 16  영업이익증가율       19223 non-null  float64
 17  현금자산비율        20024 non-null  float64
 18  자산대비영업현금흐름    20024 non-null  f

In [84]:
new_df.describe()

  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)


Unnamed: 0,부채비율,유동비율,자기자본비율,고정자산비율,고정비율,순운전자본비율,현금비율,현금흐름부채비율,자본잠식여부,총자산증가율,...,매출총이익률,총자산회전율,비유동자산회전율,매출원가율,판관비율,매출채권회전율,재고자산회전율,재무활동의존도,현금보유율,is_defaulted
count,20024.0,20024.0,20024.0,20024.0,20024.0,20024.0,19223.0,20024.0,20024.0,19223.0,...,20024.0,20024.0,20024.0,20024.0,20024.0,20024.0,18695.0,20022.0,20024.0,20024.0
mean,inf,3.501782,0.609976,0.530765,inf,0.175266,0.823244,0.159445,0.012385,0.123119,...,0.295104,0.727558,1.930399,0.700154,1.495781,inf,inf,20.822515,0.106876,0.010338
std,,8.173785,0.254865,0.208958,,0.32929,2.570073,0.795744,0.1106,0.442668,...,0.310408,0.523937,3.221937,0.315494,18.553864,,,3356.272428,0.117985,0.10115
min,-298.0617,0.004281,-2.627687,0.013333,-135.7841,-10.256865,4.1e-05,-27.824687,0.0,-0.934063,...,-8.3842,1.1e-05,1.1e-05,0.0,0.000133,0.01716584,0.0,-234025.8,1.2e-05,0.0
25%,0.2453,1.021963,0.471887,0.379403,0.5612709,0.006679,0.089234,-0.030934,0.0,-0.025439,...,0.113605,0.371155,0.660727,0.588492,0.087962,3.834637,3.314721,-1.169039,0.027891,0.0
50%,0.5698371,1.678302,0.628536,0.534204,0.8540744,0.175295,0.26479,0.103434,0.0,0.042468,...,0.205428,0.642826,1.254195,0.792679,0.167427,5.996781,6.678999,0.084483,0.068959,0.0
75%,1.075388,3.250966,0.795557,0.683678,1.229984,0.361532,0.711526,0.303856,0.0,0.150457,...,0.402533,0.971287,2.24534,0.886218,0.383002,9.651058,17.14028,1.663856,0.141188,0.0
max,inf,267.811427,0.999979,0.999676,inf,0.972509,104.208599,20.242568,1.0,18.052744,...,1.0,5.079178,126.504135,8.654875,1250.16055,inf,inf,297784.5,0.941499,1.0


In [85]:
# inf, nan 결측치처리
new_df = new_df.replace([np.inf, -np.inf], np.nan)
new_df = new_df.dropna()
new_df.describe()

Unnamed: 0,부채비율,유동비율,자기자본비율,고정자산비율,고정비율,순운전자본비율,현금비율,현금흐름부채비율,자본잠식여부,총자산증가율,...,매출총이익률,총자산회전율,비유동자산회전율,매출원가율,판관비율,매출채권회전율,재고자산회전율,재무활동의존도,현금보유율,is_defaulted
count,16678.0,16678.0,16678.0,16678.0,16678.0,16678.0,16678.0,16678.0,16678.0,16678.0,...,16678.0,16678.0,16678.0,16678.0,16678.0,16678.0,16678.0,16678.0,16678.0,16678.0
mean,1.011748,3.033774,0.609195,0.529469,1.086102,0.174832,0.662033,0.153665,0.007255,0.114372,...,0.241222,0.761619,1.937894,0.758349,0.402358,34.540599,434.8922,4.908094,0.099569,0.009773
std,11.107566,5.552646,0.226195,0.192951,5.187718,0.311274,1.625731,0.628841,0.08487,0.381844,...,0.23896,0.498667,3.043633,0.243606,2.368606,1255.730133,28396.2,516.907445,0.105623,0.098379
min,-298.061714,0.01519,-2.627687,0.014982,-135.784134,-10.256865,9e-05,-27.824687,0.0,-0.934063,...,-4.881162,0.000941,0.00237,0.0,0.006877,0.028479,0.0,-25606.694557,3e-05,0.0
25%,0.269687,1.019461,0.47089,0.391522,0.573917,0.006386,0.083756,-0.026886,0.0,-0.02445,...,0.107946,0.427065,0.748926,0.667699,0.083129,3.755523,3.183291,-1.212945,0.027866,0.0
50%,0.601213,1.641308,0.619736,0.537092,0.860069,0.174907,0.243894,0.103387,0.0,0.043378,...,0.187503,0.67292,1.301345,0.814267,0.148873,5.723515,6.1855,0.083249,0.066157,0.0
75%,1.09738,3.0996,0.782969,0.673699,1.255492,0.357007,0.651096,0.290477,0.0,0.147197,...,0.331183,0.991837,2.239157,0.892969,0.312499,8.767642,13.18719,1.694886,0.133091,0.0
max,1151.594881,267.811427,0.996085,0.996287,532.570389,0.972509,70.761692,8.473989,1.0,8.906192,...,1.0,5.079178,126.504135,5.881162,143.9288,115022.346187,2793515.0,50728.021505,0.903687,1.0


In [86]:
new_df.to_csv("features_v1.csv", encoding="utf-8-sig")