In [70]:
import pandas as pd
import numpy as np

In [66]:
df = pd.read_csv('../feature_set/산업위험_IR.CSV', encoding='cp949', index_col=0)

In [67]:
# To remove "IR-" from the df while preserving the trailing hyphen (e.g., "IR-AA-" to "AA-"), 
# we need to implement a conditional removal.

def remove_ir_prefix(value):
    # Check if the value is a string and starts with 'IR-'
    if isinstance(value, str) and value.startswith('IR-'):
        # Remove 'IR-' prefix
        return value[3:]
    else:
        # Return the original value if it doesn't start with 'IR-'
        return value

# Apply the function to each cell in the dataframe, except for the first column
for column in df.columns[:]:  # Skip the first column
    df[column] = df[column].apply(remove_ir_prefix)

df.head()

Unnamed: 0_level_0,2012년,2013년,2014년,2015년,2016년,2017년,2018년,2019년,2020년,2021년,2022년
구분,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
은행,AA+,AA+,AA+,AA+,AA,AA,AA,AA,AA,AA,
전력,AA,AA,AA,AA,AA-,AA-,AA-,AA-,AA-,AA-,
도시가스,A+,A+,A+,A+,A+,A+,A+,A+,A+,A+,
손해보험,A+,A+,A+,A+,A+,A+,A+,A+,A+,A+,
생명보험,A+,A+,A+,A+,A+,A+,A+,A+,A+,A+,


In [68]:
# Custom function to assign numerical values to ratings based on the described scheme
def rating_to_numeric(rating):
    if pd.isna(rating):
        return None
    rating_map = {'AA+': 0.5,'AA': 1, 'AA-': 1.5, 'A+': 2,'A': 2.5, 'A-': 3, 
                'BBB+': 3.5,'BBB': 4, 'BBB-': 4.5, 'BB+': 5, 'BB': 5.5, 'BB-': 6,
                'B+': 6.5, 'B': 7, 'B-': 7.5,}
    numeric_rating = 0
    # Check for the presence of 'AA', 'A', 'BBB', 'BB', or 'B' in the rating and assign the base numeric value
    for key in rating_map:
        if key in rating:
            numeric_rating = rating_map[key]
            break

    return numeric_rating

# Apply the custom function to each rating column in the DataFrame
for column in df.columns[:]:  # Exclude the first column '구분'
    df[column] = df[column].apply(rating_to_numeric)

df.head()

Unnamed: 0_level_0,2012년,2013년,2014년,2015년,2016년,2017년,2018년,2019년,2020년,2021년,2022년
구분,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
은행,0.5,0.5,0.5,0.5,1.0,1.0,1.0,1.0,1.0,1.0,
전력,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,
도시가스,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,
손해보험,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,
생명보험,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,


In [71]:
for index, row in df.iterrows():
    # 현재 행에서 가장 많이 나타나는 값(최빈값)을 찾습니다.
    # nan값은 무시합니다.
    mode_value = row.mode(dropna=True)
    if not mode_value.empty:
        # 최빈값으로 결측치를 채웁니다.
        df.loc[index] = row.fillna(mode_value[0])

In [73]:
df.to_csv('../feature_set/산업위험_IR_인코딩.CSV')