# In [1]:
import numpy as np
import pandas as pd
import warnings
from sklearn.preprocessing import RobustScaler
from sklearn.experimental import enable_iterative_imputer
import missingno as msno
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Suppress every warning category for the whole process from this point on
# (no category/module filter is given, so this is not limited to this file).
warnings.filterwarnings('ignore')

def _save_missing_matrix(frame, path):
    """Plot the missingno nullity matrix of *frame* and save it to *path*."""
    import os

    out_dir = os.path.dirname(path)
    if out_dir:
        # savefig does not create missing parent directories; make sure the
        # target directory exists instead of failing with FileNotFoundError.
        os.makedirs(out_dir, exist_ok=True)
    msno.matrix(frame).get_figure().savefig(path)


def load_data(path: str) -> "tuple[pd.DataFrame, pd.Series]":
    """Load the CSV at *path* and return model-ready features and target.

    Pipeline:
      1. Drop the ``ID`` and ``year`` columns and split off ``Status`` as the
         target.
      2. Drop every row that has a missing value in any object-dtype
         (categorical) column.
      3. One-hot encode the categorical columns, except ``age`` which is
         label-encoded as an ordinal 1..7.
      4. Impute the remaining missing values with ``IterativeImputer``
         (MICE-style) using a linear-regression estimator.
      5. Robust-scale the large-magnitude numeric columns.

    Side effects:
        Writes ``../graph/msno_before.png`` and ``../graph/msno_after.png``
        (relative to the current working directory), creating the directory
        if needed. The imputer runs with ``verbose=2`` and prints progress.

    Args:
        path: Location of the CSV file, passed to ``pandas.read_csv``.

    Returns:
        ``(data, target)`` where ``data`` is the fully numeric feature frame
        and ``target`` is the integer ``Status`` series. Both carry a fresh
        0..n-1 index and describe the same rows in the same order.

    Raises:
        KeyError: If an expected column (``ID``, ``year``, ``Status``,
            ``age`` or one of the scaled columns) is absent.
    """
    data = pd.read_csv(path)

    # ID and year are not needed by the model.
    data = data.drop(columns=['ID', 'year'])

    # Split off the target variable.
    target = data.pop('Status')

    # Categorical (object-dtype) features are preprocessed separately.
    category_data = data.loc[:, data.dtypes == 'object']

    # Record the missing-value pattern before anything is removed.
    _save_missing_matrix(data, '../graph/msno_before.png')

    # Categorical columns have few missing values, so rows with a missing
    # categorical value are simply dropped -- from the target as well.
    has_category_na = category_data.isna().any(axis=1).to_numpy()
    category_na_idx = category_data.index[has_category_na]
    data = data.drop(index=category_na_idx)
    target = target.drop(index=category_na_idx)

    # Keep only the non-categorical columns here; the encoded categorical
    # columns are joined back in below.
    data = data.drop(columns=category_data.columns)

    # One-hot encoding for every categorical column except 'age'.
    one_hot_cols = [col for col in category_data.columns if col != 'age']
    if one_hot_cols:
        category_data = pd.get_dummies(category_data, columns=one_hot_cols)

    # Label encoding: the age brackets are mapped to the integers 1..7.
    replace_age = {"<25": 1, "25-34": 2, "35-44": 3, "45-54": 4, "55-64": 5,
                   "65-74": 6, ">74": 7}
    category_data = category_data.assign(
        age=category_data['age'].replace(replace_age)
    )

    # category_data still holds the rows dropped above; a left join on the
    # index keeps only the rows that remain in data.
    data = data.join(category_data)

    # MICE-style imputation via scikit-learn's IterativeImputer.
    # imputation_order='roman' visits the features left to right.
    imputer = IterativeImputer(
        estimator=LinearRegression(),
        verbose=2,
        max_iter=20,
        tol=0.001,
        imputation_order='roman',
    )
    # Rebuilding the frame from the imputed array also gives data a fresh
    # 0..n-1 index.
    data = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

    _save_missing_matrix(data, '../graph/msno_after.png')

    # Robust scaling for the variables with a large scale.
    big_scale_column = ['loan_amount', 'Upfront_charges', 'property_value',
                        'income', 'LTV']
    data[big_scale_column] = RobustScaler().fit_transform(data[big_scale_column])

    # Reset the target index so it lines up with data's fresh 0..n-1 index.
    target = target.reset_index(drop=True).astype('int')

    return data, target