In [2]:
import pandas as pd
import numpy as np

- 보험 청구 금액 예측 인공지능 개발

In [4]:
# Load the healthcare dataset; the first CSV column is used as the row index.
df = pd.read_csv(
    filepath_or_buffer='data/healthcare.csv',
    index_col=0,
)
df

Unnamed: 0,Age,Gender,BMI,Region,Smoker,NumVisits,InsuranceClaim
0,51.0,Female,,South,No,19.0,70.081987
1,92.0,Female,38.074006,West,Yes,13.0,92.747518
2,14.0,Male,,North,No,8.0,46.794138
3,,Female,27.020924,West,No,7.0,44.789132
4,60.0,Male,37.961368,North,No,16.0,71.790344
...,...,...,...,...,...,...,...
495,95.0,Male,29.831966,East,No,20.0,79.352997
496,47.0,Female,31.257355,West,No,13.0,64.364345
497,88.0,Female,26.981973,South,Yes,11.0,74.631320
498,0.0,Male,30.393551,North,Yes,20.0,75.066459


- 타겟 변수와 피쳐 분리

In [6]:
# Target variable: the insurance claim amount column.
y = df.loc[:, 'InsuranceClaim']
y

0      70.081987
1      92.747518
2      46.794138
3      44.789132
4      71.790344
         ...    
495    79.352997
496    64.364345
497    74.631320
498    75.066459
499    45.184790
Name: InsuranceClaim, Length: 500, dtype: float64

In [8]:
# Feature matrix: every column except the target.
X = df.drop('InsuranceClaim', axis=1)
X

Unnamed: 0,Age,Gender,BMI,Region,Smoker,NumVisits
0,51.0,Female,,South,No,19.0
1,92.0,Female,38.074006,West,Yes,13.0
2,14.0,Male,,North,No,8.0
3,,Female,27.020924,West,No,7.0
4,60.0,Male,37.961368,North,No,16.0
...,...,...,...,...,...,...
495,95.0,Male,29.831966,East,No,20.0
496,47.0,Female,31.257355,West,No,13.0
497,88.0,Female,26.981973,South,Yes,11.0
498,0.0,Male,30.393551,North,Yes,20.0


In [9]:
# Count missing values per feature column.
X.isnull().sum()

Age          30
Gender        0
BMI          50
Region        0
Smoker        0
NumVisits    20
dtype: int64

- 카테고리컬 데이터와 수치형 데이터 처리

- 컬럼 이름 분리 저장

In [10]:
# Show all column names so they can be split into numeric / categorical groups below.
df.columns

Index(['Age', 'Gender', 'BMI', 'Region', 'Smoker', 'NumVisits',
       'InsuranceClaim'],
      dtype='object')

In [11]:
# Names of the numeric feature columns.
numerical = [
    'Age',
    'BMI',
    'NumVisits',
]

In [12]:
# Names of the categorical (string-valued) feature columns.
categorical = [
    'Gender',
    'Region',
    'Smoker',
]

- 수치형 데이터의 NaN을 각 컬럼 평균값으로 치환 -> pipeline 라이브러리 사용 시, fillna() 대신 SimpleImputer 사용

- 카테고리컬 데이터는 레이블 인코딩(피처 컬럼에는 OrdinalEncoder 사용) 또는 원-핫 인코딩(OneHotEncoder) 이용

In [14]:
from sklearn.impute import SimpleImputer

# Fills NaN with the per-column mean ('mean' is SimpleImputer's default strategy,
# spelled out here so the intent is visible).
imputer = SimpleImputer(strategy='mean')

In [35]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

In [36]:
# Encoders for the categorical features.
# Both are configured to tolerate categories that were not present when the
# encoder was fitted, so that predict() on new data does not raise:
#   - OrdinalEncoder maps an unseen category to the sentinel code -1.
#   - OneHotEncoder encodes an unseen category as an all-zero row.
# Encoding of categories seen during fit is unchanged.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
onehot_encoder = OneHotEncoder(handle_unknown='ignore')

In [37]:
from sklearn.compose import ColumnTransformer

In [38]:
# Number of distinct non-null values in 'Smoker' (used to pick an encoding for it).
df['Smoker'].nunique(dropna=True)

2

In [39]:
# Columns passed to the ordinal encoder.
ordinal_columns = [
    'Gender',
    'Smoker',
]

In [40]:
# Columns passed to the one-hot encoder.
onehot_columns = [
    'Region',
]

In [41]:
# Columns passed to the imputer.
numeric_columns = [
    'Age',
    'BMI',
    'NumVisits',
]

- 전처리 라이브러리 준비, 적용할 컬럼리스트도 준비

In [42]:
# Route each column group to its transformer: (step name, transformer, columns).
preprocessor = ColumnTransformer(
    transformers=[
        ('num_encoder', imputer, numeric_columns),
        ('ordinal_encoder', ordinal_encoder, ordinal_columns),
        ('onehot_encoder', onehot_encoder, onehot_columns),
    ]
)

- 파이프라인 생성

In [43]:
from sklearn.ensemble import RandomForestRegressor

In [44]:
# Random forest regressor; random_state is fixed so repeated runs give the same model.
regressor = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)

In [45]:
from sklearn.pipeline import Pipeline

In [46]:
# Chain preprocessing and the model so both are fitted and applied together.
pipeline = Pipeline(
    [
        ('preprocessing', preprocessor),
        ('modeling', regressor),
    ]
)

- 학습/테스트 데이터 분리 후 학습

In [47]:
from sklearn.model_selection import train_test_split

In [48]:
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [49]:
# Fit the preprocessing steps and the regressor on the training split only.
pipeline.fit(X=X_train, y=y_train)

In [50]:
# Predict on the held-out rows; the pipeline preprocesses X_test internally.
y_pred = pipeline.predict(X=X_test)

In [51]:
from sklearn.metrics import mean_squared_error, r2_score

In [53]:
# Mean squared error on the test split.
mean_squared_error(y_true=y_test, y_pred=y_pred)

46.980189783960185

In [54]:
# R^2 (coefficient of determination) on the test split.
r2_score(y_true=y_test, y_pred=y_pred)

0.7521137711976167

#### 파이프라인 사용 이점
- 파이프라인을 파일로 저장해 두면, 서비스 배포 시 이 파일 하나로 전처리와 모델을 함께 불러올 수 있음
- 예측할 원본 데이터를 그대로 넣으면 내부에서 전처리 후 예측까지 자동으로 수행

In [55]:
import joblib

In [56]:
# Persist the fitted pipeline (preprocessing + model) to a single file.
joblib.dump(value=pipeline, filename='pipeline.pkl')

['pipeline.pkl']