데이터 : data_regression.zip (수업노트에서 다운로드)
- train.csv, test.csv만 사용
- y_test.csv파일은 최종 평가용임

## 머신러닝
- 문제정의, 라이브러리/데이터 불러오기
- 탐색적 데이터 분석 (EDA)
- 데이터 전처리
- 피처엔지니어링
- (Train/Validation 나누기)
- 모델 선택/훈련/평가/최적화
- 예측
- (csv 생성)

## 문제정의, 라이브러리/데이터 불러오기
- 보험료 예측
- 평가: rmse
- csv: id와 예측 값

In [147]:
# 라이브러리 및 데이터 불러오기
import pandas as pd
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

## EDA

In [148]:
# 데이터 크기 확인
train.shape, test.shape

# 컬럼의 수가 1개 차이난다

((1070, 8), (268, 7))

In [149]:
# 데이터 샘플 확인 (train)
train.head(3)

# charges -> target

Unnamed: 0,id,age,sex,bmi,children,smoker,region,charges
0,330,61,female,36.385,1,yes,northeast,48517.56315
1,748,47,female,36.0,1,no,southwest,8556.907
2,684,33,female,18.5,1,no,southwest,4766.022


In [150]:
# 데이터 샘플 확인 (test)
test.head()

Unnamed: 0,id,age,sex,bmi,children,smoker,region
0,508,24,female,25.27,0,no,northeast
1,1309,41,male,32.2,2,no,southwest
2,766,47,male,32.3,1,no,southwest
3,667,40,female,32.775,2,yes,northwest
4,1057,45,female,31.79,0,no,southeast


In [151]:
# 결측치 확인 (train)
train.isnull().sum()

id          0
age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [152]:
# 결측치 확인 (test)
test.isnull().sum()

id          0
age         0
sex         0
bmi         0
children    0
smoker      0
region      0
dtype: int64

In [153]:
# 통계값 확인
train.describe()

Unnamed: 0,id,age,bmi,children,charges
count,1070.0,1070.0,1070.0,1070.0,1070.0
mean,671.090654,39.024299,30.730495,1.090654,13193.6348
std,380.66678,13.916945,6.05326,1.204997,12211.531632
min,0.0,18.0,16.815,0.0,1121.8739
25%,347.25,27.0,26.4,0.0,4695.398638
50%,673.0,39.0,30.4,1.0,9273.6388
75%,990.25,51.0,34.8,2.0,15826.112723
max,1337.0,64.0,52.58,5.0,63770.42801


In [154]:
# 통계값 object (train)
train.describe(include=object)

Unnamed: 0,sex,smoker,region
count,1070,1070,1070
unique,2,2,4
top,male,no,southeast
freq,544,856,287


In [155]:
# 통계값 object (test)
test.describe(include=object)

Unnamed: 0,sex,smoker,region
count,268,268,268
unique,2,2,4
top,female,no,southeast
freq,136,208,77


## 데이터 전처리 및 피처 엔지니어링

In [156]:
# 데이터 샘플
train.head(1)

Unnamed: 0,id,age,sex,bmi,children,smoker,region,charges
0,330,61,female,36.385,1,yes,northeast,48517.56315


In [157]:
# object 컬럼명 선택
cols = train.select_dtypes(include="object").columns
cols

# 원핫인코딩 -> 범주형 칼럼 사용

Index(['sex', 'smoker', 'region'], dtype='object')

In [158]:
# 범주형 데이터 -> 원핫 인코딩
train = pd.get_dummies(train, columns=cols)
test = pd.get_dummies(test, columns=cols)
display(train.head(2))
display(test.head(2))

Unnamed: 0,id,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,330,61,36.385,1,48517.56315,True,False,False,True,True,False,False,False
1,748,47,36.0,1,8556.907,True,False,True,False,False,False,False,True


Unnamed: 0,id,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,508,24,25.27,0,True,False,True,False,True,False,False,False
1,1309,41,32.2,2,False,True,True,False,False,False,False,True


In [159]:
# 스케일링
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()
# cols = ['bmi']
# train[cols] = scaler.fit_transform(train[cols])
# test[cols] = scaler.transform(test[cols])
# train.head()

# 수치형 데이터 스케일링도 추가로 해보자
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
cols = ['age', 'bmi']
train[cols] = scaler.fit_transform(train[cols])
test[cols] = scaler.fit_transform(test[cols])

In [160]:
# train['charges'].hist()

In [161]:
# import numpy as np
# train['charges'] = np.log1p(train['charges'])

In [162]:
# train['charges'].hist()

## 검증 데이터 분리

In [163]:
from sklearn.model_selection import train_test_split

#주어진 train 데이터는 x_train과 y_train이 나눠져 있지 않음
# 따라서 drop활용해서 분리해주기


X_tr, X_val, y_tr, y_val = train_test_split(train.drop('charges',axis=1),
                                            train['charges'],
                                            test_size=0.15,
                                            random_state = 2022)
X_tr.shape, X_val.shape, y_tr.shape, y_val.shape

((909, 12), (161, 12), (909,), (161,))

In [164]:
X_tr.head(1)

Unnamed: 0,id,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
1009,41,-0.576854,0.975055,2,True,False,True,False,False,False,True,False


In [165]:
y_tr.head(1)

1009    4949.7587
Name: charges, dtype: float64

In [166]:
# 평가 수식
# from sklearn.metrics import mean_squared_error
# import numpy as np
# # np.sqrt(16)
# def rmse(y_test, pred):
#     return np.sqrt(mean_squared_error(y_test, pred))



#rmse로 평가해야함 -> 그러나 sklearn에서 지원 X
#단 mse는 지원해주니 이걸 활용!!!
#루트 -> numpy 활용

from sklearn.metrics import mean_squared_error
import numpy as np
def rmse(y_test, pred): # 실제값 , 예측값
  return np.sqrt(mean_squared_error(y_test, pred))

In [167]:
# LinearRegression (회귀모델)
# from sklearn.linear_model import LinearRegression
# model = LinearRegression()
# model.fit(X_tr, y_tr)
# pred = model.predict(X_val)
# rmse(np.exp(y_val), np.exp(pred)) log 적용했을 경우 다시 돌려놔야함


from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_tr, y_tr)
pred = model.predict(X_val)
rmse(y_val, pred)

# 5888.058022365329 베이스라인
# standard scaler 적용후 5888.05802236555

5888.058022365557

In [146]:
# RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(X_tr, y_tr)
pred = model.predict(X_val)
rmse(np.exp(y_val), np.exp(pred))
# 4691.498598226366 베이스라인 -> 얘가 더 좋은 듯
# 4722.06683465538 스탠다드스켈러
# 4705.862619729388 민맥스 스켈러
# 4627.730520367604 log -> charge값이 분포가 더 커서 해당 칼럼 log 적용

4637.505576487096

In [None]:
# xgboost Regressor
# from xgboost import XGBRegressor
# model = XGBRegressor(objective='reg:squarederror')
# model.fit(X_tr, y_tr)
# pred = model.predict(X_val)
# rmse(np.exp(y_val), np.exp(pred))

In [None]:
# test 데이터 예측
pred = model.predict(test)

In [None]:
# 제출용 데이터 프레임
submit = pd.DataFrame(
            {
                'id':test['id'],
                'charges':np.exp(pred)
            }
        )

In [None]:
# csv파일 생성
submit.to_csv("1111.csv", index=False)

## 평가
- 수험자는 알 수 없는 영역
- 영상에는 np.exp()가 빠져있습니다. np.exp(pred)가 정확한 코드입니다.

In [None]:
y_test = pd.read_csv("y_test.csv")
rmse(y_test['charges'], np.exp(pred))

4723.104661268003