In [58]:
# 시험환경 세팅 (코드 변경 X)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def exam_data_load(df, target, id_name="", null_name=""):
    """Split ``df`` into exam-style train/test features and targets.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset, including the ``target`` column. It is never modified.
    target : str
        Name of the column to predict.
    id_name : str, optional
        Name of an existing identifier column. When left empty, the current
        index is exposed as a new ``"id"`` column and used as the identifier.
    null_name : str, optional
        Placeholder value marking missing data. When non-empty, every cell
        equal to it is replaced with NaN before splitting.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Feature frames (identifier column kept, ``target`` removed).
    y_train, y_test : pandas.DataFrame
        Two-column frames holding ``[id_name, target]``.
    """
    if id_name == "":
        # No identifier supplied: turn the index into an "id" column.
        df = df.reset_index().rename(columns={"index": "id"})
        id_name = 'id'

    if null_name != "":
        # mask() returns a new frame; the previous in-place assignment wrote
        # into the caller's DataFrame whenever id_name was supplied.
        df = df.mask(df == null_name, np.nan)

    # Fixed 80/20 split; the seed keeps the partition identical on every run.
    X_train, X_test = train_test_split(df, test_size=0.2, random_state=2021)

    y_train = X_train[[id_name, target]]
    X_train = X_train.drop(columns=[target])

    y_test = X_test[[id_name, target]]
    X_test = X_test.drop(columns=[target])
    return X_train, X_test, y_train, y_test
    
# Read the insurance table and split it with 'charges' as the target.
df = pd.read_csv("../input/insurance/insurance.csv")
X_train, X_test, y_train, y_test = exam_data_load(df=df, target="charges")

# Display the shape of each of the four resulting frames.
tuple(frame.shape for frame in (X_train, X_test, y_train, y_test))

((1070, 7), (268, 7), (1070, 2), (268, 2))

In [59]:
# Preview the first five rows of the training features.
X_train.head(n=5)

Unnamed: 0,id,age,sex,bmi,children,smoker,region
209,209,40,male,41.23,1,no,northeast
540,540,34,female,38.0,3,no,southwest
747,747,19,male,21.755,0,no,northwest
39,39,60,male,39.9,0,yes,southwest
640,640,33,male,42.4,5,no,southwest


In [60]:
# Preview the first five rows of the training targets (id + charges).
y_train.head(n=5)

Unnamed: 0,id,charges
209,209,6610.1097
540,540,6196.448
747,747,1627.28245
39,39,48173.361
640,640,6666.243


In [61]:
# Check for missing values: per-column NaN counts for train, then test.
print(X_train.isna().sum(), X_test.isna().sum(), sep='\n\n')

id          0
age         0
sex         0
bmi         0
children    0
smoker      0
region      0
dtype: int64

id          0
age         0
sex         0
bmi         0
children    0
smoker      0
region      0
dtype: int64


In [62]:
# Inspect the categorical variables: level frequencies in the training features.
print(
    *(X_train[col].value_counts() for col in ('sex', 'smoker', 'region')),
    sep='\n\n',
)


male      551
female    519
Name: sex, dtype: int64

no     845
yes    225
Name: smoker, dtype: int64

southeast    304
northeast    266
southwest    261
northwest    239
Name: region, dtype: int64


In [63]:
# One-hot encode the categorical variables of the training features.
X_train = pd.get_dummies(data=X_train)
X_train.head(n=5)

Unnamed: 0,id,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
209,209,40,41.23,1,0,1,1,0,1,0,0,0
540,540,34,38.0,3,1,0,1,0,0,0,0,1
747,747,19,21.755,0,0,1,1,0,0,1,0,0
39,39,60,39.9,0,0,1,0,1,0,0,0,1
640,640,33,42.4,5,0,1,1,0,0,0,0,1


In [64]:
# One-hot encode the test features, then align them to the (already encoded)
# training columns. Encoding the two splits independently yields different
# column sets whenever a category level occurs in only one of them, which
# would break prediction; reindex adds missing dummies as 0 and drops extras.
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)
X_test.head()

Unnamed: 0,id,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
1088,1088,52,47.74,1,0,1,1,0,0,0,1,0
1157,1157,23,23.18,2,1,0,1,0,0,1,0,0
1267,1267,24,31.065,0,0,1,0,1,1,0,0,0
506,506,22,31.35,1,0,1,1,0,0,1,0,0
659,659,57,28.785,4,1,0,1,0,1,0,0,0


In [65]:
# Standardize bmi: the scaler is fitted on the training split only and the
# same fitted transformation is then applied to the test split.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train['bmi'] = scaler.fit_transform(X_train[['bmi']])
X_test['bmi'] = scaler.transform(X_test[['bmi']])

X_train.head()

Unnamed: 0,id,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
209,209,40,1.707232,1,0,1,1,0,1,0,0,0
540,540,34,1.180775,3,1,0,1,0,0,0,0,1
747,747,19,-1.466991,0,0,1,1,0,0,1,0,0
39,39,60,1.490456,0,0,1,0,1,0,0,0,1
640,640,33,1.89793,5,0,1,1,0,0,0,0,1


In [66]:
# Train/validation split.
# Remove the identifier so it is not used as a feature. errors='ignore' keeps
# this cell re-runnable after 'id' has already been dropped.
X_train = X_train.drop(columns='id', errors='ignore')
X_test = X_test.drop(columns='id', errors='ignore')

from sklearn.model_selection import train_test_split

# Hold out 10% of the training data for validation. The fixed seed makes the
# partition (and every score computed from it) reproducible across runs.
X_, X_val, y_, y_val = train_test_split(
    X_train, y_train['charges'], test_size=0.1, random_state=2021
)

print(X_.shape, X_val.shape, y_.shape, y_val.shape)

(963, 11) (107, 11) (963,) (107,)


In [69]:
# Regression: fit a depth-limited random forest on the training split.
from sklearn.ensemble import RandomForestRegressor

# random_state fixes the bootstrap/feature sampling so results are reproducible.
rf_r = RandomForestRegressor(max_depth=4, random_state=2021)
rf_r.fit(X_, y_)
pred = rf_r.predict(X_test)

# score() on a regressor returns R^2 (not accuracy), and the second value is
# computed on the validation split (not the test set), so label them as such.
print('훈련 R2:', rf_r.score(X_, y_))
print('검증 R2:', rf_r.score(X_val, y_val))

훈련 정확도: 0.8834737643114162
테스트 정확도: 0.8175881579800086


In [73]:
# RMSE of the test-set predictions against the true charges.
from sklearn.metrics import mean_squared_error
import numpy as np

rmse = np.sqrt(mean_squared_error(y_true=y_test['charges'], y_pred=pred))
print(rmse)

4588.6802804652625


In [74]:
# Save the results: one row per test id with its prediction, no index column.
result = y_test[['id']].assign(pred=pred)
result.to_csv('/kaggle/working/result.csv', index=False)