In [2]:
# 시험환경 세팅 (코드 변경 X)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def exam_data_load(df, target, id_name="", null_name=""):
    """Split ``df`` into train/test feature and target frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset. It is not modified by this function.
    target : str
        Name of the target column.
    id_name : str, optional
        Name of an existing identifier column. When empty, the current index
        is moved into a column via ``reset_index`` (renamed from ``"index"``
        to ``"id"``) and ``"id"`` is used as the identifier.
    null_name : str, optional
        Placeholder value marking missing data. When non-empty, every cell
        equal to it is replaced with ``np.nan``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``. The ``X_*`` frames contain
        every column except ``target``; the ``y_*`` frames contain the
        ``[id_name, target]`` columns.
    """
    if id_name == "":
        # No id column supplied: expose the current index as an "id" column.
        df = df.reset_index().rename(columns={"index": "id"})
        id_name = 'id'

    if null_name != "":
        # mask() returns a new frame with NaN wherever the condition holds,
        # so the caller's DataFrame is left untouched.
        df = df.mask(df == null_name)

    # 80/20 split with a pinned random_state so the split is reproducible.
    X_train, X_test = train_test_split(df, test_size=0.2, random_state=2021)

    y_train = X_train[[id_name, target]]
    X_train = X_train.drop(columns=[target])

    y_test = X_test[[id_name, target]]
    X_test = X_test.drop(columns=[target])
    return X_train, X_test, y_train, y_test
    
# Load the insurance dataset and split it with 'charges' as the target column.
df = pd.read_csv("../input/insurance/insurance.csv")
X_train, X_test, y_train, y_test = exam_data_load(df, target='charges')

# Display the shapes of the four resulting frames.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1070, 7), (268, 7), (1070, 2), (268, 2))

# EDA

In [3]:
# Print the shapes of the training features, test features and training targets.
print(X_train.shape, X_test.shape, y_train.shape)

(1070, 7) (268, 7) (1070, 2)


In [4]:
# Preview the first rows of the training features.
print(X_train.head())

      id  age     sex     bmi  children smoker     region
209  209   40    male  41.230         1     no  northeast
540  540   34  female  38.000         3     no  southwest
747  747   19    male  21.755         0     no  northwest
39    39   60    male  39.900         0    yes  southwest
640  640   33    male  42.400         5     no  southwest


In [5]:
# Preview the first rows of the test features.
print(X_test.head())

        id  age     sex     bmi  children smoker     region
1088  1088   52    male  47.740         1     no  southeast
1157  1157   23  female  23.180         2     no  northwest
1267  1267   24    male  31.065         0    yes  northeast
506    506   22    male  31.350         1     no  northwest
659    659   57  female  28.785         4     no  northeast


In [6]:
# Print the training targets (id and charges columns).
print(y_train)

        id      charges
209    209   6610.10970
540    540   6196.44800
747    747   1627.28245
39      39  48173.36100
640    640   6666.24300
...    ...          ...
44      44   6079.67150
621    621  40182.24600
1152  1152  40941.28540
57      57  34303.16720
1140  1140   9048.02730

[1070 rows x 2 columns]


In [7]:
# Check for missing values: count nulls per column in the training features.
print(X_train.isnull().sum())

id          0
age         0
sex         0
bmi         0
children    0
smoker      0
region      0
dtype: int64


In [8]:
# Count nulls per column in the test features.
print(X_test.isnull().sum())

id          0
age         0
sex         0
bmi         0
children    0
smoker      0
region      0
dtype: int64


# 전처리

In [9]:
# Show column dtypes and non-null counts of the training features.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1070 entries, 209 to 1140
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        1070 non-null   int64  
 1   age       1070 non-null   int64  
 2   sex       1070 non-null   object 
 3   bmi       1070 non-null   float64
 4   children  1070 non-null   int64  
 5   smoker    1070 non-null   object 
 6   region    1070 non-null   object 
dtypes: float64(1), int64(3), object(3)
memory usage: 66.9+ KB


In [10]:
# List the distinct values of the 'region' column in the test features.
print(X_test['region'].unique())

['southeast' 'northwest' 'northeast' 'southwest']


In [11]:
# Names of the object-dtype columns in the training features.
col = X_train.select_dtypes(include='object').columns
print(col)

Index(['sex', 'smoker', 'region'], dtype='object')


In [12]:
# One-hot encode the categorical columns listed in `col`.

X_train = pd.get_dummies(X_train, columns = col)
X_test = pd.get_dummies(X_test, columns=col)
# get_dummies only creates columns for the categories present in the frame it
# is given, so encoding train and test separately can yield different column
# sets. Align the test frame to the training columns: dummy columns missing
# from test are added as all-zero, and columns unseen in train are dropped.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
print(X_train.head(3))

      id  age     bmi  children  sex_female  sex_male  smoker_no  smoker_yes  \
209  209   40  41.230         1           0         1          1           0   
540  540   34  38.000         3           1         0          1           0   
747  747   19  21.755         0           0         1          1           0   

     region_northeast  region_northwest  region_southeast  region_southwest  
209                 1                 0                 0                 0  
540                 0                 0                 0                 1  
747                 0                 1                 0                 0  


In [13]:
# Keep the test ids aside for the submission file; pop() also removes the
# column from X_test so it is not used as a feature.
X_test_id = X_test.pop('id')
print(X_test.head(1))
# The training ids are not needed again, so just drop the column.
X_train = X_train.drop(columns='id')
print(X_train.head(1))

      age    bmi  children  sex_female  sex_male  smoker_no  smoker_yes  \
1088   52  47.74         1           0         1          1           0   

      region_northeast  region_northwest  region_southeast  region_southwest  
1088                 0                 0                 1                 0  
     age    bmi  children  sex_female  sex_male  smoker_no  smoker_yes  \
209   40  41.23         1           0         1          1           0   

     region_northeast  region_northwest  region_southeast  region_southwest  
209                 1                 0                 0                 0  


# 스케일링


In [14]:
from sklearn.preprocessing import StandardScaler

# Learn the bmi mean/variance from the training features only, then apply the
# same transformation to both the training and the test features.
scaler = StandardScaler().fit(X_train[['bmi']])
X_train['bmi'] = scaler.transform(X_train[['bmi']])
X_test['bmi'] = scaler.transform(X_test[['bmi']])

print(X_train.head(2))
print(X_test.head(2))

     age       bmi  children  sex_female  sex_male  smoker_no  smoker_yes  \
209   40  1.707232         1           0         1          1           0   
540   34  1.180775         3           1         0          1           0   

     region_northeast  region_northwest  region_southeast  region_southwest  
209                 1                 0                 0                 0  
540                 0                 0                 0                 1  
      age       bmi  children  sex_female  sex_male  smoker_no  smoker_yes  \
1088   52  2.768295         1           0         1          1           0   
1157   23 -1.234731         2           1         0          1           0   

      region_northeast  region_northwest  region_southeast  region_southwest  
1088                 0                 0                 1                 0  
1157                 0                 1                 0                 0  


In [15]:
# Convert age to its decade (e.g. 40 -> 4) with vectorised floor division.
# NOTE: this overwrites the column, so re-running the cell divides again.

X_train['age'] = X_train['age'] // 10
X_test['age'] = X_test['age'] // 10

X_train.head(3)

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
209,4,1.707232,1,0,1,1,0,1,0,0,0
540,3,1.180775,3,1,0,1,0,0,0,0,1
747,1,-1.466991,0,0,1,1,0,0,1,0,0


In [16]:
# Extract the target column as a Series (drops the id column of y_train).
y = y_train['charges']
y.head()

209     6610.10970
540     6196.44800
747     1627.28245
39     48173.36100
640     6666.24300
Name: charges, dtype: float64

# train, validation set 나누기

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the training data as a validation set (seeded split).
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y,
    test_size=0.15,
    random_state=2023,
)
print(X_tr.shape, X_val.shape, y_tr.shape, y_val.shape)

(909, 11) (161, 11) (909,) (161,)


# 모델링

In [22]:
# 평가지수
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

def rmse(y, y_pred):
    """Return the root mean squared error between ``y`` and ``y_pred``."""
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

In [23]:
# Random forest

from sklearn.ensemble import RandomForestRegressor

# Pin random_state: the forest is fitted with randomness, so without a seed the
# validation scores (and any later predictions from this model) differ on
# every run.
rfr = RandomForestRegressor(random_state=2023)
rfr.fit(X_tr, y_tr)
pred = rfr.predict(X_val)

print('RMSE:', rmse(y_val, pred))
print('r2 score:', r2_score(y_val,pred))

RMSE: 5795.799436126395
r2 score: 0.7308499988909947


In [24]:
# XGBoost with default hyperparameters, evaluated on the validation split

from xgboost import XGBRegressor

XGB = XGBRegressor().fit(X_tr, y_tr)
pred = XGB.predict(X_val)

print("RMSE:", rmse(y_val, pred))
print('r2 score:', r2_score(y_val, pred))

RMSE: 5957.384820219986
r2 score: 0.7156331296406618


In [26]:
# Refit the random forest on the full training set, then predict the test set.
pred = rfr.fit(X_train, y).predict(X_test)

# Submission.csv 만들기

In [27]:
# Pair each test id with its predicted charge and write the submission file.
submission = X_test_id.to_frame('id').assign(charges=pred)
submission.to_csv('000600.csv', index=False)
print(submission.head())

        id       charges
1088  1088   9449.504769
1157  1157   4295.324765
1267  1267  34207.148164
506    506   2862.191114
659    659  15132.571928


In [28]:
# Read the written submission file back and preview it.
check = pd.read_csv('000600.csv')
print(check.head())

     id       charges
0  1088   9449.504769
1  1157   4295.324765
2  1267  34207.148164
3   506   2862.191114
4   659  15132.571928


# 결과 채점

In [29]:
# RMSE of the test-set predictions against the held-out charges.
rmse(y_test['charges'], pred)

5023.646319279271