In [19]:
# 시험환경 세팅 (코드 변경 X)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def exam_data_load(df, target, id_name="", null_name=""):
    if id_name == "":
        df = df.reset_index().rename(columns={"index": "id"})
        id_name = 'id'
    else:
        id_name = id_name
    
    if null_name != "":
        df[df == null_name] = np.nan
    
    X_train, X_test = train_test_split(df, test_size=0.2, random_state=2021)
    
    y_train = X_train[[id_name, target]]
    X_train = X_train.drop(columns=[target])

    
    y_test = X_test[[id_name, target]]
    X_test = X_test.drop(columns=[target])
    return X_train, X_test, y_train, y_test 
    
df = pd.read_csv("../../Datasat/insurance/insurance.csv")
X_train, X_test, y_train, y_test = exam_data_load(df, target='charges')

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1070, 7), (268, 7), (1070, 2), (268, 2))

In [20]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1070 entries, 209 to 1140
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        1070 non-null   int64  
 1   age       1070 non-null   int64  
 2   sex       1070 non-null   object 
 3   bmi       1070 non-null   float64
 4   children  1070 non-null   int64  
 5   smoker    1070 non-null   object 
 6   region    1070 non-null   object 
dtypes: float64(1), int64(3), object(3)
memory usage: 66.9+ KB


In [21]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 268 entries, 1088 to 116
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        268 non-null    int64  
 1   age       268 non-null    int64  
 2   sex       268 non-null    object 
 3   bmi       268 non-null    float64
 4   children  268 non-null    int64  
 5   smoker    268 non-null    object 
 6   region    268 non-null    object 
dtypes: float64(1), int64(3), object(3)
memory usage: 16.8+ KB


In [22]:
# X_train.isnull().sum()
X_test.isnull().sum()

id          0
age         0
sex         0
bmi         0
children    0
smoker      0
region      0
dtype: int64

In [23]:
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

In [24]:
X_train.head()

Unnamed: 0,id,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
209,209,40,41.23,1,False,True,True,False,True,False,False,False
540,540,34,38.0,3,True,False,True,False,False,False,False,True
747,747,19,21.755,0,False,True,True,False,False,True,False,False
39,39,60,39.9,0,False,True,False,True,False,False,False,True
640,640,33,42.4,5,False,True,True,False,False,False,False,True


In [25]:
X_train = X_train.drop(columns=['id'],axis=1)
X_test = X_test.drop(columns=['id'],axis=1)


In [26]:
X_train.head()

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
209,40,41.23,1,False,True,True,False,True,False,False,False
540,34,38.0,3,True,False,True,False,False,False,False,True
747,19,21.755,0,False,True,True,False,False,True,False,False
39,60,39.9,0,False,True,False,True,False,False,False,True
640,33,42.4,5,False,True,True,False,False,False,False,True


In [27]:
y_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1070 entries, 209 to 1140
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       1070 non-null   int64  
 1   charges  1070 non-null   float64
dtypes: float64(1), int64(1)
memory usage: 25.1 KB


In [28]:
y = y_train['charges']

In [31]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(X_train, y_train['charges'])

pred = model.predict(X_test)
pred

array([12870.8783869 ,  6284.3501011 , 34266.74027   ,  3325.6990602 ,
       17323.7293313 ,  4304.1916129 ,  9931.982062  , 15449.4416344 ,
        2273.352661  , 14506.840433  , 14582.4059046 , 12624.7959201 ,
       11791.1275721 , 10986.7418965 ,  3024.2654186 , 16920.0235832 ,
       14217.0732729 ,  3084.7879    ,  2451.2641315 , 12701.9572333 ,
       10271.4883215 ,  7315.3153929 ,  2194.5090352 , 19287.4984108 ,
        9458.7020215 , 11701.5371876 ,  5479.503787  ,  6899.4553631 ,
       16909.4527801 ,  4797.5220337 ,  6990.4827065 ,  5852.277905  ,
       14836.4998108 , 17008.3572991 ,  8689.5739998 , 14623.69584283,
        3490.2156302 , 42165.0603206 ,  8910.8907929 ,  8912.0568775 ,
       11504.25474   ,  1741.2711285 ,  2979.7941501 , 12738.9837897 ,
        9616.2578224 ,  2419.5654003 , 19464.4916835 , 14677.877838  ,
       44944.0190587 ,  3127.8851425 ,  5579.3033551 ,  4697.248055  ,
        9261.7403204 , 38934.816812  ,  1591.5973815 , 15506.6580843 ,
      

In [32]:
model.score(X_train, y)

0.9779286360417192