In [1]:
import numpy as np
import pandas as pd

# 1. 載入 insurance.csv 為 pandas DataFrame 格式 (0%)

In [2]:
df = pd.read_csv("insurance.csv")
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


# 2. 資料前處理 (e.g., feature encoding) (10%)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


## 2.1 處理重複值

In [4]:
duplicateRows = df[df.duplicated()]
duplicateRows

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
581,19,male,30.59,0,no,northwest,1639.5631


In [5]:
df.drop_duplicates(inplace=True)

## 2.2 Feature encoding

In [6]:
df = pd.get_dummies(data=df, columns=['sex', 'smoker', 'region'])
df.head()

Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,1,0,0,1,0,0,0,1
1,18,33.77,1,1725.5523,0,1,1,0,0,0,1,0
2,28,33.0,3,4449.462,0,1,1,0,0,0,1,0
3,33,22.705,0,21984.47061,0,1,1,0,0,1,0,0
4,32,28.88,0,3866.8552,0,1,1,0,0,1,0,0


In [7]:
df.drop(['sex_female', 'smoker_no'], axis=1)

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.900,0,16884.92400,0,1,0,0,0,1
1,18,33.770,1,1725.55230,1,0,0,0,1,0
2,28,33.000,3,4449.46200,1,0,0,0,1,0
3,33,22.705,0,21984.47061,1,0,0,1,0,0
4,32,28.880,0,3866.85520,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0,0,1,0,0
1334,18,31.920,0,2205.98080,0,0,1,0,0,0
1335,18,36.850,0,1629.83350,0,0,0,0,1,0
1336,21,25.800,0,2007.94500,0,0,0,0,0,1


## 2.3 normalization

In [8]:
df_z_scaled = df.copy()
for column in df.columns:
    df_z_scaled[column] = (df[column] - df[column].mean()) / df[column].std()
display(df_z_scaled)

Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,-1.439879,-0.452990,-0.908894,0.297745,1.009393,-1.009393,-1.968924,1.968924,-0.565334,-0.565334,-0.611409,1.763949
1,-1.511082,0.509231,-0.079412,-0.954024,-0.989953,0.989953,0.507512,-0.507512,-0.565334,-0.565334,1.634343,-0.566486
2,-0.799051,0.383011,1.579552,-0.729100,-0.989953,0.989953,0.507512,-0.507512,-0.565334,-0.565334,1.634343,-0.566486
3,-0.443036,-1.304564,-0.908894,0.718835,-0.989953,0.989953,0.507512,-0.507512,-0.565334,1.767542,-0.611409,-0.566486
4,-0.514239,-0.292347,-0.908894,-0.777208,-0.989953,0.989953,0.507512,-0.507512,-0.565334,1.767542,-0.611409,-0.566486
...,...,...,...,...,...,...,...,...,...,...,...,...
1333,0.767417,0.050250,1.579552,-0.221180,-0.989953,0.989953,0.507512,-0.507512,-0.565334,1.767542,-0.611409,-0.566486
1334,-1.511082,0.205976,-0.908894,-0.914353,1.009393,-1.009393,0.507512,-0.507512,1.767542,-0.565334,-0.611409,-0.566486
1335,-1.511082,1.014110,-0.908894,-0.961927,1.009393,-1.009393,0.507512,-0.507512,-0.565334,-0.565334,1.634343,-0.566486
1336,-1.297473,-0.797226,-0.908894,-0.930705,1.009393,-1.009393,0.507512,-0.507512,-0.565334,-0.565334,-0.611409,1.763949


In [9]:
df_z_scaled.corr()

Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
age,1.0,0.109344,0.041536,0.298308,0.019814,-0.019814,0.025587,-0.025587,0.001868,0.001495,-0.012311,0.009415
bmi,0.109344,1.0,0.012755,0.198401,-0.046397,0.046397,-0.003746,0.003746,-0.138178,-0.136138,0.270057,-0.006211
children,0.041536,0.012755,1.0,0.067389,-0.017848,0.017848,-0.007331,0.007331,-0.023202,0.026044,-0.023492,0.021538
charges,0.298308,0.198401,0.067389,1.0,-0.058044,0.058044,-0.787234,0.787234,0.005945,-0.038695,0.073578,-0.043637
sex_female,0.019814,-0.046397,-0.017848,-0.058044,1.0,-1.0,0.076596,-0.076596,0.002008,0.012482,-0.017578,0.003767
sex_male,-0.019814,0.046397,0.017848,0.058044,-1.0,1.0,-0.076596,0.076596,-0.002008,-0.012482,0.017578,-0.003767
smoker_no,0.025587,-0.003746,-0.007331,-0.787234,0.076596,-0.076596,1.0,-1.0,-0.002597,0.036321,-0.068282,0.037168
smoker_yes,-0.025587,0.003746,0.007331,0.787234,-0.076596,0.076596,-1.0,1.0,0.002597,-0.036321,0.068282,-0.037168
region_northeast,0.001868,-0.138178,-0.023202,0.005945,0.002008,-0.002008,-0.002597,0.002597,1.0,-0.319842,-0.345909,-0.320493
region_northwest,0.001495,-0.136138,0.026044,-0.038695,0.012482,-0.012482,0.036321,-0.036321,-0.319842,1.0,-0.345909,-0.320493


# 3. 將資料集劃分為訓練集與測試集 (2%)

In [10]:
X = df_z_scaled.drop(['charges'], axis=1)
y = df_z_scaled['charges']

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# 4. 建立並訓練以下三種迴歸模型：Linear Regression、SVM (SVR)、Decision Tree (10%)


## 4.1 Linear Regression

In [12]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg = reg.fit(X_train, y_train)
reg_prediction = reg.predict(X_test)

## 4.2 SVR

In [13]:
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
svr = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))
svr = svr.fit(X_train, y_train)
svr_prediction = svr.predict(X_test)

## 4.3 Decision Tree

In [14]:
from sklearn.tree import DecisionTreeRegressor
tree = DecisionTreeRegressor(random_state=0)
tree = tree.fit(X_train, y_train)
tree_prediction = tree.predict(X_test)

# 5. 請以 R2 (coefficient of determination)*、RSME (root mean square error)、MAE (mean absolute error)評估/比較三個模型在訓練資料及測試資料上的表現。(8%)


In [15]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [16]:
def getModelScore(model_prediction):
    return [r2_score(y_test, model_prediction), mean_squared_error(y_test, model_prediction), mean_absolute_error(y_test, model_prediction)]

In [17]:
reg_list = getModelScore(reg_prediction)
svr_list = getModelScore(svr_prediction)
tree_list = getModelScore(tree_prediction)

In [18]:
data = [reg_list, svr_list, tree_list]
score_df = pd.DataFrame(data, index=['Linear Regression', 'SVM (SVR)', 'Decision Tree'], columns = ['R2', 'RSME', "MAE"])
score_df

Unnamed: 0,R2,RSME,MAE
Linear Regression,0.762362,0.265344,0.354092
SVM (SVR),0.846765,0.1711,0.291938
Decision Tree,0.673509,0.364556,0.303671
