In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, mean_squared_error, r2_score


In [7]:
df = pd.read_csv('../Data/insurance (1).csv')

In [8]:
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [9]:
df = pd.get_dummies(df, columns=['sex', 'smoker', 'region'], drop_first=True)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               1338 non-null   int64  
 1   bmi               1338 non-null   float64
 2   children          1338 non-null   int64  
 3   charges           1338 non-null   float64
 4   sex_male          1338 non-null   bool   
 5   smoker_yes        1338 non-null   bool   
 6   region_northwest  1338 non-null   bool   
 7   region_southeast  1338 non-null   bool   
 8   region_southwest  1338 non-null   bool   
dtypes: bool(5), float64(2), int64(2)
memory usage: 48.5 KB


In [13]:
X = df.drop('charges', axis=1)
y = df['charges']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
model = Pipeline([
    ('scaler', StandardScaler()),
    ('poly_feature', PolynomialFeatures()),
    ('regressor', LinearRegression())
])

In [16]:
model.fit(X_train, y_train)

In [17]:
y_pred = model.predict(X_test)

In [18]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MAE = {mae:.2f}')
print(f'MSE = {mse:.2f}')
print(f'RMSE = {rmse:.2f}')
print(f'R^2 = {r2:.2f}')

MAE = 2729.50
MSE = 20712805.99
RMSE = 4551.13
R^2 = 0.87


In [19]:
n = X.shape[0]
p = X.shape[1]

adjusted_r_squared = 1 - (1 - r2) / ((n - 1)/(n - p - 1))

print(f'Adjusted R^2 = {adjusted_r_squared:.2f}')

Adjusted R^2 = 0.87


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Memuat dataset
df = pd.read_csv('../Data/insurance (1).csv')

# Preprocessing
# Mengubah variabel kategorikal menjadi numerik menggunakan one-hot encoding
df = pd.get_dummies(df, columns=['sex', 'smoker', 'region'], drop_first=True)

# Mendefinisikan fitur (X) dan target (y)
X = df.drop('charges', axis=1)
y = df['charges']

# Memisahkan data menjadi set pelatihan (training) dan pengujian (testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Membuat pipeline untuk standard scaling, fitur polinomial, dan regresi linear
model = Pipeline([
    ('scaler', StandardScaler()),
    ('poly_features', PolynomialFeatures(degree=2)),
    ('regressor', LinearRegression())
])

# Melatih model
model.fit(X_train, y_train)

# Membuat prediksi pada data pengujian
y_pred = model.predict(X_test)

# Mengevaluasi model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R-squared (R2): {r2:.4f}")

Mean Absolute Error (MAE): 2729.50
Mean Squared Error (MSE): 20712805.99
Root Mean Squared Error (RMSE): 4551.13
R-squared (R2): 0.8666


In [2]:
n = X.shape[0]  # Number of rows in X
p = X.shape[1]  # Number of columns in X (independent variables)

# Calculate adjusted R-squared
adjusted_r_squared = 1 - (1 - r2) * ((n - 1) / (n - p - 1))

print(f"Standard R-squared: {r2}")
print(f"Adjusted R-squared: {adjusted_r_squared}")

Standard R-squared: 0.866583090316484
Adjusted R-squared: 0.8657799787457781


In [3]:
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

In [4]:
mape

np.float64(30.198578022714617)