In [6]:
# Import library yang dibutuhkan
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import pickle

In [7]:
# Load the insurance dataset from a CSV file in the working directory.
df = pd.read_csv('insurance1.csv')

# Show the dataset structure. DataFrame.info() writes its report straight to
# stdout and returns None, so it is called bare: wrapping it in print() would
# append a stray "None" line after the report.
df.info()
print("\nDeskripsi Dataset:")
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 348 entries, 0 to 347
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       348 non-null    int64  
 1   sex       348 non-null    int64  
 2   bmi       348 non-null    float64
 3   children  348 non-null    int64  
 4   smoker    348 non-null    int64  
 5   charges   348 non-null    float64
dtypes: float64(2), int64(4)
memory usage: 16.4 KB
None

Deskripsi Dataset:
              age         sex         bmi    children      smoker  \
count  348.000000  348.000000  348.000000  348.000000  348.000000   
mean    39.591954    0.508621   30.676552    1.091954    0.232759   
std     14.417015    0.500646    5.625850    1.192021    0.423198   
min     18.000000    0.000000   15.960000    0.000000    0.000000   
25%     27.000000    0.000000   26.782500    0.000000    0.000000   
50%     40.000000    1.000000   30.300000    1.000000    0.000000   
75%     53.000000    1.00

In [8]:
# The 'charges' column is the regression target; every other column is a feature.
y = df['charges']
X = df.drop(columns='charges')

# Report the dimensions of the feature matrix and the target vector.
print("Bentuk fitur (X):", X.shape)
print("Bentuk target (y):", y.shape)

Bentuk fitur (X): (348, 5)
Bentuk target (y): (348,)


In [9]:
# Hold out 20% of the rows as a test set; random_state is pinned so the same
# partition is produced on every run.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

# Report the dimensions of each feature partition.
for label, part in (("Bentuk data training:", X_train), ("Bentuk data testing:", X_test)):
    print(label, part.shape)

Bentuk data training: (278, 5)
Bentuk data testing: (70, 5)


In [10]:
# Standardize the features. The scaler is fitted on the training split only,
# and that same fitted transformation is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Sanity check: per-column mean and standard deviation of the scaled training data.
print("Data setelah scaling:")
print("Mean training:", np.mean(X_train_scaled, axis=0))
print("Std training:", np.std(X_train_scaled, axis=0))

Data setelah scaling:
Mean training: [-8.94568193e-17  0.00000000e+00  5.62300007e-16 -5.75079552e-17
 -1.43769888e-17]
Std training: [1. 1. 1. 1. 1.]


In [11]:
# Fit a linear regression model on the scaled training features.
model = LinearRegression().fit(X_train_scaled, y_train)

# Evaluate with model.score() on the training split and on the held-out split.
train_score, test_score = (
    model.score(features, target)
    for features, target in ((X_train_scaled, y_train), (X_test_scaled, y_test))
)

print(f"Train R2 Score: {train_score:.4f}")
print(f"Test R2 Score: {test_score:.4f}")

Train R2 Score: 0.7602
Test R2 Score: 0.7941


In [12]:
# Serialize the fitted model and the fitted scaler to separate pickle files
# in the working directory.
for path, obj in (('insurance_model.pkl', model), ('scaler.pkl', scaler)):
    with open(path, 'wb') as out_file:
        pickle.dump(obj, out_file)

print("Model dan scaler berhasil disimpan.")

Model dan scaler berhasil disimpan.


In [13]:
# Print one "<feature>: <coefficient>" line per feature column.
# The model was fitted on standardized features, so these coefficients are
# expressed per standard deviation of each feature, not per raw unit.
fitur, koefisien = X.columns, model.coef_

pasangan = zip(fitur, koefisien)
for f, k in pasangan:
    print(f"{f}: {k:.4f}")

age: 3775.3126
sex: 3.7983
bmi: 2022.0578
children: 732.4790
smoker: 9724.4864
