In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the insurance dataset; the path is resolved relative to the current
# working directory.
df = pd.read_csv('insurance.csv')

# Encode the categorical 'smoker' column as 1 ('yes') / 0 ('no') so it can be
# used as a numeric feature.
df['smoker_num'] = df['smoker'].map({'yes': 1, 'no': 0})

# Series.map yields NaN for every value that is not a key of the mapping
# (different casing, stray whitespace, missing entries). scikit-learn's
# LinearRegression rejects NaN inputs, so fail right here with a message that
# names the offending values instead of a generic NaN error later on.
unmapped_smoker = df.loc[df['smoker_num'].isna(), 'smoker']
if not unmapped_smoker.empty:
    raise ValueError(
        "Unexpected values in 'smoker' column (expected 'yes' or 'no'): "
        f"{unmapped_smoker.unique().tolist()}"
    )

# Feature matrix (numeric predictors) and regression target.
X = df[['age', 'bmi', 'children', 'smoker_num']]
y = df['charges']

# Quick visual sanity check of the loaded data, including the new column.
print("Перші 5 рядків:")
print(df.head())

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a linear regression on the training part (fit() returns the estimator
# itself) and predict the target for the held-out rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Error metrics on the held-out rows; RMSE is the square root of MSE.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

print(
    f"MAE: {mae:.2f} $",
    f"RMSE: {rmse:.2f} $",
    f"R2 Score: {r2:.2f}",
    sep="\n",
)

# One row per feature: its column name next to the fitted coefficient.
coef_df = pd.DataFrame({'Фактор': X.columns, 'Вага': model.coef_})
print("\nВплив факторів:", coef_df, sep="\n")

# Visualization (actual vs. predicted)
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, alpha=0.5, color='purple')

# Dashed red diagonal over the range of the actual values; a point lying on
# it has a prediction equal to the actual value.
diag_lo, diag_hi = y_test.min(), y_test.max()
ax.plot([diag_lo, diag_hi], [diag_lo, diag_hi], 'r--', lw=2)

ax.set_xlabel('Реальна ціна')
ax.set_ylabel('Прогноз')
ax.set_title('Kaggle Dataset (Insurance)')
plt.show()