In [None]:
# --- Cell 1: Imports + robust dataset loader ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Remote locations of `insurance.csv`, tried in order; the first one that
# pandas can read is used and the rest are skipped.
CANDIDATE_URLS = [
    # Original FCC repo (may not host the CSV at root)
    "https://raw.githubusercontent.com/freeCodeCamp/boilerplate-linear-regression-health-costs-calculator/main/insurance.csv",
    "https://raw.githubusercontent.com/freeCodeCamp/boilerplate-linear-regression-health-costs-calculator/master/insurance.csv",
    "https://raw.githubusercontent.com/freeCodeCamp/boilerplate-linear-regression-health-costs-calculator/main/data/insurance.csv",
    # Known good public mirror
    "https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/insurance.csv",
]

# Local copies, consulted only after every URL above has failed. This makes
# the "upload the file" advice in the error below work by simply re-running
# the cell, without editing any code.
LOCAL_FALLBACK_PATHS = [
    "insurance.csv",
    "/content/insurance.csv",
]

df = None
last_error = None
for source in [*CANDIDATE_URLS, *LOCAL_FALLBACK_PATHS]:
    try:
        df = pd.read_csv(source, encoding="latin1")
    except Exception as exc:
        # Deliberately broad: any failure (HTTP error, missing file, parse
        # error) just means "try the next candidate".
        last_error = exc
        print(f"Failed to load {source}: {exc}")
    else:
        print(f"Loaded dataset from: {source}")
        break

if df is None:
    # Chain the last underlying failure so the traceback shows why the final
    # candidate was rejected.
    raise RuntimeError(
        "Could not load 'insurance.csv'. Upload it to /content "
        "(or the notebook's working directory) and re-run this cell."
    ) from last_error

# Some copies of the dataset call the target `charges`; expose it under the
# name `expenses` unless an `expenses` column is already present.
if "charges" in df.columns and "expenses" not in df.columns:
    df = df.rename(columns=dict(charges="expenses"))

# Quick preview: first rows, then the (rows, columns) shape.
print(df.head(5))
print(f"\nDataset shape: {df.shape}")

In [None]:
# One-hot encode the columns pandas treats as categorical. `drop_first=True`
# drops the first level of each encoded feature, leaving k-1 indicator
# columns for a k-level feature.
df = df.pipe(pd.get_dummies, drop_first=True)
print("Converted categorical features to numeric.")
df.head(5)

In [None]:
# Target is the `expenses` column; every remaining column is a feature.
y = df["expenses"]
X = df.drop(columns="expenses")

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Train shape: {X_train.shape}")
print(f"Test shape : {X_test.shape}")

In [None]:
# Fit an ordinary least-squares model on the training split
# (`fit` returns the estimator itself, so the call can be chained).
model = LinearRegression().fit(X_train, y_train)

print("Model trained.")
print("Intercept:", model.intercept_)
print(f"Number of coefficients: {model.coef_.shape[0]}")

In [None]:
# Score the held-out split with mean absolute error.
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Absolute Error (MAE): {mae:.2f}")

# The challenge counts as passed only when the MAE is strictly below 3500.
print(
    "✅ Challenge passed! MAE < 3500."
    if mae < 3500
    else "❌ MAE >= 3500. Try re-running or improving preprocessing."
)

In [None]:
import matplotlib.pyplot as plt

# Scatter of predicted against actual values on the test split.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, alpha=0.5, color='teal')

# Dashed red y = x reference line from the origin up to the largest actual
# or predicted value; points on it are perfect predictions.
max_val = max(y_test.max(), y_pred.max())
ax.plot([0, max_val], [0, max_val], 'r--', label='Ideal')

ax.set(
    title="Actual vs Predicted Healthcare Expenses",
    xlabel="Actual expenses",
    ylabel="Predicted expenses",
)
ax.legend()
plt.show()