In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder
import joblib

# --- Data loading -----------------------------------------------------------
# Read the patient table from disk and report its (rows, columns) shape.
df = pd.read_csv("patient_health_data.csv")
print(f"Shape: {df.shape}")

# --- Categorical encoding ---------------------------------------------------
# Replace the smoking_status column in place with the integer codes produced
# by a LabelEncoder. The fitted encoder is kept in `le` so it can be saved
# below alongside the model.
# NOTE(review): the encoder is fitted on the full frame, before the
# train/test split. scikit-learn documents LabelEncoder as a target-label
# tool (OrdinalEncoder is the feature-side counterpart); it is kept here
# because the pickled encoder is part of this script's output.
le = LabelEncoder()
smoking_codes = le.fit_transform(df['smoking_status'])
df['smoking_status'] = smoking_codes

# --- Feature / target selection ---------------------------------------------
# Predictor columns, in the order the model is trained on.
feature_columns = [
    'age',
    'bmi',
    'blood_pressure',
    'cholesterol',
    'glucose',
    'insulin',
    'heart_rate',
    'activity_level',
    'diet_quality',
    'smoking_status',
    'alcohol_intake',
]
X = df[feature_columns]
y = df['health_risk_score']

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# --- Linear regression ------------------------------------------------------
# Fit on the training split, then score R2 on the held-out split.
lr_model = LinearRegression().fit(X_train, y_train)
test_predictions = lr_model.predict(X_test)
linear_r2 = r2_score(y_test, test_predictions)
print(f"Linear R2: {linear_r2}")

# --- Persistence ------------------------------------------------------------
# Save the fitted model and the smoking_status encoder as separate files.
# The encoder dump stays last, as a bare expression, so that a notebook cell
# still echoes its return value.
joblib.dump(lr_model, "health_risk_model.pkl")
joblib.dump(le, "smoking_le.pkl")

Shape: (250, 12)
Linear R2: 0.7643620906757488


['smoking_le.pkl']