In [18]:

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib

# Train a Random Forest classifier that predicts ObesityCategory from the
# remaining columns of static/obesity_data.csv, then persist the model, the
# label encoders and the hold-out accuracy under static/.

# Load the dataset; abort with a non-zero status if the file is missing.
try:
    data = pd.read_csv('static/obesity_data.csv')
except FileNotFoundError as e:
    print(f"Error loading file: {e}")
    raise SystemExit(1) from e

# Preview the data.
print(data.head())
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None").
data.info()

# Handle missing values.
data = data.dropna()
if data.empty:
    print("Obesity dataset is empty after dropping NaN values.")
    raise SystemExit(1)

# Define feature columns and target column.
# The target must NOT appear among the features: training on the label itself
# leaks the answer and makes the reported accuracy meaningless.
feature_columns = [
    'Age',
    'Gender',
    'Height',
    'Weight',
    'BMI',
    'PhysicalActivityLevel',
]
target_column = 'ObesityCategory'

# Check that every required column exists before touching them.
missing_columns = [
    col for col in feature_columns + [target_column] if col not in data.columns
]
if missing_columns:
    print(f"Missing columns in dataset: {missing_columns}")
    raise SystemExit(1)

# Encode categorical variables.
# Only non-numeric feature columns are label-encoded; numeric columns keep
# their real values. Each column gets its own encoder so that every mapping
# stays recoverable (a single shared encoder only remembers its last fit).
feature_encoders = {}
for col in feature_columns:
    if not pd.api.types.is_numeric_dtype(data[col]):
        col_encoder = LabelEncoder()
        data[col] = col_encoder.fit_transform(data[col])
        feature_encoders[col] = col_encoder

# The target encoder is fitted exactly once, on the original class labels, so
# inverse_transform() can turn predictions back into category names.
encoder = LabelEncoder()
data[target_column] = encoder.fit_transform(data[target_column])

# Split into features and target.
X_data = data[feature_columns]
y_data = data[target_column]

# Split data into training and testing sets (80/20, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, test_size=0.2, random_state=42
)

# Train a Random Forest model.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model on the held-out split.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Save the model and the target encoder (same paths as before).
joblib.dump(model, 'static/models/obesity.pkl')
joblib.dump(encoder, 'static/encoders/obesity_encoder.pkl')
# Also persist the per-column feature encoders so inference code can encode
# categorical inputs (e.g. Gender) exactly as they were encoded for training.
joblib.dump(feature_encoders, 'static/encoders/obesity_feature_encoders.pkl')

# Save the accuracy score.
with open('static/models/obesity_metrics.txt', 'w') as f:
    f.write(f"Accuracy: {accuracy}\n")


   Age  Gender      Height     Weight        BMI  PhysicalActivityLevel  \
0   56    Male  173.575262  71.982051  23.891783                      4   
1   69    Male  164.127306  89.959256  33.395209                      2   
2   46  Female  168.072202  72.930629  25.817737                      4   
3   32    Male  168.459633  84.886912  29.912247                      3   
4   60    Male  183.568568  69.038945  20.487903                      3   

  ObesityCategory  
0   Normal weight  
1           Obese  
2      Overweight  
3      Overweight  
4   Normal weight  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Age                    1000 non-null   int64  
 1   Gender                 1000 non-null   object 
 2   Height                 1000 non-null   float64
 3   Weight                 1000 non-null   float64
 4   BMI    