In [1]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [9]:
# Load the raw dataset and separate the features from the target label.
df = pd.read_csv("../data/ObesityDataSet_raw_and_data_sinthetic.csv")
X = df.drop(columns=["NObeyesdad"])
y = df["NObeyesdad"]

# One-hot encode the categorical columns on the full dataset so the train and
# test frames share exactly the same dummy columns.
X_encoded = pd.get_dummies(X, columns=[
    "Gender", "family_history_with_overweight", "FAVC",
    "CAEC", "SMOKE", "SCC", "CALC", "MTRANS"
])

# Persist the encoded column order (taken before the train/test split).
joblib.dump(X_encoded.columns.tolist(), "selected_feature_columns.pkl")

# Split BEFORE scaling: fitting the scaler on the full dataset would leak
# test-set statistics (mean/std) into the preprocessing step.
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Work on explicit copies so the column assignments below do not trigger
# pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training rows only, then apply it to both splits.
numerical_cols = X_train.select_dtypes(include=["int64", "float64"]).columns
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

In [10]:
# Fit a 100-tree random forest with a fixed seed on the training split.
# `fit` returns the estimator itself, so construction and fitting are chained.
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict on the held-out split and report accuracy to four decimals.
preds = model.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=preds)
print(f"✅ Test Accuracy: {acc:.4f}")

✅ Test Accuracy: 0.9314


In [11]:
# Persist the trained classifier and the fitted scaler to the working directory.
# The final call is left as the cell's last expression so its return value
# (the list of written filenames) is still displayed.
joblib.dump(value=model, filename="model_rf33.pkl")
joblib.dump(value=scaler, filename="scaler.pkl")

['scaler.pkl']

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import joblib
import os

# 1. Load full dataset and separate the target column from the features
df = pd.read_csv("../data/ObesityDataSet_raw_and_data_sinthetic.csv")
target_col = "NObeyesdad"
y = df[target_col]
X = df.drop(columns=[target_col])

# 2. One-hot encode BEFORE split so both splits share the same dummy columns
X_encoded = pd.get_dummies(X, columns=[
    "Gender", "family_history_with_overweight", "FAVC",
    "CAEC", "SMOKE", "SCC", "CALC", "MTRANS"
])

# 3. Save column names BEFORE train-test split
os.makedirs("../lab8app", exist_ok=True)
joblib.dump(X_encoded.columns.tolist(), "../lab8app/selected_feature_columns.pkl")

# 4. Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Take explicit copies: assigning columns into the frames returned by the
# split can otherwise raise pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# 5. Scale numerical columns (fit only on train, apply to both)
numerical_cols = X_train.select_dtypes(include=["float64", "int64"]).columns
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# 6. Save the fitted scaler
joblib.dump(scaler, "../lab8app/scaler.pkl")

# 7. Train the model (fixed seed for reproducibility)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 8. Save the model
joblib.dump(model, "../lab8app/model_rf33.pkl")

# 9. Evaluate on the held-out split
test_accuracy = model.score(X_test, y_test)
print(f"✅ Test Accuracy: {test_accuracy:.4f}")

✅ Test Accuracy: 0.9314


In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import joblib
import os

# Load the raw dataset and separate the features from the target label.
df = pd.read_csv("../data/ObesityDataSet_raw_and_data_sinthetic.csv")
X = df.drop(columns=["NObeyesdad"])
y = df["NObeyesdad"]

# Encode all possible dummy variables on the full dataset so both splits
# end up with the same columns.
X_encoded = pd.get_dummies(X, columns=[
    "Gender", "family_history_with_overweight", "FAVC",
    "CAEC", "SMOKE", "SCC", "CALC", "MTRANS"
])

# Save the full column list before splitting
os.makedirs("../lab8app", exist_ok=True)
joblib.dump(X_encoded.columns.tolist(), "../lab8app/selected_feature_columns.pkl")

# Split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Take explicit copies: assigning columns into the frames returned by the
# split can otherwise raise pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Select the numeric columns and persist their names.
# The list saved here must be the `numeric_cols` computed in this cell; the
# previous code dumped `numerical_cols`, a variable left over from an earlier
# cell, which fails with NameError when this cell runs in a fresh kernel.
numeric_cols = X_train.select_dtypes(include=["float64", "int64"]).columns
joblib.dump(numeric_cols.tolist(), "../lab8app/numeric_columns.pkl")

# Scale numeric columns: fit on the training rows only, apply to both splits.
scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])
joblib.dump(scaler, "../lab8app/scaler.pkl")

# Train model (fixed seed for reproducibility) and persist it
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
joblib.dump(model, "../lab8app/model_rf33.pkl")

# Evaluate on the held-out split
print("✅ Test accuracy:", model.score(X_test, y_test))

✅ Test accuracy: 0.9314420803782506


In [22]:
# Display the column index of the training frame (numeric features followed by
# the one-hot dummy columns) to check the layout the model was trained on.
X_train.columns

Index(['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE',
       'Gender_Female', 'Gender_Male', 'family_history_with_overweight_no',
       'family_history_with_overweight_yes', 'FAVC_no', 'FAVC_yes',
       'CAEC_Always', 'CAEC_Frequently', 'CAEC_Sometimes', 'CAEC_no',
       'SMOKE_no', 'SMOKE_yes', 'SCC_no', 'SCC_yes', 'CALC_Always',
       'CALC_Frequently', 'CALC_Sometimes', 'CALC_no', 'MTRANS_Automobile',
       'MTRANS_Bike', 'MTRANS_Motorbike', 'MTRANS_Public_Transportation',
       'MTRANS_Walking'],
      dtype='object')