In [None]:
!pip install ydata-profiling

import pandas as pd
from ydata_profiling import ProfileReport

file_path = "train.csv"

df = pd.read_csv(file_path)

profile = ProfileReport(df, title = "Obeisty Risk Dataset Profiling Report", explorative=True)

profile.to_notebook_iframe()



Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/18 [00:00<?, ?it/s][A
 22%|██▏       | 4/18 [00:00<00:00, 33.53it/s][A
 44%|████▍     | 8/18 [00:00<00:00, 25.61it/s][A
 61%|██████    | 11/18 [00:00<00:00, 17.06it/s][A
 72%|███████▏  | 13/18 [00:00<00:00, 14.93it/s][A
 89%|████████▉ | 16/18 [00:00<00:00, 16.11it/s][A
100%|██████████| 18/18 [00:01<00:00, 17.07it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Write the profiling report to a standalone HTML file next to the notebook.
report_path = "obesity_risk_profiling.html"
profile.to_file(report_path)

# Offer the file as a browser download when running inside Google Colab.
# Outside Colab the `google.colab` package is missing; the report has already
# been written to disk at this point, so just say where it is instead of failing.
try:
    from google.colab import files
except ImportError:
    print(f"Report saved to {report_path} (google.colab not available, skipping download).")
else:
    files.download(report_path)


Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Preview the first rows. Leaving the frame as the cell's last expression lets
# the notebook render it as a table instead of a line-wrapped text dump.
df.head()

   id  Gender        Age    Height      Weight family_history_with_overweight  \
0   0    Male  24.443011  1.699998   81.669950                            yes   
1   1  Female  18.000000  1.560000   57.000000                            yes   
2   2  Female  18.000000  1.711460   50.165754                            yes   
3   3  Female  20.952737  1.710730  131.274851                            yes   
4   4    Male  31.641081  1.914186   93.798055                            yes   

  FAVC      FCVC       NCP        CAEC SMOKE      CH2O SCC       FAF  \
0  yes  2.000000  2.983297   Sometimes    no  2.763573  no  0.000000   
1  yes  2.000000  3.000000  Frequently    no  2.000000  no  1.000000   
2  yes  1.880534  1.411685   Sometimes    no  1.910378  no  0.866045   
3  yes  3.000000  3.000000   Sometimes    no  1.674061  no  1.467863   
4  yes  2.679664  1.971472   Sometimes    no  1.979848  no  1.967973   

        TUE       CALC                 MTRANS       WeightCategory  
0  0.976473

In [5]:
import pandas as pd
import joblib  # <-- 1. NEW IMPORT
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# --- 1. Load Data & Feature Engineering ---
def _with_bmi(frame):
    """Return a copy of `frame` with a `BMI` column set to Weight / Height**2."""
    return frame.assign(BMI=frame['Weight'] / (frame['Height'] ** 2))


print("Creating BMI feature...")
df = _with_bmi(pd.read_csv('train.csv'))
test_df = _with_bmi(pd.read_csv('test.csv'))
print("BMI feature created successfully.")
print("-" * 30)

# --- 2. Handle 'id' Column and Create X/y ---
# Set the test ids aside before the column is removed; they are needed again
# when the submission file is assembled.
test_ids = test_df['id']
df = df.drop(columns='id')
test_df = test_df.drop(columns='id')
print("Successfully dropped 'id' column.")
print("-" * 30)

# Target is the 'WeightCategory' column; every remaining column is a feature.
y_train = df['WeightCategory']
X_train = df.drop(columns='WeightCategory')

# --- 3. Define Column Lists ---
# Partition the feature columns by dtype so each group gets its own transformer.
numerical_cols = X_train.select_dtypes(include=['number']).columns
categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns
for heading, columns in (
    ("Categorical columns:", categorical_cols),
    ("\nNumerical columns (now includes BMI):", numerical_cols),
):
    print(heading)
    print(columns)
print("-" * 30)

# --- 4. Create Preprocessing Pipeline ---
# Numeric columns are standardized; categorical columns are one-hot encoded,
# with categories unseen during fitting ignored at transform time.
numeric_step = ('num', StandardScaler(), numerical_cols)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# --- 5. Fit Preprocessor and Transform Training Data ---
X_train_processed = preprocessor.fit_transform(X_train)
print(f"Preprocessing complete! Training data shape: {X_train_processed.shape}")
print("-" * 30)

# --- 6. Encode Target Variable (y) ---
# Fit the encoder on the training labels, then map those labels to integer codes.
le = LabelEncoder().fit(y_train)
y_train_encoded = le.transform(y_train)

# --- 7. Hyperparameter Tuning with GridSearchCV ---
print("Starting hyperparameter tuning with GridSearchCV...")

# Search space: 2 values for each of 4 hyperparameters.
param_grid = dict(
    n_estimators=[100, 200],
    learning_rate=[0.05, 0.1],
    max_depth=[3, 5],
    subsample=[0.8, 1.0],
)

# Base estimator; the seed is fixed so repeated runs are comparable.
xgb = XGBClassifier(
    objective='multi:softmax',
    eval_metric='mlogloss',
    random_state=42,
)

# 3-fold search scored by accuracy, using all available cores.
grid_search = GridSearchCV(
    xgb,
    param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_processed, y_train_encoded)

print("Tuning complete!")
print("Best parameters found: ", grid_search.best_params_)
print("Best accuracy found on validation data: ", grid_search.best_score_)
print("-" * 30)

# --- Get the best model from the grid search ---
model = grid_search.best_estimator_

# --- 7.5. NEW: Save Model (PKL Dump) & Get Feature Importances ---

print("Saving model and preprocessor...")
# "Dumping" the model to a .pkl file
joblib.dump(model, 'xgb_best_model.pkl')
# We must ALSO save the preprocessor to reproduce our results
joblib.dump(preprocessor, 'preprocessor.pkl')
print("Successfully saved 'xgb_best_model.pkl' and 'preprocessor.pkl'")
# The model was trained on integer-encoded labels, so its predictions are
# integer codes. Persist the fitted label encoder as well; without it the
# saved artifacts cannot map predictions back to the WeightCategory labels.
joblib.dump(le, 'label_encoder.pkl')
print("Successfully saved 'label_encoder.pkl'")
print("-" * 30)

print("Getting feature importances...")
# Get importance scores from the best model
importances = model.feature_importances_
# Get the feature names from the preprocessor
feature_names = preprocessor.get_feature_names_out()
# Guard against pairing names and scores that do not line up one-to-one.
assert len(feature_names) == len(importances), (
    "feature names and importances have different lengths"
)

# Create a DataFrame for easy viewing
feature_importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
})

# Sort by importance (highest first)
feature_importance_df = feature_importance_df.sort_values(by='importance', ascending=False)

# Save all importances to a CSV
feature_importance_df.to_csv('feature_importances.csv', index=False)
print("Successfully saved 'feature_importances.csv'")

print("\n--- Top 10 Most Important Features ---")
print(feature_importance_df.head(10))
print("-" * 30)

# --- 8. Process Test Data and Make Predictions ---
# Apply the already-fitted preprocessor to the test features (transform only,
# no refit), then predict with the tuned model.
test_df_processed = preprocessor.transform(test_df)
predictions_encoded = model.predict(test_df_processed)
# Convert the integer class codes back to the original label values.
predictions_original_labels = le.inverse_transform(predictions_encoded)

# --- 9. Create and Save Submission File ---
# Two columns: the test ids set aside earlier and the decoded predictions.
submission_dict = dict(
    id=test_ids,
    WeightCategory=predictions_original_labels,
)
submission_df = pd.DataFrame(data=submission_dict)
submission_df.to_csv('submission_bmi_grid_search.csv', index=False)

print("Successfully created 'submission_bmi_grid_search.csv'!")
print(submission_df.head())

Creating BMI feature...
BMI feature created successfully.
------------------------------
Successfully dropped 'id' column.
------------------------------
Categorical columns:
Index(['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE',
       'SCC', 'CALC', 'MTRANS'],
      dtype='object')

Numerical columns (now includes BMI):
Index(['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE', 'BMI'], dtype='object')
------------------------------
Preprocessing complete! Training data shape: (15533, 31)
------------------------------
Starting hyperparameter tuning with GridSearchCV...
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Tuning complete!
Best parameters found:  {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.8}
Best accuracy found on validation data:  0.9033672024744043
------------------------------
Saving model and preprocessor...
Successfully saved 'xgb_best_model.pkl' and 'preprocessor.pkl'
----------------------