This cell imports the libraries used throughout the notebook (numpy, pandas, os, matplotlib, and seaborn), seeds numpy's random number generator for reproducibility, and sets the seaborn plotting style.

In [None]:
# --- Imports: standard library first, then third-party ---
import os

import numpy as np
import pandas as pd

# Plotting stack
import matplotlib.pyplot as plt
import seaborn as sns

# --- Configuration ---
# Fixed seed so numpy-based random operations give the same results on every run
SEED = 42
np.random.seed(SEED)

# Apply seaborn's white-grid look to all subsequent figures
sns.set(style="whitegrid")

This cell mounts Google Drive so that the files stored there can be accessed from the notebook.

In [None]:
# Colab-only step: expose Google Drive under /content/drive so the project
# files referenced by the Drive paths used later in this notebook are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


This cell defines the file paths for the training, testing, and sample submission datasets and then loads them into pandas DataFrames.

In [None]:
# Locations of the project CSV files on the mounted Drive
train_path = '/content/drive/MyDrive/ml_project/data/train.csv'
test_path = '/content/drive/MyDrive/ml_project/data/test.csv'
sample_submission_path = '/content/drive/MyDrive/ml_project/data/sample_submission.csv'

# Read the three CSVs in order; each result is bound to the matching name on the left
train_df, test_df, sample_submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (train_path, test_path, sample_submission_path)
)

In [None]:
def add_bmi(frame):
    """Add a 'BMI' column (Weight divided by Height squared) to ``frame`` in place."""
    height_squared = frame['Height'] ** 2
    frame['BMI'] = frame['Weight'] / height_squared


# Derive the same feature on train and test so both frames keep matching columns
for dataset in (train_df, test_df):
    add_bmi(dataset)


In [None]:
# Preview the engineered BMI column on the training frame
train_df["BMI"]

Unnamed: 0,BMI
0,28.259565
1,23.422091
2,17.126706
3,44.855798
4,25.599151
...,...
15528,17.301038
15529,17.861680
15530,17.450915
15531,26.146548


In [None]:
# Preview the engineered BMI column on the test frame
test_df['BMI']

Unnamed: 0,BMI
0,43.880091
1,26.002130
2,27.163625
3,38.335708
4,23.306680
...,...
5220,36.587084
5221,17.099278
5222,31.889841
5223,28.899693


In [None]:
# Find the string-typed (object dtype) columns, i.e. the ones that still need encoding
object_columns_frame = train_df.select_dtypes(include=['object'])
categorical_cols = object_columns_frame.columns
print(f"Categorical columns: {categorical_cols.tolist()}")

Categorical columns: ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS', 'WeightCategory']


In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder

# Columns with a natural order are integer-coded; purely nominal ones are one-hot encoded
ordinal_features = ['CAEC', 'CALC']
onehot_features = ['Gender', 'family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC', 'MTRANS']

# Define ordinal mappings (order matters!): a value's position in the list is its encoded number
caec_order = ['no', 'Sometimes', 'Frequently', 'Always']
calc_order = ['no', 'Sometimes', 'Frequently', 'Always']

# One category list per column, given in the same order as `ordinal_features`
ordinal_mapping = OrdinalEncoder(categories=[caec_order, calc_order])

# Fit and transform on train, only transform on test
train_df[ordinal_features] = ordinal_mapping.fit_transform(train_df[ordinal_features])
test_df[ordinal_features] = ordinal_mapping.transform(test_df[ordinal_features])

# One-Hot Encode nominal features.
# NOTE: per the scikit-learn docs, combining drop='first' with handle_unknown='ignore'
# encodes a category unseen during fit as all zeros, which is indistinguishable
# from the dropped first category.
encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
encoded_train = encoder.fit_transform(train_df[onehot_features])
encoded_test = encoder.transform(test_df[onehot_features])

# Wrap the encoded arrays in DataFrames that reuse the source frame's index.
# pd.concat(axis=1) aligns on index, so a fresh 0..n-1 index would silently
# misalign rows (and introduce NaNs) if the source frame's index is not the default.
encoded_columns = encoder.get_feature_names_out(onehot_features)
encoded_train_df = pd.DataFrame(encoded_train, columns=encoded_columns, index=train_df.index)
encoded_test_df = pd.DataFrame(encoded_test, columns=encoded_columns, index=test_df.index)

# Drop old columns and join back
train_df = pd.concat([train_df.drop(columns=onehot_features), encoded_train_df], axis=1)
test_df = pd.concat([test_df.drop(columns=onehot_features), encoded_test_df], axis=1)

# Encode target variable; `le` is kept so predictions can be mapped back to class names
le = LabelEncoder()
train_df['WeightCategory'] = le.fit_transform(train_df['WeightCategory'])


In [None]:
# Inspect the target column after label encoding (values are now integer class codes)
train_df["WeightCategory"]

Unnamed: 0,WeightCategory
0,6
1,1
2,0
3,4
4,6
...,...
15528,0
15529,0
15530,0
15531,5


In [None]:
# Re-run the dtype check after encoding; an empty list means no string columns remain
remaining_object_frame = train_df.select_dtypes(include=['object'])
categorical_cols = remaining_object_frame.columns
print(f"Categorical columns: {categorical_cols.tolist()}")

Categorical columns: []


In [None]:
# (rows, columns) of the test frame after feature engineering and encoding
test_df.shape

(5225, 21)

In [None]:
# Inspect the sample submission to see the expected output layout (id, WeightCategory)
sample_submission_df

Unnamed: 0,id,WeightCategory
0,20758,Normal_Weight
1,20759,Normal_Weight
2,20760,Normal_Weight
3,20761,Normal_Weight
4,20762,Normal_Weight
...,...,...
13835,34593,Normal_Weight
13836,34594,Normal_Weight
13837,34595,Normal_Weight
13838,34596,Normal_Weight


In [None]:
# Build the code -> class-name lookup directly from the fitted LabelEncoder (`le`),
# so it cannot drift from the encoding actually applied to WeightCategory.
# str() keeps the values as plain Python strings.
label_map = {index: str(label) for index, label in enumerate(le.classes_)}
print(label_map)

# Decode predictions only when they exist in this session. `y_test_pred` is not
# created by any of the preprocessing cells shown before this one, so referencing
# it unconditionally raises NameError on a fresh kernel (Restart & Run All).
if 'y_test_pred' in globals():
    y_test_labels = [label_map[num] for num in y_test_pred]



{0: 'Insufficient_Weight', 1: 'Normal_Weight', 2: 'Obesity_Type_I', 3: 'Obesity_Type_II', 4: 'Obesity_Type_III', 5: 'Overweight_Level_I', 6: 'Overweight_Level_II'}


In [None]:
from sklearn.preprocessing import StandardScaler

# Columns to standardise; the encoded categorical columns are not in this list
numeric_features = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE','BMI']

# Learn the scaling statistics from the training rows only (never fit on test)
scaler = StandardScaler().fit(train_df[numeric_features])

# Apply the identical, train-derived transformation to both frames
train_df[numeric_features] = scaler.transform(train_df[numeric_features])
test_df[numeric_features] = scaler.transform(test_df[numeric_features])


In [None]:
# Summary statistics after scaling; the standardised columns should show mean ~0 and std ~1
train_df.describe()

Unnamed: 0,id,Age,Height,Weight,FCVC,NCP,CAEC,CH2O,FAF,TUE,...,BMI,Gender_Male,family_history_with_overweight_yes,FAVC_yes,SMOKE_yes,SCC_yes,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
count,15533.0,15533.0,15533.0,15533.0,15533.0,15533.0,15533.0,15533.0,15533.0,15533.0,...,15533.0,15533.0,15533.0,15533.0,15533.0,15533.0,15533.0,15533.0,15533.0,15533.0
mean,7766.0,-2.012739e-17,-5.489289e-18,2.6531560000000002e-17,-2.6531560000000002e-17,2.287204e-18,1.151098,2.058483e-18,-2.927621e-17,-7.090332e-18,...,4.181009e-16,0.501062,0.817357,0.913153,0.011395,0.033091,0.001545,0.001931,0.802807,0.021889
std,4484.135201,1.000032,1.000032,1.000032,1.000032,1.000032,0.446058,1.000032,1.000032,1.000032,...,1.000032,0.500015,0.386386,0.28162,0.106141,0.17888,0.039279,0.043906,0.397892,0.146325
min,0.0,-1.733416,-2.85076,-1.850147,-2.717984,-2.491966,0.0,-1.690972,-1.167485,-1.019279,...,-2.086168,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3883.0,-0.673904,-0.7869668,-0.8261901,-0.8343106,0.3391301,1.0,-0.3807209,-1.15906,-1.019279,...,-0.7367637,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,7766.0,-0.184478,0.0009332655,-0.1435521,-0.1896799,0.3391301,1.0,-0.04545908,0.0275238,-0.07881107,...,-0.09974323,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,11649.0,0.3856079,0.7186589,0.9031804,1.049363,0.3391301,1.0,0.8290588,0.7238255,0.6412896,...,0.8116963,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
max,15532.0,6.566094,3.145359,2.930491,1.049363,1.754678,3.0,1.600054,2.417541,2.301858,...,2.980791,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
# Destinations for the preprocessed frames on the mounted Drive
train_save_path = '/content/drive/MyDrive/ml_project/data/train_preprocessed.csv'
test_save_path = '/content/drive/MyDrive/ml_project/data/test_preprocessed.csv'

# Write train first, then test; the row index is omitted from the files
for frame, destination in ((train_df, train_save_path), (test_df, test_save_path)):
    frame.to_csv(destination, index=False)
