In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


# Load the raw obesity-levels dataset; the path is named so it can be changed
# in one place when running outside the original environment.
DATA_PATH = "/kaggle/input/obesity-levels/ObesityDataSet_raw_and_data_sinthetic.csv"
df = pd.read_csv(DATA_PATH)

# Only the final expression of a cell is rendered automatically, so the row
# preview has to be displayed explicitly or its result is silently discarded.
display(df.head(3))
df.info()      # prints its report directly to stdout
df.describe()  # last expression of the cell -> rendered as the cell output

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Age                             2111 non-null   float64
 1   Gender                          2111 non-null   object 
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   CALC                            2111 non-null   object 
 5   FAVC                            2111 non-null   object 
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   SCC                             2111 non-null   object 
 9   SMOKE                           2111 non-null   object 
 10  CH2O                            2111 non-null   float64
 11  family_history_with_overweight  2111 non-null   object 
 12  FAF                             21

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
count,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0
mean,24.3126,1.701677,86.586058,2.419043,2.685628,2.008011,1.010298,0.657866
std,6.345968,0.093305,26.191172,0.533927,0.778039,0.612953,0.850592,0.608927
min,14.0,1.45,39.0,1.0,1.0,1.0,0.0,0.0
25%,19.947192,1.63,65.473343,2.0,2.658738,1.584812,0.124505,0.0
50%,22.77789,1.700499,83.0,2.385502,3.0,2.0,1.0,0.62535
75%,26.0,1.768464,107.430682,3.0,3.0,2.47742,1.666678,1.0
max,61.0,1.98,173.0,3.0,4.0,3.0,3.0,2.0


In [25]:
# Derived feature: weight divided by squared height
# (assumes Weight is in kg and Height in metres — TODO confirm units).
df["BMI"] = df["Weight"].div(df["Height"].pow(2))

In [26]:
# Numeric columns picked for a quick summary-statistics check, including the
# derived BMI column.
# NOTE(review): FCVC is reported as float64 by df.info() but is not listed
# here — confirm the omission is intentional.
num_features = [
    'Age', 'Height', 'Weight',
    'FAF', 'TUE', 'CH2O', 'NCP',
    'BMI',
]
numeric_view = df[num_features]
numeric_view.describe()

Unnamed: 0,Age,Height,Weight,FAF,TUE,CH2O,NCP,BMI
count,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0
mean,24.3126,1.701677,86.586058,1.010298,0.657866,2.008011,2.685628,29.700159
std,6.345968,0.093305,26.191172,0.850592,0.608927,0.612953,0.778039,8.011337
min,14.0,1.45,39.0,0.0,0.0,1.0,1.0,12.998685
25%,19.947192,1.63,65.473343,0.124505,0.0,1.584812,2.658738,24.325802
50%,22.77789,1.700499,83.0,1.0,0.62535,2.0,3.0,28.719089
75%,26.0,1.768464,107.430682,1.666678,1.0,2.47742,3.0,36.016501
max,61.0,1.98,173.0,3.0,2.0,3.0,4.0,50.811753


In [27]:
# Pairwise correlation matrix over the numeric columns.
# np.number selects every numeric dtype (the same selector used to build
# num_cols), instead of only the two hard-coded widths int64/float64, so a
# column stored as e.g. int32 or float32 is not silently left out.
num_df = df.select_dtypes(include=np.number)
corr = num_df.corr()
corr

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,BMI
Age,1.0,-0.025958,0.20256,0.016291,-0.043944,-0.045304,-0.144938,-0.296931,0.244163
Height,-0.025958,1.0,0.463136,-0.038121,0.243672,0.213376,0.294709,0.051912,0.131785
Weight,0.20256,0.463136,1.0,0.216125,0.107469,0.200575,-0.051436,-0.071561,0.934806
FCVC,0.016291,-0.038121,0.216125,1.0,0.042216,0.068461,0.019939,-0.101135,0.263651
NCP,-0.043944,0.243672,0.107469,0.042216,1.0,0.057088,0.129504,0.036326,0.039969
CH2O,-0.045304,0.213376,0.200575,0.068461,0.057088,1.0,0.167236,0.011965,0.1442
FAF,-0.144938,0.294709,-0.051436,0.019939,0.129504,0.167236,1.0,0.058562,-0.177537
TUE,-0.296931,0.051912,-0.071561,-0.101135,0.036326,0.011965,0.058562,1.0,-0.09972
BMI,0.244163,0.131785,0.934806,0.263651,0.039969,0.1442,-0.177537,-0.09972,1.0


In [34]:
# Split the column names by dtype so each group can get its own transformer.
numeric_columns = df.select_dtypes(include=np.number).columns
non_numeric_columns = df.select_dtypes(exclude=np.number).columns

num_cols = numeric_columns.tolist()
# The target label is non-numeric here and must not be treated as a feature.
cat_cols = non_numeric_columns.drop('NObeyesdad').tolist()


In [29]:
# Separate the target column from the feature matrix.
target_column = "NObeyesdad"
y = df[target_column]
X = df.drop(columns=[target_column])

In [35]:
# Hold out 20% of the rows for testing; stratify on y so class proportions are
# preserved in both parts, and fix the seed so the split is repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [37]:
# Column-wise preprocessing: standardise numeric columns, one-hot encode the
# categorical ones.
numeric_step = ('num', StandardScaler(), num_cols)

# NOTE(review): with drop='first' together with handle_unknown='ignore', an
# unseen category is presumably encoded as all zeros, i.e. the same as the
# dropped first level — confirm against the installed scikit-learn version.
categorical_encoder = OneHotEncoder(handle_unknown='ignore', drop='first')
categorical_step = ('cat', categorical_encoder, cat_cols)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [45]:
# Candidate classifiers keyed by the short name used in the results table.
models = {}
models['LogReg'] = LogisticRegression(random_state=42, max_iter=1000)
models['DT'] = DecisionTreeClassifier(random_state=42, max_depth=5)

In [46]:
# Fit every candidate model inside its own preprocessing pipeline and score it
# on the held-out test split. One record per model is collected in `results`.
results = []
for name, model in models.items():
    # Fitting a pipeline fits its steps in place, so each pipeline receives a
    # fresh, unfitted clone of the shared ColumnTransformer rather than
    # refitting (and overwriting the state of) one shared object.
    pipe = Pipeline([('preprocess', clone(preprocessor)), ('model', model)])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    # average='macro': unweighted mean of the per-class F1 scores.
    f1 = f1_score(y_test, y_pred, average='macro')

    results.append({'Модель': name, 'Accuracy': acc, 'F1_macro': f1})

    print(f"{name}: acc={acc:.4f}, f1={f1:.4f}")

LogReg: acc=0.9102, f1=0.9077
DT: acc=0.9693, f1=0.9681


In [48]:
# Collect the per-model metric records into a table, rounded to 4 decimals.
metrics_table = pd.DataFrame(results)
results_df = metrics_table.round(decimals=4)
results_df

Unnamed: 0,Модель,Accuracy,F1_macro
0,LogReg,0.9102,0.9077
1,DT,0.9693,0.9681
