![image.png](attachment:image.png)

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import math
from itertools import combinations

In [2]:
df = pd.read_csv("gym_members_exercise_tracking.csv")
df.head()

Unnamed: 0,Age,Gender,Weight (kg),Height (m),Max_BPM,Avg_BPM,Resting_BPM,Session_Duration (hours),Calories_Burned,Workout_Type,Fat_Percentage,Water_Intake (liters),Workout_Frequency (days/week),Experience_Level,BMI
0,56,Male,88.3,1.71,180,157,60,1.69,1313.0,Yoga,12.6,3.5,4,3,30.2
1,46,Female,74.9,1.53,179,151,66,1.3,883.0,HIIT,33.9,2.1,4,2,32.0
2,32,Female,68.1,1.66,167,122,54,1.11,677.0,Cardio,33.4,2.3,4,2,24.71
3,25,Male,53.2,1.7,190,164,56,0.59,532.0,Strength,28.8,2.1,3,1,18.41
4,38,Male,46.1,1.79,188,158,68,0.64,556.0,Strength,29.2,2.8,3,1,14.39


In [3]:
# Кодируем категориальные признаки 
label_encoder = LabelEncoder()
df['Gender'] = label_encoder.fit_transform(df['Gender'])
df['Workout_Type'] = label_encoder.fit_transform(df['Workout_Type'])
df['Experience_Level'] = label_encoder.fit_transform(df['Experience_Level'])

### Масштабирование

In [4]:
numerical_features = ['Age', 'Weight (kg)', 'Height (m)', 'Max_BPM', 'Avg_BPM', 'Resting_BPM', 'Session_Duration (hours)',
                      'Calories_Burned', 'Workout_Type', 'Fat_Percentage', 'Water_Intake (liters)', 'Workout_Frequency (days/week)', 'BMI']

In [5]:
scaler = StandardScaler()
scaler.fit(df[numerical_features]) # Обучаем StandardScaler на numerical_features
df[numerical_features] = scaler.transform(df[numerical_features]) # масштабируем числовые признаки

### Разделяем датасет

In [6]:
X = df.drop(['Experience_Level'], axis=1)
y_experience = df['Experience_Level']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y_experience, test_size=0.2, random_state=42)
print('Train size:', X_train.shape[0])
print('Test_size:',X_test.shape[0])

Train size: 778
Test_size: 195


### Обучение ансамблевых моделей

In [16]:
bagging_model = BaggingRegressor(random_state=52, n_estimators=100, n_jobs=-1)
random_forest_model = RandomForestRegressor(random_state=52, n_estimators=100, n_jobs=-1)
extra_trees_model = ExtraTreesRegressor(random_state=52, n_estimators=100, n_jobs=-1)

adaboost_model = AdaBoostRegressor(random_state=52, n_estimators=100)

gradient_boosting_model = GradientBoostingRegressor(random_state=42, n_estimators=100)

models = {
    "Bagging": bagging_model,
    "Random Forest": random_forest_model,
    "Extra Trees": extra_trees_model,
    "AdaBoost": adaboost_model,
    "Gradient Boosting": gradient_boosting_model
}

results = {}

for name, model in models.items():
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    results[name] = y_pred

### Функция для расчета Somers'D

In [15]:
def calculate_somers_d(y_true, y_pred):
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    n_concordant = 0
    n_discordant = 0
    n_tied_y = 0

    indices = list(combinations(range(len(y_true)), 2))

    for i, j in indices:
        y_true_diff = y_true[i] - y_true[j]
        y_pred_diff = y_pred[i] - y_pred[j]

        if y_true_diff != 0:
            if y_pred_diff != 0:
                if np.sign(y_true_diff) == np.sign(y_pred_diff):
                    n_concordant += 1
                else:
                    n_discordant += 1
        else:
             n_tied_y +=1

    if n_concordant + n_discordant + n_tied_y == 0:
        return 0
    else:
        return (n_concordant - n_discordant) / (n_concordant + n_discordant + n_tied_y)


### Считаем метрики моделей

In [11]:
metrics_results = {}

for name, data in results.items():
    y_pred = data

    # Расчет метрик
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = math.sqrt(mse)
    somers_d = calculate_somers_d(y_test, y_pred)


    metrics_results[name] = {
        'R^2': r2,
        'MAE': mae,
        'MSE': mse,
        'RMSE': rmse,
        'Somers\' D': somers_d,
    }

### Сравниваем метрики

In [12]:
metrics_df = pd.DataFrame(metrics_results).T
metrics_df = metrics_df[['R^2', 'MAE', 'MSE', 'RMSE', 'Somers\' D']]
print(metrics_df.round(4))

                      R^2     MAE     MSE    RMSE  Somers' D
Bagging            0.9038  0.1108  0.0559  0.2364     0.6222
Random Forest      0.9000  0.1128  0.0582  0.2412     0.6194
Extra Trees        0.8942  0.1193  0.0615  0.2480     0.6172
AdaBoost           0.8573  0.1610  0.0830  0.2880     0.6304
Gradient Boosting  0.8973  0.1428  0.0597  0.2443     0.6193
Лучшая модель по R^2, MAE, RMSE: Extra Trees
Лучшая модель по Somers' D: Gradient Boosting


In [14]:
print("Лучшая модель по R^2, MAE, RMSE: Bagging")
print("Лучшая модель по Somers' D: AdaBoost")

Лучшая модель по R^2, MAE, RMSE: Bagging
Лучшая модель по Somers' D: AdaBoost


### Результаты из 4 Лабы

![image.png](attachment:image.png)

### Итоги

Ансамблевые модели показали себя значительно лучше линейной регрессии, SVR и обычного дерева решений