In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import time
import warnings
warnings.filterwarnings('ignore')
# Load the raw navigation dataset from the notebook's working directory.
df = pd.read_csv('humanoid_robot_navigation_dataset.csv')

# Every later cell reads `df_scaled`, but nothing in this notebook creates it,
# so "Restart Kernel -> Run All" stopped right here with a NameError.
# NOTE(review): the name suggests a scaling step that is no longer part of the
# notebook; until that step is restored, `df_scaled` is an unscaled copy of
# `df` - confirm whether real scaling is required.
df_scaled = df.copy()

print(f"Размер датасета: {df_scaled.shape}")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df_scaled.info()




In [None]:

# Predictors and target used by the linear-regression baseline.
features = ['reward_score', 'navigation_efficiency', 'path_length', 'collisions']
X = df_scaled[features]
y = df_scaled['overall_performance_score']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Данные: {X.shape}")
print(f"Признаки: {features}")

# Model training (fit() returns the fitted estimator itself).
lr_model = LinearRegression().fit(X_train, y_train)

# Predictions and error metrics on the held-out rows.
y_pred = lr_model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

# Report the metrics in the same order and format for every line.
for metric_name, metric_value in (
    ("R² Score", r2),
    ("RMSE", rmse),
    ("MAE", mae),
    ("MSE", mse),
):
    print(f"{metric_name}: {metric_value:.4f}")


fig, axes = plt.subplots(1, 2, figsize=(15, 6))
ax_fit, ax_coef = axes

# Left panel: actual vs. predicted values, with the y = x line as a reference
# for a perfect prediction.
target_lo, target_hi = y_test.min(), y_test.max()
ax_fit.scatter(y_test, y_pred, alpha=0.6)
ax_fit.plot([target_lo, target_hi], [target_lo, target_hi], 'r--', lw=2)
ax_fit.set_xlabel('Фактические значения')
ax_fit.set_ylabel('Предсказанные значения')
ax_fit.set_title('Linear Regression: Фактические и Предсказанные')
ax_fit.grid(True, alpha=0.3)

# Right panel: features ranked by the absolute value of their fitted coefficient.
coefficients = pd.DataFrame({
    'Признак': features,
    'Коэффициент': lr_model.coef_,
    'Абс.Значение': np.abs(lr_model.coef_),
})
coefficients = coefficients.sort_values('Абс.Значение', ascending=False)

# Second absolute-value column; this is the one the bar chart reads.
coefficients['Абсолютное_значение'] = coefficients['Коэффициент'].abs()
ax_coef.barh(
    coefficients['Признак'],
    coefficients['Абсолютное_значение'],
    color='skyblue',
)
ax_coef.set_title('Влияние признаков на производительность\n(Абсолютные значения коэффициентов)')
ax_coef.set_xlabel('Абсолютное значение коэффициента')

plt.tight_layout()
plt.show()



# Experiment 1: refit the linear model for several train/test proportions and
# record the test R² together with the size of each partition.
test_sizes = [0.1, 0.2, 0.3, 0.4]
lr_r2_scores, lr_train_samples, lr_test_samples = [], [], []

for size in test_sizes:
    split_parts = train_test_split(X, y, test_size=size, random_state=42)
    X_train_exp, X_test_exp, y_train_exp, y_test_exp = split_parts

    lr_exp = LinearRegression().fit(X_train_exp, y_train_exp)
    r2_exp = r2_score(y_test_exp, lr_exp.predict(X_test_exp))

    lr_r2_scores.append(r2_exp)
    lr_train_samples.append(len(X_train_exp))
    lr_test_samples.append(len(X_test_exp))

    print(f"  test_size = {size}: R² = {r2_exp:.4f}")

# Experiment 2: refit on progressively larger feature subsets with a fixed
# 80/20 split and record the test R² per subset.
feature_combinations_lr = {
    'Только collisions': ['collisions'],
    'collisions + navigation': ['collisions', 'navigation_efficiency'],
    'Все 4 признака': features,
}

lr_feature_results = {}
for name, feat_list in feature_combinations_lr.items():
    X_subset = X[feat_list]
    split_parts = train_test_split(X_subset, y, test_size=0.2, random_state=42)
    X_train_f, X_test_f, y_train_f, y_test_f = split_parts

    lr_exp = LinearRegression().fit(X_train_f, y_train_f)
    r2_f = r2_score(y_test_f, lr_exp.predict(X_test_f))
    lr_feature_results[name] = r2_f

    print(f"  {name}: R² = {r2_f:.4f}")

# Visualisation of the two linear-regression experiments.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Chart 1: R² for each train/test proportion, labelled as "train%/test%".
split_labels = [f"{int((1-size)*100)}/{int(size*100)}" for size in test_sizes]
bars1 = ax1.bar(
    split_labels,
    lr_r2_scores,
    color=['lightblue', 'lightgreen', 'gold', 'salmon'],
)
ax1.set_xlabel('Соотношение Train/Test', fontsize=12)
ax1.set_ylabel('R² Score', fontsize=12)
ax1.set_title('Linear Regression: Качество при разных\nсоотношениях Train/Test',
              fontsize=14, fontweight='bold')
# Zoom the y-axis onto the observed score range so small differences are visible.
ax1.set_ylim(min(lr_r2_scores) - 0.005, max(lr_r2_scores) + 0.005)
ax1.grid(True, alpha=0.3, axis='y')

# Chart 2: R² for each feature subset.
x_labels_feat = list(lr_feature_results.keys())
scores_feat = list(lr_feature_results.values())
bars2 = ax2.bar(
    x_labels_feat,
    scores_feat,
    color=['lightcoral', 'lightseagreen', 'plum'],
)
ax2.set_xlabel('Набор признаков', fontsize=12)
ax2.set_ylabel('R² Score', fontsize=12)
ax2.set_title('Linear Regression: Качество при разных\nнаборах признаков',
              fontsize=14, fontweight='bold')
ax2.set_ylim(min(scores_feat) - 0.05, max(scores_feat) + 0.05)
ax2.tick_params(axis='x', rotation=45)
ax2.grid(True, alpha=0.3, axis='y')

# Write each score just above its bar, on both charts.
for panel, bar_group, group_scores in (
    (ax1, bars1, lr_r2_scores),
    (ax2, bars2, scores_feat),
):
    for bar, score in zip(bar_group, group_scores):
        height = bar.get_height()
        panel.text(bar.get_x() + bar.get_width()/2., height + 0.001,
                   f'{score:.4f}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:


# Data preparation for the Random Forest cell (same features, target and
# 80/20 split as the linear-regression cell, so results are comparable).
features = ['reward_score', 'navigation_efficiency', 'path_length', 'collisions']
X = df_scaled[features]
y = df_scaled['overall_performance_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Baseline: a 100-tree forest with otherwise default settings.
rf_base = RandomForestRegressor(n_estimators=100, random_state=42)
rf_base.fit(X_train, y_train)

y_pred_base = rf_base.predict(X_test)
r2_base = r2_score(y_test, y_pred_base)

print(f"Качество: R² = {r2_base:.4f}")

# Hyper-parameter search: exhaustive grid, 5-fold cross-validation on the
# training part only, scored by R².
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

print(f"Лучшие параметры: {grid_search.best_params_}")
# The line is labelled "best quality", so report the cross-validated score;
# previously it printed the estimator object's repr instead.
print(f"Лучшее качество: {grid_search.best_score_:.4f}")

# Evaluate the refit best estimator on the held-out test rows.
rf_tuned = grid_search.best_estimator_
y_pred_tuned = rf_tuned.predict(X_test)
r2_tuned = r2_score(y_test, y_pred_tuned)


print(f"ДО:   R² = {r2_base:.4f}")
print(f"ПОСЛЕ: R² = {r2_tuned:.4f}")
# The "+" format flag prints the real sign; a hard-coded "+" produced "+-0.0012"
# whenever tuning made the test score worse.
print(f"УЛУЧШЕНИЕ:       {r2_tuned - r2_base:+.4f}")


# Visualisation of the tuned Random Forest.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Actual vs. predicted values. Both artists carry a label so that legend() has
# entries to show; before, no artist was labelled and the legend was empty.
ax1.scatter(y_test, y_pred_tuned, alpha=0.7, color='blue', s=50,
            label='Предсказания')
ax1.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()],
         'r--', lw=2, label='Идеальное предсказание')
ax1.set_xlabel('Фактическая производительность', fontsize=12)
ax1.set_ylabel('Предсказанная производительность', fontsize=12)
ax1.set_title('Сравнение фактических и предсказанных значений\nRandom Forest',
              fontsize=14, fontweight='bold')
ax1.grid(True, alpha=0.3)
# Lower-right placement keeps the legend away from the R² box in the top-left corner.
ax1.legend(loc='lower right')

ax1.text(0.05, 0.95, f'R² = {r2_tuned:.4f}', transform=ax1.transAxes,
         fontsize=12, verticalalignment='top', bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

# Feature importances of the tuned forest, most important first.
feature_importance = pd.DataFrame({
    'Признак': features,
    'Важность': rf_tuned.feature_importances_
}).sort_values('Важность', ascending=False)

colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']
bars = ax2.barh(feature_importance['Признак'], feature_importance['Важность'],
                color=colors, alpha=0.8)
ax2.set_xlabel('Важность признака', fontsize=12)
ax2.set_title('Влияние параметров роботов на производительность',
              fontsize=14, fontweight='bold')
ax2.grid(True, alpha=0.3, axis='x')

# Print each importance value to the right of its bar.
for bar, importance in zip(bars, feature_importance['Важность']):
    ax2.text(bar.get_width() + 0.01, bar.get_y() + bar.get_height()/2,
             f'{importance:.3f}', ha='left', va='center', fontweight='bold')

plt.tight_layout()
plt.show()



# Experiment 1: different test-set sizes.
# Every forest below is seeded with the same random_state as the grid-search
# estimator. Unseeded forests draw fresh bootstrap samples on each run, so the
# reported scores changed between executions and could disagree with the tuned
# model evaluated on the same 80/20 split.
print("Разные размеры тестовой выборки:")
test_sizes = [0.1, 0.2, 0.3, 0.4]
r2_scores = []
train_samples = []
test_samples = []

for size in test_sizes:
    X_train_exp, X_test_exp, y_train_exp, y_test_exp = train_test_split(
        X, y, test_size=size, random_state=42
    )
    rf_exp = RandomForestRegressor(random_state=42, **grid_search.best_params_)
    rf_exp.fit(X_train_exp, y_train_exp)
    r2_exp = r2_score(y_test_exp, rf_exp.predict(X_test_exp))

    r2_scores.append(r2_exp)
    train_samples.append(len(X_train_exp))
    test_samples.append(len(X_test_exp))

    print(f"  test_size = {size}: R² = {r2_exp:.4f}")

# Experiment 2: different feature subsets on a fixed 80/20 split.
print("\n2. РАЗНЫЕ НАБОРЫ ПРИЗНАКОВ:")
feature_combinations = {
    'Только collisions': ['collisions'],
    'collisions + navigation': ['collisions', 'navigation_efficiency'],
    'Все 4 признака': features
}

rf_feature_results = {}
for name, feat_list in feature_combinations.items():
    X_subset = X[feat_list]
    X_train_f, X_test_f, y_train_f, y_test_f = train_test_split(
        X_subset, y, test_size=0.2, random_state=42
    )
    rf_exp = RandomForestRegressor(random_state=42, **grid_search.best_params_)
    rf_exp.fit(X_train_f, y_train_f)
    r2_f = r2_score(y_test_f, rf_exp.predict(X_test_f))
    rf_feature_results[name] = r2_f
    print(f"  {name}: R² = {r2_f:.4f}")


fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Chart 1: R² for each train/test proportion, labelled as "train%/test%".
# round() instead of int() so a product such as 28.999999999999996 cannot be
# truncated to the wrong percentage if other test sizes are tried.
bars1 = ax1.bar([f"{round((1-size)*100)}/{round(size*100)}" for size in test_sizes],
                r2_scores, color=['lightblue', 'lightgreen', 'gold', 'salmon'])
ax1.set_xlabel('Соотношение Train/Test', fontsize=12)
ax1.set_ylabel('R² Score', fontsize=12)
ax1.set_title('Качество модели при разных соотношениях Train/Test',
              fontsize=14, fontweight='bold')
# Limits follow the data, as in the linear-regression charts. The former fixed
# (0.98, 1.0) window hid any bar whose score fell below 0.98 and left no room
# for the value labels of scores close to 1.0.
ax1.set_ylim(min(r2_scores) - 0.005, max(r2_scores) + 0.005)
ax1.grid(True, alpha=0.3, axis='y')

for bar, score in zip(bars1, r2_scores):
    height = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width()/2., height + 0.001,
             f'{score:.4f}', ha='center', va='bottom', fontweight='bold')

# Chart 2: R² for each feature subset.
x_labels_feat = list(rf_feature_results.keys())
scores_feat = list(rf_feature_results.values())
bars2 = ax2.bar(x_labels_feat, scores_feat, color=['lightcoral', 'lightseagreen', 'plum'])
ax2.set_xlabel('Набор признаков', fontsize=12)
ax2.set_ylabel('R² Score', fontsize=12)
ax2.set_title('Random Forest: Качество при разных\nнаборах признаков',
              fontsize=14, fontweight='bold')
# Data-driven limits: a weak subset can score below the former fixed lower
# bound of 0.1 (R² may even be negative), which pushed its bar out of view.
ax2.set_ylim(min(scores_feat) - 0.05, max(scores_feat) + 0.05)
ax2.tick_params(axis='x', rotation=45)
ax2.grid(True, alpha=0.3, axis='y')

for bar, score in zip(bars2, scores_feat):
    height = bar.get_height()
    ax2.text(bar.get_x() + bar.get_width()/2., height + 0.001,
             f'{score:.4f}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:


# Data preparation for the Gradient Boosting cell: same predictors, target and
# seeded 80/20 split as the other model cells.
features = ['reward_score', 'navigation_efficiency', 'path_length', 'collisions']
X = df_scaled[features]
y = df_scaled['overall_performance_score']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Baseline: 100 boosting stages, everything else at library defaults
# (fit() returns the fitted estimator itself).
gb_base = GradientBoostingRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

y_pred_base = gb_base.predict(X_test)
r2_base = r2_score(y_test, y_pred_base)

print(f"Качество ДО: R² = {r2_base:.4f}")

# Hyper-parameter search: exhaustive grid (3 values for each of 5 parameters),
# 5-fold cross-validation on the training part, scored by R², all cores.
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'subsample': [0.8, 0.9, 1.0],
}

grid_search = GridSearchCV(
    estimator=GradientBoostingRegressor(random_state=42),
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

best_gb_params = grid_search.best_params_
best_gb_cv_score = grid_search.best_score_
print(f"Лучшие параметры: {best_gb_params}")
print(f"Лучшее качество на кросс-валидации: {best_gb_cv_score:.4f}")

# Evaluate the refit best estimator on the held-out test rows.
gb_tuned = grid_search.best_estimator_
y_pred_tuned = gb_tuned.predict(X_test)
r2_tuned = r2_score(y_test, y_pred_tuned)

print(f"ДО:   R² = {r2_base:.4f}")
print(f"ПОСЛЕ: R² = {r2_tuned:.4f}")
# The "+" format flag prints the real sign; a hard-coded "+" produced "+-0.0012"
# whenever tuning made the test score worse.
print(f"УЛУЧШЕНИЕ:       {r2_tuned - r2_base:+.4f}")

# Feature importances of the tuned booster, most important first.
# They are computed BEFORE being printed: the print used to run ahead of this
# assignment, so it showed whatever `feature_importance` an earlier cell had
# left behind (the Random Forest table) or failed with a NameError when this
# cell was run on its own.
feature_importance = pd.DataFrame({
    'Признак': features,
    'Важность': gb_tuned.feature_importances_
}).sort_values('Важность', ascending=False)

print(feature_importance)


fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Actual vs. predicted values. Both artists carry a label so that legend() has
# entries to show; before, no artist was labelled and the legend was empty.
ax1.scatter(y_test, y_pred_tuned, alpha=0.7, color='blue', s=50,
            label='Предсказания')
ax1.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()],
         'r--', lw=2, label='Идеальное предсказание')
ax1.set_xlabel('Фактическая производительность', fontsize=12)
ax1.set_ylabel('Предсказанная производительность', fontsize=12)
ax1.set_title('Gradient Boosting: Фактические и Предсказанные значения',
              fontsize=14, fontweight='bold')
ax1.grid(True, alpha=0.3)
# Lower-right placement keeps the legend away from the R² box in the top-left corner.
ax1.legend(loc='lower right')

ax1.text(0.05, 0.95, f'R² = {r2_tuned:.4f}', transform=ax1.transAxes,
         fontsize=12, verticalalignment='top', bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

# Feature-importance bar chart.
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']
bars = ax2.barh(feature_importance['Признак'], feature_importance['Важность'],
                color=colors, alpha=0.8)
ax2.set_xlabel('Важность признака', fontsize=12)
ax2.set_title('Gradient Boosting: Влияние параметров на производительность',
              fontsize=14, fontweight='bold')
ax2.grid(True, alpha=0.3, axis='x')

# Print each importance value to the right of its bar.
for bar, importance in zip(bars, feature_importance['Важность']):
    ax2.text(bar.get_width() + 0.01, bar.get_y() + bar.get_height()/2,
             f'{importance:.3f}', ha='left', va='center', fontweight='bold')

plt.tight_layout()
plt.show()



# Experiment 1: different train/test proportions.
# Every booster below is seeded with the same random_state as the grid-search
# estimator; unseeded re-fits are not guaranteed to reproduce between runs
# (row subsampling is random when the chosen subsample is below 1.0) and could
# disagree with the tuned model evaluated on the same 80/20 split.
print("1. РАЗНЫЕ СООТНОШЕНИЯ TRAIN/TEST:")
test_sizes = [0.1, 0.2, 0.3, 0.4]
gb_size_results = []

for size in test_sizes:
    X_train_exp, X_test_exp, y_train_exp, y_test_exp = train_test_split(
        X, y, test_size=size, random_state=42
    )
    gb_exp = GradientBoostingRegressor(random_state=42, **grid_search.best_params_)
    gb_exp.fit(X_train_exp, y_train_exp)
    r2_exp = r2_score(y_test_exp, gb_exp.predict(X_test_exp))
    gb_size_results.append(r2_exp)
    print(f"  test_size = {size}: R² = {r2_exp:.4f}")


# Experiment 2: different feature subsets on a fixed 80/20 split.
print("\n2. РАЗНЫЕ НАБОРЫ ПРИЗНАКОВ:")
feature_combinations = {
    'Только collisions': ['collisions'],
    'collisions + navigation': ['collisions', 'navigation_efficiency'],
    'Все 4 признака': features
}

gb_feature_results = {}
for name, feat_list in feature_combinations.items():
    X_subset = X[feat_list]
    X_train_f, X_test_f, y_train_f, y_test_f = train_test_split(
        X_subset, y, test_size=0.2, random_state=42
    )
    gb_exp = GradientBoostingRegressor(random_state=42, **grid_search.best_params_)
    gb_exp.fit(X_train_f, y_train_f)
    r2_f = r2_score(y_test_f, gb_exp.predict(X_test_f))
    gb_feature_results[name] = r2_f
    print(f"  {name}: R² = {r2_f:.4f}")


fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Chart 1: R² for each train/test proportion, labelled as "train%/test%".
# round() instead of int() so a product such as 28.999999999999996 cannot be
# truncated to the wrong percentage if other test sizes are tried.
x_labels = [f"{round((1-size)*100)}/{round(size*100)}" for size in test_sizes]
bars1 = ax1.bar(x_labels, gb_size_results, color=['lightblue', 'lightgreen', 'gold', 'salmon'])
ax1.set_xlabel('Соотношение Train/Test', fontsize=12)
ax1.set_ylabel('R² Score', fontsize=12)
ax1.set_title('Gradient Boosting: Качество при разных\nсоотношениях Train/Test',
              fontsize=14, fontweight='bold')
# Limits follow the data, as in the linear-regression charts. The former fixed
# (0.98, 1.0) window hid any bar whose score fell below 0.98 and left no room
# for the value labels of scores close to 1.0.
ax1.set_ylim(min(gb_size_results) - 0.005, max(gb_size_results) + 0.005)
ax1.grid(True, alpha=0.3, axis='y')

for bar, score in zip(bars1, gb_size_results):
    height = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width()/2., height + 0.001,
             f'{score:.4f}', ha='center', va='bottom', fontweight='bold')

# Chart 2: R² for each feature subset.
x_labels_feat = list(gb_feature_results.keys())
scores_feat = list(gb_feature_results.values())
bars2 = ax2.bar(x_labels_feat, scores_feat, color=['lightcoral', 'lightseagreen', 'plum'])
ax2.set_xlabel('Набор признаков', fontsize=12)
ax2.set_ylabel('R² Score', fontsize=12)
ax2.set_title('Gradient Boosting: Качество при разных\nнаборах признаков',
              fontsize=14, fontweight='bold')
# Data-driven limits: a weak subset can score below the former fixed lower
# bound of 0.1 (R² may even be negative), which pushed its bar out of view.
ax2.set_ylim(min(scores_feat) - 0.05, max(scores_feat) + 0.05)
ax2.tick_params(axis='x', rotation=45)
ax2.grid(True, alpha=0.3, axis='y')

for bar, score in zip(bars2, scores_feat):
    height = bar.get_height()
    ax2.text(bar.get_x() + bar.get_width()/2., height + 0.001,
             f'{score:.4f}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()





