# Задание

- Выберите набор данных (датасет) для решения задачи классификации или регресии.
- В случае необходимости проведите удаление или заполнение пропусков и кодирование категориальных признаков.
- С использованием метода train_test_split разделите выборку на обучающую и тестовую.
- Обучите следующие ансамблевые модели:
  - две модели группы бэггинга (бэггинг или случайный лес или сверхслучайные деревья);
  - AdaBoost;
  - градиентный бустинг.
- Оцените качество моделей с помощью одной из подходящих для задачи метрик.Сравните качество полученных моделей.

In [5]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set(style="ticks")

In [6]:
data = pd.read_csv('Car_sales.csv')

In [7]:
data.shape, data.Price_in_thousands.shape

((157, 16), (157,))

In [8]:
data.dtypes

Manufacturer            object
Model                   object
Sales_in_thousands     float64
__year_resale_value    float64
Vehicle_type            object
Price_in_thousands     float64
Engine_size            float64
Horsepower             float64
Wheelbase              float64
Width                  float64
Length                 float64
Curb_weight            float64
Fuel_capacity          float64
Fuel_efficiency        float64
Latest_Launch           object
Power_perf_factor      float64
dtype: object

In [9]:
data.isnull().sum()

Manufacturer            0
Model                   0
Sales_in_thousands      0
__year_resale_value    36
Vehicle_type            0
Price_in_thousands      2
Engine_size             1
Horsepower              1
Wheelbase               1
Width                   1
Length                  1
Curb_weight             2
Fuel_capacity           1
Fuel_efficiency         3
Latest_Launch           0
Power_perf_factor       2
dtype: int64

In [10]:
data = data.dropna(axis=0, how='any')
(data.shape, data.Price_in_thousands.shape)

((117, 16), (117,))

In [11]:
data.head()


Unnamed: 0,Manufacturer,Model,Sales_in_thousands,__year_resale_value,Vehicle_type,Price_in_thousands,Engine_size,Horsepower,Wheelbase,Width,Length,Curb_weight,Fuel_capacity,Fuel_efficiency,Latest_Launch,Power_perf_factor
0,Acura,Integra,16.919,16.36,Passenger,21.5,1.8,140.0,101.2,67.3,172.4,2.639,13.2,28.0,2/2/2012,58.28015
1,Acura,TL,39.384,19.875,Passenger,28.4,3.2,225.0,108.1,70.3,192.9,3.517,17.2,25.0,6/3/2011,91.370778
3,Acura,RL,8.588,29.725,Passenger,42.0,3.5,210.0,114.6,71.4,196.6,3.85,18.0,22.0,3/10/2011,91.389779
4,Audi,A4,20.397,22.255,Passenger,23.99,1.8,150.0,102.6,68.2,178.0,2.998,16.4,27.0,10/8/2011,62.777639
5,Audi,A6,18.78,23.555,Passenger,33.95,2.8,200.0,108.7,76.1,192.0,3.561,18.5,22.0,8/9/2011,84.565105


### Кодирование категориальных признаков

In [12]:
Vehicle_type_dict = {'Passenger': 0, 'Car': 1}
# ['Acura', 'Audi', 'BMW', 'Buick', 'Cadillac', 'Chevrolet',
#        'Chrysler', 'Dodge', 'Ford', 'Honda', 'Hyundai', 'Infiniti',
#        'Jeep', 'Lexus', 'Lincoln', 'Mitsubishi', 'Mercury', 'Mercedes-B',
#        'Nissan', 'Oldsmobile', 'Plymouth', 'Pontiac', 'Porsche', 'Saturn',
#        'Toyota', 'Volkswagen']
Manufacturer_type_dict  = {
    'Acure': 0, 'Audi': 1, 'BMW': 2, 'Buick': 3, 'Cadillac': 4, 'Chevrolet': 5,
    'Chrysler': 6, 'Dodge': 7, 'Ford': 8, 'Honda': 9, 'Hyundai': 10, 'Infiniti': 11,
    'Jeep': 12, 'Lexus': 13, 'Lincoln': 14, 'Mitsubishi': 15, 'Mercury': 16, 'Mercedes-B': 17,
    'Nissan': 18, 'Oldsmobile': 19, 'Plymouth': 20, 'Pontiac': 21, 'Porsche': 22, 'Saturn': 23,
    'Toyota': 24, 'Volkswagen': 25
}
data['Vehicle_type'] = data['Vehicle_type'].map(Vehicle_type_dict)
data['Manufacturer'] = data['Manufacturer'].map(Manufacturer_type_dict)
data = data.dropna(axis=0, how='any')
df_encoded = data.drop(columns=[ 'Model', 'Vehicle_type', 'Latest_Launch', '__year_resale_value', 'Power_perf_factor'])
print(df_encoded)

     Manufacturer  Sales_in_thousands  Price_in_thousands  Engine_size  \
4             1.0              20.397               23.99          1.8   
5             1.0              18.780               33.95          2.8   
6             1.0               1.380               62.00          4.2   
8             2.0               9.231               33.40          2.8   
9             2.0              17.527               38.90          2.8   
..            ...                 ...                 ...          ...   
145          25.0               9.761               14.90          2.0   
146          25.0              83.721               16.70          2.0   
147          25.0              51.102               21.20          1.8   
148          25.0               9.569               19.99          2.0   
149          25.0               5.596               17.50          2.0   

     Horsepower  Wheelbase  Width  Length  Curb_weight  Fuel_capacity  \
4         150.0      102.6   68.2   17

### Разделим выборку на обучающую и тестовую

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X = df_encoded.drop("Price_in_thousands", axis=1)
y = df_encoded["Price_in_thousands"]

In [15]:
print(X.head(), "\n")
print(y.head())

   Manufacturer  Sales_in_thousands  Engine_size  Horsepower  Wheelbase  \
4           1.0              20.397          1.8       150.0      102.6   
5           1.0              18.780          2.8       200.0      108.7   
6           1.0               1.380          4.2       310.0      113.0   
8           2.0               9.231          2.8       193.0      107.3   
9           2.0              17.527          2.8       193.0      111.4   

   Width  Length  Curb_weight  Fuel_capacity  Fuel_efficiency  
4   68.2   178.0        2.998           16.4             27.0  
5   76.1   192.0        3.561           18.5             22.0  
6   74.0   198.2        3.902           23.7             21.0  
8   68.5   176.0        3.197           16.6             24.0  
9   70.9   188.0        3.472           18.5             25.0   

4    23.99
5    33.95
6    62.00
8    33.40
9    38.90
Name: Price_in_thousands, dtype: float64


In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=3)

In [17]:
X_train.shape, y_train.shape,

((85, 10), (85,))

In [18]:
X_test.shape, y_test.shape

((29, 10), (29,))

### Обучение моделей

In [19]:
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [20]:
bagging_regressor = BaggingRegressor()
bagging_regressor.fit(X_train, y_train)

In [21]:
random_forest_regressor = RandomForestRegressor()
random_forest_regressor.fit(X_train, y_train)

In [22]:
ada_boost_regressor = AdaBoostRegressor()
ada_boost_regressor.fit(X_train, y_train)

In [23]:
gradient_boosting_regressor = GradientBoostingRegressor()
gradient_boosting_regressor.fit(X_train, y_train)

In [24]:
bagging_pred = bagging_regressor.predict(X_test)
print(bagging_pred)

[16.6631 21.0245 32.3974 15.1034 26.332  28.5821 13.2256 14.0775 30.3606
 21.8373 19.8139 13.0058 22.6653 20.8429 16.8494 17.162  28.2425 15.6656
 15.6125 29.6958 13.9751 28.2109 12.4666 23.7904 30.9978 47.8763 14.0963
 20.9205 23.9132]


In [25]:
random_forest_pred = random_forest_regressor.predict(X_test)
print(random_forest_pred)

[16.96608 21.65924 31.9114  13.26205 25.37654 30.35275 12.59408 13.08332
 31.78131 20.73595 22.09508 13.48998 20.76987 20.17837 16.61806 17.96757
 29.60447 15.26387 15.87987 29.73085 13.56286 29.26601 12.40249 21.18563
 30.67688 42.20295 13.14045 19.99438 24.23338]


In [26]:
ada_boost_pred = ada_boost_regressor.predict(X_test)
print(ada_boost_pred)

[17.32875    24.61983333 30.98789189 15.33828571 26.05996875 32.53069388
 14.3031     15.33828571 30.98789189 22.31560606 20.70016667 14.3031
 21.03375    21.042      17.27113043 17.4945     27.58981818 16.59505882
 17.27113043 30.1401     14.4424     32.53069388 14.3031     24.23315
 32.53069388 41.14       15.33828571 23.60003571 24.61983333]


In [27]:
gradient_boosting_pred = gradient_boosting_regressor.predict(X_test)
print(gradient_boosting_pred)

[15.6351854  23.77923314 31.33207533 13.13440987 25.68217876 34.46813165
 13.68556029 12.81050353 29.14734845 19.80144071 22.28471197 13.62039386
 19.11444688 20.06882514 15.80717878 16.97575516 30.55903661 15.32917724
 15.33791425 29.95883928 13.50166152 30.80957968 12.01164197 20.0252809
 31.99776275 43.1698711  13.75350873 19.20789889 22.59031797]


In [28]:
bagging_mse = mean_squared_error(y_test, bagging_pred)
random_forest_mse = mean_squared_error(y_test, random_forest_pred)
ada_boost_mse = mean_squared_error(y_test, ada_boost_pred)
gradient_boosting_mse = mean_squared_error(y_test, gradient_boosting_pred)

In [29]:
bagging_r2_score = r2_score(y_test, bagging_pred)
random_forest_r2_score = r2_score(y_test, random_forest_pred)
ada_boost_r2_score = r2_score(y_test, ada_boost_pred)
gradient_boosting_r2_score = r2_score(y_test, gradient_boosting_pred)

In [30]:
print("Bagging MSE:", bagging_mse)
print("Random Forest MSE:", random_forest_mse)
print("AdaBoost MSE:", ada_boost_mse)
print("Gradient Boosting MSE:", gradient_boosting_mse)

Bagging MSE: 34.44663696241378
Random Forest MSE: 28.8095443420414
AdaBoost MSE: 24.92749606750022
Gradient Boosting MSE: 22.376454227176055


In [31]:
print("Bagging r2_score:", bagging_r2_score)
print("Random Forest r2_score:", random_forest_r2_score)
print("AdaBoost r2_score:", ada_boost_r2_score)
print("Gradient Boosting r2_score:", gradient_boosting_r2_score)

Bagging r2_score: 0.6737417447337106
Random Forest r2_score: 0.7271329656271739
AdaBoost r2_score: 0.7639014402475933
Gradient Boosting r2_score: 0.7880634059234769


## Часть 2
### Задание

- Обучите следующие ансамблевые модели:
  - одну из моделей группы стекинга.
  - модель многослойного персептрона. По желанию, вместо библиотеки scikit-learn возможно использование библиотек TensorFlow, PyTorch или других аналогичных библиотек.
  - двумя методами на выбор из семейства МГУА (один из линейных методов COMBI / MULTI + один из нелинейных методов MIA / RIA) с использованием библиотеки gmdh.
- Оцените качество моделей с помощью одной из подходящих для задачи метрик. Сравните качество полученных моделей.

Стекинг

In [32]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import RandomForestRegressor

In [33]:
base_models = [
    ('linear_regression', LinearRegression()),
    ('gradient_boosting', GradientBoostingRegressor()),
    ('random_forest', RandomForestRegressor())
]

In [34]:
meta_model = LinearRegression()

In [35]:
stacking_regressor = StackingRegressor(estimators=base_models, final_estimator=meta_model)

In [36]:
stacking_regressor.fit(X_train, y_train)

In [37]:
stacking_pred = stacking_regressor.predict(X_test)

In [38]:
stacking_mse = mean_squared_error(y_test, stacking_pred)
print("Stacking Regressor MSE:", stacking_mse)

Stacking Regressor MSE: 15.969649135696319


In [39]:
stacking_r2_score = r2_score(y_test, stacking_pred)
print("Stacking Regressor R²:", stacking_r2_score)

Stacking Regressor R²: 0.8487448899608925


### Модель многослойного персептрона (MLP)

In [40]:
from sklearn.neural_network import MLPRegressor

In [41]:
mlp_regressor = MLPRegressor(hidden_layer_sizes=(100, 50), activation='relu', solver='adam', random_state=1)

In [42]:
mlp_regressor.fit(X_train, y_train)

In [43]:
mlp_pred = mlp_regressor.predict(X_test)

In [44]:
mlp_mse = mean_squared_error(y_test, mlp_pred)
print("MLP Regressor MSE:", mlp_mse)

MLP Regressor MSE: 155.19798239355126


In [45]:
mlp_r2_score = r2_score(y_test, mlp_pred)
print("MLP Regressor R²:", mlp_r2_score)

MLP Regressor R²: -0.46994387323842246


### МГУА 

In [51]:
from gmdh import Combi, split_data