In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE, SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

In [17]:
# Read the spreadsheet, keep only the modelling columns and drop incomplete rows
data = pd.read_excel('data/data_ford_price.xlsx')[
    ['price','year', 'cylinders', 'odometer', 'lat', 'long', 'weather']
].dropna()

# 'price' is the regression target; every remaining column is a feature
X, y = data.drop('price', axis=1), data['price']

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=30,
    test_size=0.3,
)

# Both selectors are asked for three features so their picks can be compared
lr_model = LinearRegression()

# Recursive feature elimination driven by the linear model, one feature per step
rfe_selector = RFE(estimator=lr_model, n_features_to_select=3, step=1).fit(X_train, y_train)
rfe_selected_cols = rfe_selector.get_feature_names_out()

# Univariate selection scored with f_regression
kbest_selector = SelectKBest(score_func=f_regression, k=3).fit(X_train, y_train)
kbest_selected_cols = kbest_selector.get_feature_names_out()

print('RFE selected features:', list(rfe_selected_cols))
print('KBest selected features:', list(kbest_selected_cols))

RFE selected features: ['year', 'cylinders', 'lat']
KBest selected features: ['year', 'cylinders', 'odometer']


In [18]:
# Helper that shows regression metrics (R2 and MAE) as a table
def print_metrics(y_train_pred:list, y_test_pred:list, y_test=y_test, y_train_true=None):
    """Display R2 and MAE for train and test predictions as a DataFrame.

    Args:
        y_train_pred (list): Predictions for the training samples.
        y_test_pred (list): Predictions for the test samples.
        y_test (list, optional): True test targets. Defaults to the
            notebook-level ``y_test`` as it was when this cell was executed.
        y_train_true (list, optional): True training targets. When omitted,
            the notebook-level ``y_train`` is looked up at call time, which
            is what the function always did before this parameter existed.

    Returns:
        None. The table is rendered with ``display`` (available as a builtin
        inside IPython/Jupyter sessions).
    """
    # Fall back to the notebook-level training targets for existing callers.
    if y_train_true is None:
        y_train_true = y_train

    # Use the built-in round() rather than the NumPy-only ``.round`` method:
    # it works whether the metric comes back as a NumPy scalar or as a plain
    # Python float, whereas ``.round`` raises AttributeError on the latter.
    r2_train = round(float(r2_score(y_train_true, y_train_pred)), 3)
    mae_train = round(float(mean_absolute_error(y_train_true, y_train_pred)), 3)

    r2_test = round(float(r2_score(y_test, y_test_pred)), 3)
    mae_test = round(float(mean_absolute_error(y_test, y_test_pred)), 3)

    # One column per sample split, one row per metric.
    metrics_df = pd.DataFrame({
        'Train': [r2_train, mae_train],
        'Test': [r2_test, mae_test]
    }, index=['R2', 'MAE'])

    display(metrics_df)

# Restrict the train/test matrices to the columns chosen by each selector
X_train_rfe, X_test_rfe = X_train[rfe_selected_cols], X_test[rfe_selected_cols]
X_train_kbest, X_test_kbest = X_train[kbest_selected_cols], X_test[kbest_selected_cols]

# Fit on the RFE columns and collect predictions before lr_model is refitted below
y_train_pred_rfe = lr_model.fit(X_train_rfe, y_train).predict(X_train_rfe)
y_test_pred_rfe = lr_model.predict(X_test_rfe)

# Refit the same estimator on the KBest columns; lr_model keeps this fit afterwards
y_train_pred_kbest = lr_model.fit(X_train_kbest, y_train).predict(X_train_kbest)
y_test_pred_kbest = lr_model.predict(X_test_kbest)

# Report both feature subsets one after the other
print('RFE')
print_metrics(y_train_pred_rfe, y_test_pred_rfe)
print('---' * 8)
print('KBest')
print_metrics(y_train_pred_kbest, y_test_pred_kbest)

RFE


Unnamed: 0,Train,Test
R2,0.565,0.4
MAE,5179.883,5252.681


------------------------
KBest


Unnamed: 0,Train,Test
R2,0.606,0.405
MAE,4704.71,4910.266


### Вывод

На тестовой выборке метод `KBest` показал более высокое качество, чем `RFE`.

$R^{2}$ больше на $0.005$, а $\mathrm{MAE}$ меньше на $342.4$.