In [2]:
import pandas as pd

In [3]:
data = pd.read_csv('Walmart_sales.csv')
data

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,05-02-2010,1643690.90,0,42.31,2.572,211.096358,8.106
1,1,12-02-2010,1641957.44,1,38.51,2.548,211.242170,8.106
2,1,19-02-2010,1611968.17,0,39.93,2.514,211.289143,8.106
3,1,26-02-2010,1409727.59,0,46.63,2.561,211.319643,8.106
4,1,05-03-2010,1554806.68,0,46.50,2.625,211.350143,8.106
...,...,...,...,...,...,...,...,...
6430,45,28-09-2012,713173.95,0,64.88,3.997,192.013558,8.684
6431,45,05-10-2012,733455.07,0,64.89,3.985,192.170412,8.667
6432,45,12-10-2012,734464.36,0,54.47,4.000,192.327265,8.667
6433,45,19-10-2012,718125.53,0,56.47,3.969,192.330854,8.667


In [4]:
print(data.isnull().sum())
data.drop(["Date"], axis=1, inplace=True)

Store           0
Date            0
Weekly_Sales    0
Holiday_Flag    0
Temperature     0
Fuel_Price      0
CPI             0
Unemployment    0
dtype: int64


In [5]:
# разделение на признаки и целевую переменную
X = data.drop(["Weekly_Sales"], axis=1)
y = data["Weekly_Sales"]

In [6]:
from sklearn.model_selection import train_test_split

# разделение на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# проверка размеров выборок
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (5148, 6)
X_test shape: (1287, 6)
y_train shape: (5148,)
y_test shape: (1287,)


In [7]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
# создание экземпляра класса KNeighborsRegressor с K=5
knn = KNeighborsRegressor(n_neighbors=5)

# обучение модели
knn.fit(X_train, y_train)

# прогнозирование на тестовой выборке
y_pred = knn.predict(X_test)

# MAE - средняя абсолютная ошибка
mae = mean_absolute_error(y_test, y_pred)
# RMSE - среднеквадратичная ошибка (более чувствителен к наблюдением далеким от среднего)
rmse = mean_squared_error(y_test, y_pred, squared=False)
# MAPE - средняя абсолютная ошибка в процентах
mape = mean_absolute_percentage_error(y_test, y_pred)

print("MAE:", mae)
print("RMSE:", rmse)
print("MAPE:", mape)

MAE: 191597.05669308468
RMSE: 301749.621612451
MAPE: 0.24296767380093


In [11]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold, ShuffleSplit
from sklearn.metrics import mean_absolute_error, make_scorer
knn = KNeighborsRegressor()

# задание сетки гиперпараметров для GridSearchCV
param_grid = {'n_neighbors': [5, 10, 15, 20]}

# задание сетки гиперпараметров для RandomizedSearchCV
param_dist = {'n_neighbors': [5, 10, 15, 20]}

# задание стратегий кросс-валидации
cv_kfold = KFold(n_splits=5, shuffle=True, random_state=42)
cv_shuffle = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

# подбор гиперпараметра K с помощью GridSearchCV и оценка качества с помощью кросс-валидации
grid_search = GridSearchCV(knn, param_grid, scoring=make_scorer(mean_absolute_percentage_error), cv=cv_kfold)
grid_search.fit(X_train, y_train)

print("Best parameters (GridSearchCV):", grid_search.best_params_)
print("MAPE (GridSearchCV):", grid_search.best_score_)

# подбор гиперпараметра K с помощью RandomizedSearchCV и оценка качества с помощью кросс-валидации
random_search = RandomizedSearchCV(knn, param_dist, scoring=make_scorer(mean_absolute_percentage_error), cv=cv_shuffle)
random_search.fit(X_train, y_train)

print("Best parameters (RandomizedSearchCV):", random_search.best_params_)
print("MAPE (RandomizedSearchCV):", random_search.best_score_)

Best parameters (GridSearchCV): {'n_neighbors': 20}
MAPE (GridSearchCV): 0.40404523695799066
Best parameters (RandomizedSearchCV): {'n_neighbors': 20}
MAPE (RandomizedSearchCV): 0.4079354283992271


