In [1]:
import optuna
from optuna.trial import TrialState

import pandas as pd
import torch
import datetime

from torch.utils.data import Dataset, DataLoader, random_split
from torch import nn
from torch import optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import utils

import warnings
warnings.filterwarnings('ignore')

In [2]:
device = (torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu'))

# Предобработка данных

In [3]:
class MyDataset(Dataset):
    def __init__(self, x: torch.Tensor, y: torch.tensor):
        self.x = x
        self.y = y

    def __getitem__(self, index):
        data = self.x[index]
        label = self.y[index]
        return data, label

    def __len__(self):
        return len(self.x)

In [4]:
dataframe = pd.read_csv("house_price.csv")
OUTPUT_COLUMN = 'Цена'

# Категориальные признаки уже закодированы, однако всё еще необходимо удалить по одному столбцу для каждого закодированного признака
feature_groups = [['Без.ремонта', 'Дизайнерский', 'Евроремонт', 'Косметический'],
                  ['Балкон', 'Лоджия'], 
                  ['Железобетонные.перекрытия', 'Смешанные.перекрытия','Деревянные.перекрытия', 'Иные.перекрытия'],
                  ['Панельные.стены', 'Блочные.стены', 'Деревянные.стены', 'Кирпичные.стены', 'Монолитные.стены', 'Смешанные.стены']]
to_drop = [group[-1] for group in feature_groups]

feature_groups = [group[:-1] for group in feature_groups]
numeric = ['Количество.комнат', 'Общая.площадь', 'Этаж', 'Этажей.в.доме','Школа.1000', 'ВУЗ.1000',
       'ТЦ.1000', 'Стоматология.1000', 'Почта.1000', 'Поликлиника.1000',
       'Парк.1000', 'Остановка.1000', 'Одежда..1000', 'Супер3кет.1000',
       'Кинотеатр.1000', 'Кафе.1000', 'АЗС.1000', 'Детский.сад.1000',
       'Бар.1000', 'Банк.1000', 'Аптека.1000']



# Сохраняем оставшиеся категориальные данные
categorical = []

[categorical.extend(group) for group in feature_groups]

dataframe = utils.preprocess_df(dataframe, OUTPUT_COLUMN, categorical, numeric, to_drop=to_drop)
dataframe.head(5)

Unnamed: 0,id,Количество.комнат,Студия,Общая.площадь,Этаж,Этажей.в.доме,Парковка,Совмещенный.санузел,Раздельный.санузел,Лифт,...,Балкон_3,Балкон_4,Железобетонные.перекрытия_1,Смешанные.перекрытия_1,Деревянные.перекрытия_1,Панельные.стены_1,Блочные.стены_1,Деревянные.стены_1,Кирпичные.стены_1,Монолитные.стены_1
0,896,0.25,0,0.327273,0.117647,0.125,0,1,0,0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,399,0.25,0,0.314545,0.235294,0.125,0,1,0,0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,1515,0.25,0,0.390909,0.0,0.291667,0,0,0,1,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,677,0.0,0,0.211818,0.0,0.333333,0,1,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1485,0.0,0,0.318182,0.411765,0.291667,0,1,0,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [14]:
# Dataframe в pytorch датасет
x = torch.Tensor(dataframe.drop(columns=OUTPUT_COLUMN).to_numpy()).to(device=device, dtype=torch.float)

#y = MinMaxScaler().fit_transform(dataframe[[OUTPUT_COLUMN]].to_numpy())
#y = torch.Tensor(y).to(device=device, dtype=torch.float)
y = torch.tensor(dataframe[[OUTPUT_COLUMN]].to_numpy()).to(device=device, dtype=torch.float)

dataset = MyDataset(x, y)
train_dataset, validation_dataset, test_dataset = random_split(dataset, (0.8, 0.1, 0.1))
train_dataloader, validation_dataloader, test_dataloader = DataLoader(train_dataset, batch_size=8), DataLoader(validation_dataset, batch_size=len(dataset)), DataLoader(test_dataset, batch_size=len(dataset))


# Подготовка модели

In [6]:
class MyModel(nn.Module):
    """
    Класс для модели многослойного персептрона
    """
    def __init__(self, n_layers, in_features, hidden_features, out_features):
        super().__init__()
        if n_layers < 1:
            raise Exception("Incorrect number of layers")
        self.layers = []
        if n_layers == 1:
            self.layers.append(nn.Linear(in_features, out_features))
            
        else:
            for i in range(n_layers):
                if i == 0: # Первый слой
                    self.layers.append(nn.Linear(in_features, hidden_features))
                    self.layers.append(nn.ReLU())
                elif i == n_layers - 1: # Последний слой
                    self.layers.append(nn.Linear(hidden_features, out_features))
                else: # Промежуточные слои
                    self.layers.append(nn.Linear(hidden_features, hidden_features))
                    self.layers.append(nn.ReLU())
                
        self.run = nn.Sequential(*self.layers)
        
    def forward(self, x):
        return self.run(x)

Подбор параметров

In [17]:
def objective(trial: optuna.Trial):

    in_features = len(dataframe.columns) - 1 # Один из столбцов таблицы является выходным 
    out_features = 1 
    hidden_features = trial.suggest_int("hidden_features", 5, 50, step=5)
    n_layers = trial.suggest_int("n_layers", 1, 4)
    model = MyModel(n_layers, in_features, hidden_features, out_features).to(device=device)
    
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])
    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=1e-2)
    #optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=30)
    loss_fn = torch.nn.CrossEntropyLoss()
    N_EPOCH = 100
    for epoch in range(N_EPOCH):
        model.train()
        for data, label in train_dataloader:
            output = model(data)
            loss = loss_fn(output, label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        lr_scheduler.step()
        model.eval()

        for data, label in validation_dataloader:
                  
            output = model(data)
            validation_score = utils.get_regression_metrics(label.argmax(1).cpu(), output.argmax(1).cpu())['R2']

        
        trial.report(validation_score, epoch)

        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
        
    return validation_score

In [18]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=60)

[I 2024-04-30 10:48:29,325] A new study created in memory with name: no-name-1d0a2c66-42fe-41fc-a5e7-d505d64c58a0
[W 2024-04-30 10:48:31,821] Trial 0 failed with parameters: {'hidden_features': 5, 'n_layers': 1, 'optimizer': 'RMSprop'} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\admin\AppData\Local\Programs\Python\Python311\Lib\site-packages\optuna\study\_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\admin\AppData\Local\Temp\ipykernel_9308\2241653438.py", line 21, in objective
    loss.backward()
  File "c:\Users\admin\AppData\Local\Programs\Python\Python311\Lib\site-packages\torch\_tensor.py", line 522, in backward
    torch.autograd.backward(
  File "c:\Users\admin\AppData\Local\Programs\Python\Python311\Lib\site-packages\torch\autograd\__init__.py", line 266, in backward
    Variable._execution_engine.run_backward(  # Calls into the C++ engine to 

KeyboardInterrupt: 

In [None]:
pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])
print("Study statistics: ")
print("  Number of finished trials: ", len(study.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Study statistics: 
  Number of finished trials:  60
  Number of pruned trials:  0
  Number of complete trials:  60
Best trial:
  Value:  1.0
  Params: 
    hidden_features: 25
    n_layers: 1
    optimizer: RMSprop


In [15]:
def training_loop(n_epochs, optimizer, scheduler, model, loss_fn, train_loader, validation_loader, ):
    best_score = 0
    counter = 0
    for epoch in range(n_epochs):
        for data, label in train_loader:
            output = model(data)
            
            loss = loss_fn(output, label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        scheduler.step()
        
        for data, label in validation_loader:
                  
            output = model(data)
            
            validation_score = utils.get_regression_metrics(label.argmax(1).cpu(), output.argmax(1).cpu())['RMSE']



        if best_score < validation_score:
            counter += 1
            if counter == 5:
                print(f"Early stop on epoch {epoch}")
                break
        else:
            counter = 0
            best_score = validation_score

        if epoch == 1 or epoch % 5 == 0:

            print('{} Epoch {}, Training loss {}, Validation accuracy {}, lr {}'.format(
                datetime.datetime.now(),
                epoch,
                loss / len(train_loader),
                validation_score,
                scheduler.get_last_lr())
            )

In [16]:
in_features = len(dataframe.columns) - 1 # Один из столбцов таблицы является выходным
print(f"Число входных признаков - {in_features}")
out_features = 1
print(f"Число выходных признаков - {out_features}")
# Гиперпараметры модели, полученные выше
n_layers = 1 
hidden_features = 50
N_EPOCH = 100

model = MyModel(n_layers, in_features, hidden_features, out_features).to(device=device)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-7)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=30)
loss_fn = torch.nn.MSELoss()

training_loop(N_EPOCH, optimizer, lr_scheduler, model, loss_fn, train_dataloader, validation_dataloader)
torch.save(model.state_dict(), f"MyModel.pt")

Число входных признаков - 46
Число выходных признаков - 1
2024-04-30 10:49:42.357287 Epoch 0, Training loss 471887904.0, Validation accuracy 0.0, lr [1e-07]
2024-04-30 10:49:42.486779 Epoch 1, Training loss 483495488.0, Validation accuracy 0.0, lr [1e-07]
2024-04-30 10:49:43.060712 Epoch 5, Training loss 484618368.0, Validation accuracy 0.0, lr [1e-07]
2024-04-30 10:49:43.670721 Epoch 10, Training loss 484688096.0, Validation accuracy 0.0, lr [1e-07]
2024-04-30 10:49:44.266707 Epoch 15, Training loss 484751232.0, Validation accuracy 0.0, lr [1e-07]
2024-04-30 10:49:44.938718 Epoch 20, Training loss 484822304.0, Validation accuracy 0.0, lr [1e-07]
2024-04-30 10:49:45.573712 Epoch 25, Training loss 484888096.0, Validation accuracy 0.0, lr [1e-07]
2024-04-30 10:49:46.178710 Epoch 30, Training loss 2612913920.0, Validation accuracy 0.0, lr [1e-08]
2024-04-30 10:49:46.803757 Epoch 35, Training loss 2630668032.0, Validation accuracy 0.0, lr [1e-08]
2024-04-30 10:49:47.436778 Epoch 40, Traini

In [17]:
results = {}

In [19]:
# Размер батча для тестового даталоадера равен длине датасета, поэтому в цикле будет всего один проход
for data, label in test_dataloader:
    predicted = model(data)
    print(label.mean(), predicted.mean())
    #print(label == predicted)
    score = utils.get_regression_metrics(label.cpu().detach(), predicted.cpu().detach())
    results['Многослойный персептрон'] = score['R2']
    print(score)

tensor(2517391.2500, device='cuda:0') tensor(2560087.7500, device='cuda:0', grad_fn=<MeanBackward0>)
{'RMSE': 1064637.1, 'R2': 0.013109792675891296}
