<a href="https://colab.research.google.com/github/dave502/PyTorch/blob/main/HW_03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. Создать Dataset для загрузки данных (используем только числовые данные)
2. Обернуть его в Dataloader
3. Написать архитектуру сети, которая предсказывает число показов на основании числовых данных (вы всегда можете нагенерить дополнительных факторов). Сеть должна включать BatchNorm слои и Dropout (или НЕ включать, но нужно обосновать)
4. Учить будем на функцию потерь с кагла (log RMSE) - нужно её реализовать
5. Сравните сходимость Adam, RMSProp и SGD, сделайте вывод по качеству работы модели

train-test разделение нужно сделать с помощью sklearn random_state=13, test_size = 0.25

Вопросы? в личку @Kinetikm

In [None]:
# !pip install --upgrade wandb
# !pip install --upgrade git+https://github.com/PytorchLightning/pytorch-lightning.git

In [None]:
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch import Tensor
import pandas as pd
import numpy as np
from collections import namedtuple
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm
from pathlib import Path
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the prepared training CSV that the dataset class reads.
# Relative path for a local Jupyter run:
# df_path = "data/df_train.csv"
# Path on the Google Drive mounted at /content/drive (Colab run):
df_path = "/content/drive/MyDrive/MLData/df_train.csv"

*Этот код загрузки датасета и разделения его на обучающий и тестовый писался для Jupyter, для колаба я уже не стал его переписывать*

In [None]:
# %%capture
# !pip install kaggle
# from kaggle.api.kaggle_api_extended import KaggleApi
# api = KaggleApi()
# api.authenticate()
# api.competition_download_file('avito-demand-prediction', 'train.csv', path='data/')

In [None]:
# if not Path(df_path).is_file():
#     data = pd.read_csv('data/train.csv.zip')
    
#     X_train, X_test = train_test_split(data, test_size = 0.25, random_state=13)
#     print(f'{X_train.shape=} {X_test.shape=}')

#     cat_columns=['region', 'parent_category_name', 'user_type']
#     X_train_mod = pd.get_dummies(X_train, columns=cat_columns) 
    
#     X_train_mod.to_csv(df_path)

In [None]:
class Dataset(torch.utils.data.Dataset):
    """In-memory dataset built from the numeric columns of a CSV file.

    The file is read with pandas, every non-numeric column is discarded,
    missing values are replaced with 0 and the table is split into a feature
    matrix (``X_train``) and the ``deal_probability`` target (``y_train``),
    both stored as float32 tensors.

    NOTE(review): every numeric column except ``deal_probability`` becomes a
    feature. That includes an index column written by ``DataFrame.to_csv``
    without ``index=False`` -- confirm this is intended.
    """

    def __init__(self, path):
        """Load the CSV at ``path`` and convert it to tensors.

        Args:
            path: location of a CSV file that contains a numeric
                ``deal_probability`` column.

        Raises:
            KeyError: if the file has no numeric ``deal_probability`` column.
        """
        data = pd.read_csv(path)
        # Keep only the numeric columns (text columns are dropped).
        data = data.select_dtypes(include=np.number)
        # Fill the gaps in the numeric columns.
        data = data.fillna(0)
        # Split into features and target. ``Index.difference`` returns the
        # names sorted, so the feature order is deterministic.
        feature_columns = data.columns.difference(['deal_probability'])
        self.X_train = torch.tensor(data[feature_columns].values, dtype=torch.float32)
        self.y_train = torch.tensor(data['deal_probability'].values, dtype=torch.float32)

    def __len__(self):
        """Return the number of samples."""
        return len(self.y_train)

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair stored at ``index``."""
        return self.X_train[index], self.y_train[index]

    def dim(self):
        """Return the number of features (the input size for the model)."""
        return self.X_train.size(1)

In [None]:
# Build the training dataset from the CSV file referenced by df_path.
train_ds = Dataset(df_path)

In [None]:
# Mini-batch loader for training. shuffle=True draws the samples in a new
# random order every epoch instead of replaying identical batches;
# drop_last=True discards the final incomplete batch so every batch has
# exactly batch_size samples.
# NOTE(review): num_workers=10 is kept from the original; it may exceed the
# number of CPU cores available on the host -- confirm.
train_loader = torch.utils.data.DataLoader(
    dataset=train_ds,
    batch_size=64,
    shuffle=True,
    num_workers=10,
    drop_last=True,
)

Код нейронной сети из второго ДЗ с небольшими изменениями в составе слоёв.

In [None]:
class Perceptron(nn.Module):
    """A fully connected layer followed by a selectable activation function.

    Args:
        input_dim: size of each input sample.
        output_dim: size of each output sample.
        activation: name of the activation to apply; one of the fields of
            ``Perceptron.activations`` (``relu``, ``silu``, ``leaky_relu``,
            ``sigmoid``, ``softmax``).
        activation_params: extra positional arguments passed to the
            activation after the tensor, e.g. ``[0.2]`` as the negative slope
            of ``leaky_relu`` or ``[1]`` as the ``dim`` of ``softmax``.

    Raises:
        AttributeError: if ``activation`` is not a known activation name.
    """

    # Named tuple with the supported activation functions (a tuple so that the
    # set cannot be modified).
    Activatons = namedtuple('Activatons', 'relu silu leaky_relu sigmoid softmax')
    activations = Activatons(F.relu, F.silu, F.leaky_relu, F.sigmoid, F.softmax)

    def __init__(self, input_dim, output_dim, activation='relu', activation_params=()):
        super().__init__()
        # fc - the fully connected layer
        self.fc = nn.Linear(input_dim, output_dim)
        # ac - the activation function
        self.ac = getattr(self.activations, activation)
        # ac_params - arguments of the activation function; stored as an
        # immutable tuple so no list is shared between instances
        self.ac_params = tuple(activation_params or ())

    def forward(self, x):
        """Apply the linear layer and then the activation to ``x``."""
        x = self.fc(x)
        return self.ac(x, *self.ac_params)

In [None]:
class Model(nn.Module):
    """Feed-forward network with BatchNorm and Dropout between dense layers.

    Layer order: BatchNorm over the raw inputs, a SiLU dense layer, BatchNorm,
    Dropout(0.25), a LeakyReLU dense layer, BatchNorm, Dropout(0.25) and a
    final sigmoid dense layer, so every output lies in (0, 1).

    Args:
        input_dim: number of input features.
        hidden_dim: base hidden size; both hidden layers are ``4 * hidden_dim``
            units wide.
        output_dim: number of output values per sample.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()

        hidden_width = 4 * hidden_dim
        self.layers = nn.ModuleList([
            nn.BatchNorm1d(input_dim),
            Perceptron(input_dim, hidden_width, "silu"),
            nn.BatchNorm1d(hidden_width),
            nn.Dropout(0.25),
            Perceptron(hidden_width, hidden_width, "leaky_relu"),
            nn.BatchNorm1d(hidden_width),
            nn.Dropout(0.25),
            Perceptron(hidden_width, output_dim, "sigmoid"),
        ])

    def forward(self, x):
        """Pass ``x`` through every layer in order and return the result."""
        for layer in self.layers:
            # Call the module itself rather than ``.forward`` so that
            # registered hooks are executed.
            x = layer(x)
        return x

Функция потерь log RMSE: $\log\sqrt{\frac{1}{N}\sum_{i=1}^{N}\left(y_i - \hat{y}_i\right)^2}$

In [None]:
class LogRMSE(torch.nn.modules.loss._Loss):
    """Logarithm of the root-mean-square error.

    Computes ``log(sqrt(mean((target - input) ** 2)))`` over all elements and
    returns a scalar tensor.

    Args:
        reduction: forwarded to the base class and stored as
            ``self.reduction``; ``forward`` does not consult it and always
            averages over all elements.
        eps: lower bound applied to the mean squared error before the square
            root, so that a perfect fit yields a finite loss and a finite
            gradient instead of ``-inf`` / ``nan``.
    """

    def __init__(self, reduction: str = 'none', eps: float = 1e-12) -> None:
        super().__init__(None, None, reduction)
        self.eps = eps

    def forward(self, input: Tensor, target: Tensor) -> Tensor:
        """Return the log-RMSE between ``input`` and ``target``.

        When the two tensors hold the same number of elements but differ in
        shape (e.g. ``(N, 1)`` predictions against ``(N,)`` targets), the
        prediction is reshaped to the target's shape. Without this the
        subtraction would broadcast to an ``(N, N)`` matrix and the loss would
        be computed over all pairs instead of element by element.
        """
        if input.shape != target.shape and input.numel() == target.numel():
            input = input.reshape(target.shape)
        squared_error = torch.pow(target - input, 2)
        mse = torch.clamp(torch.mean(squared_error), min=self.eps)
        return torch.log(torch.sqrt(mse))


In [None]:
# A single network instance; its input size is taken from the dataset.
model = Model(train_ds.dim(), 200, 1)

# Optimizers under comparison, keyed by display name and all created with the
# same learning rate. NOTE: every one of them is bound to the parameters of the
# same `model` object, so running them one after another keeps updating the
# same weights unless the model is reset between runs.
optimizers = {
    name: optimizer_cls(model.parameters(), lr=0.01)
    for name, optimizer_cls in (
        ('SGD', torch.optim.SGD),
        ('Adam', torch.optim.Adam),
        ('RMSProp', torch.optim.RMSprop),
    )
}
# Loss function used for training.
criterion = LogRMSE()

In [None]:
EPOCHS = 5
LOG_EVERY = 5000  # print the averaged loss once per LOG_EVERY mini-batches

# All optimizers are bound to the parameters of the same `model`. Snapshot the
# initial weights and buffers and restore them before each run so that every
# optimizer starts from the same point; otherwise each one would continue from
# the weights left by the previous optimizer and the comparison would be
# meaningless. `load_state_dict` copies values into the existing parameter
# tensors, so the optimizers keep pointing at valid parameters.
initial_state = {key: value.clone() for key, value in model.state_dict().items()}

for optimizer_name, optimizer in optimizers.items():
    print(f'-------------optimizer {optimizer_name}-------------\n')
    model.load_state_dict(initial_state)
    model.train()  # make sure Dropout and BatchNorm run in training mode
    for epoch in tqdm(range(EPOCHS)):
        running_loss = 0.0
        for i, (inputs, labels) in enumerate(train_loader):
            # reset the gradients accumulated on the previous step
            optimizer.zero_grad()
            # The model emits one value per sample with a trailing dimension
            # of size 1, while the labels are one-dimensional; drop that
            # dimension so the loss compares element by element instead of
            # broadcasting.
            outputs = model(inputs).squeeze(-1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # training statistics: average loss over the last LOG_EVERY batches
            running_loss += loss.item()
            if (i + 1) % LOG_EVERY == 0:
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / LOG_EVERY))
                running_loss = 0.0
    print('\nTraining is finished!')

-------------optimizer SGD-------------



  0%|          | 0/5 [00:00<?, ?it/s]

[1,     1] loss: -0.000
[1,  5001] loss: -1.345
[1, 10001] loss: -1.365
[1, 15001] loss: -1.366


 20%|██        | 1/5 [02:53<11:32, 173.20s/it]

[2,     1] loss: -0.000
[2,  5001] loss: -1.366
[2, 10001] loss: -1.365
[2, 15001] loss: -1.366


 40%|████      | 2/5 [05:30<08:11, 163.94s/it]

[3,     1] loss: -0.000
[3,  5001] loss: -1.366
[3, 10001] loss: -1.365
[3, 15001] loss: -1.366


 60%|██████    | 3/5 [08:04<05:18, 159.26s/it]

[4,     1] loss: -0.000
[4,  5001] loss: -1.366
[4, 10001] loss: -1.365
[4, 15001] loss: -1.366


 80%|████████  | 4/5 [10:35<02:36, 156.25s/it]

[5,     1] loss: -0.000
[5,  5001] loss: -1.366
[5, 10001] loss: -1.365
[5, 15001] loss: -1.366


100%|██████████| 5/5 [13:10<00:00, 158.07s/it]



Training is finished!
-------------optimizer Adam-------------



  0%|          | 0/5 [00:00<?, ?it/s]

[1,     1] loss: -0.000
[1,  5001] loss: -1.359
[1, 10001] loss: -1.362
[1, 15001] loss: -1.364


 20%|██        | 1/5 [03:22<13:31, 202.88s/it]

[2,     1] loss: -0.000
[2,  5001] loss: -1.364
[2, 10001] loss: -1.363
[2, 15001] loss: -1.364


 40%|████      | 2/5 [06:44<10:05, 201.96s/it]

[3,     1] loss: -0.000
[3,  5001] loss: -1.364
[3, 10001] loss: -1.363
[3, 15001] loss: -1.364


 60%|██████    | 3/5 [10:41<07:16, 218.27s/it]

[4,     1] loss: -0.000
[4,  5001] loss: -1.364
[4, 10001] loss: -1.363
[4, 15001] loss: -1.364


 80%|████████  | 4/5 [14:29<03:41, 221.88s/it]

[5,     1] loss: -0.000
[5,  5001] loss: -1.364
[5, 10001] loss: -1.363
[5, 15001] loss: -1.364


100%|██████████| 5/5 [18:29<00:00, 221.81s/it]



Training is finished!
-------------optimizer RMSProp-------------



  0%|          | 0/5 [00:00<?, ?it/s]

[1,     1] loss: -0.000
[1,  5001] loss: -1.247
[1, 10001] loss: -1.245
[1, 15001] loss: -1.247


 20%|██        | 1/5 [08:01<32:07, 481.88s/it]

[2,     1] loss: -0.000
[2,  5001] loss: -1.247
[2, 10001] loss: -1.245
[2, 15001] loss: -1.247


 40%|████      | 2/5 [28:08<45:24, 908.12s/it]

[3,     1] loss: -0.000
[3,  5001] loss: -1.247
[3, 10001] loss: -1.245
[3, 15001] loss: -1.247


 60%|██████    | 3/5 [48:27<35:00, 1050.36s/it]

[4,     1] loss: -0.000
[4,  5001] loss: -1.247
