# Домашнее задание

Будем практиковаться на датасете недвижимости (sklearn.datasets.fetch_california_housing)

Ваша задача:
1. Создать Dataset для загрузки данных
2. Обернуть его в Dataloader
3. Написать архитектуру сети, которая предсказывает стоимость недвижимости. Сеть должна включать BatchNorm слои и Dropout (или НЕ включать, но нужно обосновать)
4. Сравните сходимость Adam, RMSProp и SGD, сделайте вывод по качеству работы модели

train-test разделение нужно сделать с помощью sklearn random_state=13, test_size = 0.25

In [1]:
import numpy as np
import pandas as pd
import math
import torch

import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cmx
import seaborn as sns

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

from PIL import Image
from torchvision import transforms, datasets

import torch.nn.functional as F
import torch.nn as nn

from sklearn.metrics import r2_score

In [2]:
california = fetch_california_housing()
feature_names = california.feature_names
data = california.data
df = pd.DataFrame(data, columns=feature_names)
df['target'] = california.target
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [3]:
df.shape

(20640, 9)

In [4]:
X = california['data']
y = california['target']

In [5]:
# Масштабирование
X_sc = StandardScaler().fit(X).transform(X)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=13)
print(X_train.shape, type(X_train))
print(X_test.shape, type(X_test))
print(y_train.shape, type(y_train))
print(y_test.shape, type(y_test))

(15480, 8) <class 'numpy.ndarray'>
(5160, 8) <class 'numpy.ndarray'>
(15480,) <class 'numpy.ndarray'>
(5160,) <class 'numpy.ndarray'>


In [7]:
class MyCalifornia(torch.utils.data.Dataset):
    
    def __init__(self,df_train, y_train):
        
        df_train = np.array(df_train)
        y_train = np.array(y_train)
        self.x=torch.from_numpy(df_train)
        self.y = torch.from_numpy(y_train)
        self.n_samples= df_train.shape[0]
        
    
    def __getitem__(self, index):
        
        return self.x[index], self.y[index]
        
    def __len__(self):
        return self.n_samples
    

In [8]:
trans_actions = transforms.Compose([transforms.ToTensor()])

dataset =  MyCalifornia(X_train, y_train)

In [9]:
first_data = dataset[0]
first_data

(tensor([ 3.5174e+00,  3.6000e+01,  4.5479e+00,  1.0944e+00,  1.3570e+03,
          2.0654e+00,  3.4210e+01, -1.1823e+02], dtype=torch.float64),
 tensor(2.6800, dtype=torch.float64))

In [10]:
features, labels = first_data
print(features, labels)

tensor([ 3.5174e+00,  3.6000e+01,  4.5479e+00,  1.0944e+00,  1.3570e+03,
         2.0654e+00,  3.4210e+01, -1.1823e+02], dtype=torch.float64) tensor(2.6800, dtype=torch.float64)


In [11]:
train_loader = torch.utils.data.DataLoader(dataset,
                          batch_size=4,
                          shuffle=True, num_workers=2)

In [12]:
test_dataset = MyCalifornia(X_test, y_test)

test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=8,
                                          shuffle=False)

In [13]:
batch_size = 100
num_epochs = 10
learning_rate = 0.1
size_hidden= 50

batch_no = len(X_train)
cols=X_train.shape[1] 
n_output=1

class Net(torch.nn.Module):
    def __init__(self, n_feature, size_hidden, n_output):
        super(Net, self).__init__()
        self.hidden = torch.nn.Linear(cols, 4*size_hidden)   # hidden layer
        self.fc2 = nn.Linear(4 * size_hidden, 2 * size_hidden)
        self.fc3 = nn.Linear(2 * size_hidden, size_hidden)
        self.bn = nn.BatchNorm1d(size_hidden)
        self.dp = nn.Dropout(0.25)
        self.predict = torch.nn.Linear(size_hidden, n_output)   # output layer

    def forward(self, x):
        x = self.hidden(x)
        x = F.relu(x)      # activation function for hidden layer
        x = self.fc2(x)
        x = F.leaky_relu(x)
        x = self.fc3(x)
        x = F.leaky_relu(x)
        x = self.dp(x)
        x = self.bn(x)
        x = self.predict(x)             # linear output
        return x
    
net = Net(cols, size_hidden, n_output)

In [14]:
optimizer_1 = [torch.optim.Adam(net.parameters(), lr=0.1),"Adam"]
optimizer_2 = [torch.optim.RMSprop(net.parameters(), lr=0.1, alpha=0.99),"RMSprop"]
optimizer_4 = [torch.optim.Adagrad(net.parameters(), lr=0.1), "Adagrad"]
optimizer_3 = [torch.optim.SGD(net.parameters(), lr=0.1),"SGD"]
criterion = torch.nn.MSELoss(size_average=False)



In [15]:
from sklearn.utils import shuffle
from torch.autograd import Variable
# running_loss = 0.0
for optimizer in [optimizer_1, optimizer_2,optimizer_4,optimizer_3]:
    print(optimizer[1])
    optimizer=optimizer[0]
    running_loss, running_items, running_right = 0.0, 0.0, 0.0
    for epoch in range(num_epochs):
        #Shuffle just mixes up the dataset between epocs
        X_train, y_train = shuffle(X_train, y_train)
        # Mini batch learning
        for i in range(batch_no):
            start = i * batch_size
            end = start + batch_size
            inputs = Variable(torch.FloatTensor(X_train[start:end]))
            labels = Variable(torch.FloatTensor(y_train[start:end]))
            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            #print("outputs",outputs)
            #print("outputs",outputs,outputs.shape,"labels",labels, labels.shape)
            loss = criterion(outputs, torch.unsqueeze(labels,dim=1))
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            running_items += len(labels)


        print('Epoch {}'.format(epoch+1), "loss: ",running_loss, running_items)
        running_loss = 0.0

    X = Variable(torch.FloatTensor(X_train)) 
    result = net(X)
    pred=result.data[:,0].numpy()
    print(len(pred),len(y_train))
    print(f"r2_score {r2_score(pred,y_train)}")

Adam
Epoch 1 loss:  23792.753440856934 15480.0
Epoch 2 loss:  44071.32302093506 30960.0
Epoch 3 loss:  43361.70469665527 46440.0
Epoch 4 loss:  45135.29837036133 61920.0
Epoch 5 loss:  41714.28016662598 77400.0
Epoch 6 loss:  30208.643913269043 92880.0
Epoch 7 loss:  56417.70571899414 108360.0
Epoch 8 loss:  32026.065399169922 123840.0
Epoch 9 loss:  46050.5751953125 139320.0
Epoch 10 loss:  54175.84662628174 154800.0
15480 15480
r2_score -17.947949640048762
RMSprop
Epoch 1 loss:  491598.0480270386 15480.0
Epoch 2 loss:  807722.1161804199 30960.0
Epoch 3 loss:  1063910.1572113037 46440.0
Epoch 4 loss:  984378.9000701904 61920.0
Epoch 5 loss:  1238698.4234085083 77400.0
Epoch 6 loss:  724665.4023895264 92880.0
Epoch 7 loss:  769299.3152694702 108360.0
Epoch 8 loss:  747007.7965927124 123840.0
Epoch 9 loss:  951812.1020507812 139320.0
Epoch 10 loss:  966501.1037979126 154800.0
15480 15480
r2_score -8.016241555021658
Adagrad
Epoch 1 loss:  25455.787796020508 15480.0
Epoch 2 loss:  18936.9

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

без Dropout существенно теряется качество

BatchNorm1d нормализация, также нормальзовал данные перед входом в сеть

по цифрам видно отличие результата оптимизаторов, на мой взгляд Adam всех практичнее. SGD выдает nan на loss, не понял почему