In [1]:
import hashlib
import os
import tarfile
import zipfile 
import requests

#@save
# Registry of downloadable datasets: name -> (url, sha1 hex digest),
# which is the pair `download` unpacks for each entry.
DATA_HUB = {}
# Common URL prefix under which the dataset files are hosted.
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'

In [2]:
def download(name, cache_dir=os.path.join('..', 'data')):  #@save
    """Download a file inserted into DATA_HUB, return the local filename.

    Args:
        name: Key into DATA_HUB; the entry is a ``(url, sha1_hash)`` pair.
        cache_dir: Directory the file is stored in (created if missing).

    Returns:
        Path of the local file, named after the last URL path component.

    Raises:
        AssertionError: If ``name`` is not registered in DATA_HUB.
        requests.HTTPError: If the server answers with an error status.
    """
    assert name in DATA_HUB, f"{name} does not exist in {DATA_HUB}."
    url, sha1_hash = DATA_HUB[name]
    os.makedirs(cache_dir, exist_ok=True)
    fname = os.path.join(cache_dir, url.split('/')[-1])
    if os.path.exists(fname):
        sha1 = hashlib.sha1()
        with open(fname, 'rb') as f:
            # Hash in 1 MiB chunks so large files are never fully in memory.
            for data in iter(lambda: f.read(1048576), b''):
                sha1.update(data)
        if sha1.hexdigest() == sha1_hash:
            return fname  # Hit cache
    print(f'Downloading {fname} from {url}...')
    with requests.get(url, stream=True, verify=True) as r:
        # Fail before touching the file: otherwise an HTTP error page would
        # be saved under the dataset's name.
        r.raise_for_status()
        with open(fname, 'wb') as f:
            # Actually stream the body; `r.content` would buffer all of it.
            for chunk in r.iter_content(chunk_size=1048576):
                if chunk:
                    f.write(chunk)
    return fname

In [3]:
def download_extract(name, folder=None):  #@save
    """Download and extract a zip/tar file.

    Args:
        name: Dataset key passed on to ``download``.
        folder: Optional directory name, relative to the archive's directory,
            to return instead of the path derived from the archive name.

    Returns:
        ``<archive dir>/<folder>`` when ``folder`` is given, otherwise the
        archive path with its extension removed.

    Raises:
        AssertionError: If the file extension is not .zip, .tar or .gz.
    """
    fname = download(name)
    base_dir = os.path.dirname(fname)
    data_dir, ext = os.path.splitext(fname)
    if ext == '.zip':
        opener = zipfile.ZipFile
    elif ext in ('.tar', '.gz'):
        opener = tarfile.open
        # splitext only strips '.gz' from 'x.tar.gz'; drop the remaining
        # '.tar' so the returned path is 'x' rather than 'x.tar'.
        if ext == '.gz' and data_dir.endswith('.tar'):
            data_dir = data_dir[:-len('.tar')]
    else:
        # Explicit raise (not `assert False`) so the check survives `-O`.
        raise AssertionError('Only zip/tar files can be extracted.')
    # NOTE(review): members are extracted without path sanitisation; only
    # use with archives from a trusted source.
    with opener(fname, 'r') as fp:  # close the archive handle when done
        fp.extractall(base_dir)
    return os.path.join(base_dir, folder) if folder else data_dir

def download_all():  #@save
    """Fetch every dataset currently registered in DATA_HUB."""
    for dataset_name in DATA_HUB.keys():
        download(dataset_name)

In [4]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from d2l import torch as d2l

In [5]:
# Register the training CSV: (download URL, expected SHA-1 of the file).
DATA_HUB['kaggle_house_train'] = (  #@save
    f'{DATA_URL}kaggle_house_pred_train.csv',
    '585e9cc93e70b39160e7921475f9bcd7d31219ce',
)

In [6]:
# Register the test CSV: (download URL, expected SHA-1 of the file).
DATA_HUB['kaggle_house_test'] = (  #@save
    f'{DATA_URL}kaggle_house_pred_test.csv',
    'fa19780a7b011d9b009e8bff8e99922a8ee2eb90',
)

In [7]:
# Fetch (or reuse from cache) both CSV files and parse them, train first.
train_data, test_data = (
    pd.read_csv(download(dataset_name))
    for dataset_name in ('kaggle_house_train', 'kaggle_house_test')
)

In [8]:
# Show (rows, columns) of the train and test frames, one per line.
print(train_data.shape, test_data.shape, sep='\n')

(1460, 81)
(1459, 80)


In [9]:
# Peek at the first four rows, showing only columns 1-3 and the last three
# (column 0 is skipped).
print(train_data.iloc[0:4, [1,2,3,-3,-2,-1]])

   MSSubClass MSZoning  LotFrontage SaleType SaleCondition  SalePrice
0          60       RL         65.0       WD        Normal     208500
1          20       RL         80.0       WD        Normal     181500
2          60       RL         68.0       WD        Normal     223500
3          70       RL         60.0       WD       Abnorml     140000


In [10]:
# Training features only: drop the first column and the last column
# (the last column is SalePrice, per the preview above).
train_data.iloc[:, 1:-1].head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [11]:
# Test features: drop only the first column (the test frame has one column
# fewer than the training frame).
test_data.iloc[:,1:].head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,6,2010,WD,Normal
4,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,...,144,0,,,,0,1,2010,WD,Normal


In [12]:
# Stack train and test features into one frame so preprocessing is applied
# consistently to both. The first column (the id) is dropped from each frame,
# and the last training column (SalePrice, the label) is dropped as well.
# Row order is train first, then test; the original row indices are kept,
# so index labels repeat across the two parts.

all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:,1:]))

In [13]:
# Names of all columns whose dtype is not 'object', i.e. the numeric columns.
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index

In [14]:
# Sanity check: first few numeric column names.
numeric_features[:5]

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond'], dtype='object')

In [18]:
def apply_standardisation(x):
    """Rescale ``x`` to zero mean and unit standard deviation.

    The mean and standard deviation come from ``x`` itself, via its own
    ``mean()`` and ``std()`` methods.
    """
    centred = x - x.mean()
    spread = x.std()
    return centred / spread

# Standardise every numeric column independently. The statistics are computed
# over the combined train + test rows held in all_features.
all_features[numeric_features] = all_features[numeric_features].apply(apply_standardisation)
    

In [19]:
# After standardisation each numeric column has mean 0, so filling missing
# values with 0 is the same as imputing the column mean.
all_features[numeric_features] = all_features[numeric_features].fillna(0)

In [20]:
# Inspect the first rows after standardisation and imputation.
all_features[:5]

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,0.06732,RL,0.216038,-0.217841,Pave,,Reg,Lvl,AllPub,Inside,...,-0.285886,-0.063139,,,,-0.089577,-1.551918,0.157619,WD,Normal
1,-0.873466,RL,0.664045,-0.072032,Pave,,Reg,Lvl,AllPub,FR2,...,-0.285886,-0.063139,,,,-0.089577,-0.446848,-0.602858,WD,Normal
2,0.06732,RL,0.305639,0.137173,Pave,,IR1,Lvl,AllPub,Inside,...,-0.285886,-0.063139,,,,-0.089577,1.026577,0.157619,WD,Normal
3,0.302516,RL,0.066702,-0.078371,Pave,,IR1,Lvl,AllPub,Corner,...,-0.285886,-0.063139,,,,-0.089577,-1.551918,-1.363335,WD,Abnorml
4,0.06732,RL,0.783513,0.518814,Pave,,IR1,Lvl,AllPub,FR2,...,-0.285886,-0.063139,,,,-0.089577,2.131647,0.157619,WD,Normal


In [22]:
# One-hot encode the remaining (object-typed) columns; dummy_na=True adds an
# extra indicator column per feature for missing values.
# dtype=float keeps every column numeric: recent pandas versions emit bool
# dummies by default, which makes `.values` an object array that
# torch.tensor cannot convert.
all_features = pd.get_dummies(all_features, dummy_na=True, dtype=float)

In [23]:
# Row count is unchanged by encoding; the column count grows with the
# indicator columns.
all_features.shape

(2919, 331)

In [26]:
# Number of training rows; the first n_train rows of all_features are the
# training examples because train was concatenated before test.
n_train = train_data.shape[0]
n_train

1460

In [28]:
# Split the combined frame back into train / test by position (index labels
# repeat after the concat, so position is the only reliable key).
# Casting to float32 in NumPy first avoids an object-dtype array when the
# frame mixes float and bool/uint8 columns, which torch.tensor would reject.
train_features = torch.tensor(all_features.iloc[:n_train].to_numpy(dtype='float32'), dtype=torch.float32)
test_features = torch.tensor(all_features.iloc[n_train:].to_numpy(dtype='float32'), dtype=torch.float32)
# Labels as a column vector of shape (n_train, 1) to match the network output.
train_labels = torch.tensor(train_data['SalePrice'].to_numpy().reshape(-1, 1), dtype=torch.float32)

In [29]:
# Mean-squared-error criterion; used for training and, on log-prices, inside
# log_rmse.
loss = nn.MSELoss()

In [30]:
# (number of training rows, number of input features)
train_features.shape

torch.Size([1460, 331])

In [32]:
# Number of input features per example (second dimension of the tensor).
in_features = train_features.shape[1]

def get_net(in_features=in_features):
    """Build a fresh linear-regression model with a single output unit.

    Args:
        in_features: Input width of the linear layer. The default is the
            value of the module-level ``in_features`` at definition time.

    Returns:
        An ``nn.Sequential`` wrapping one ``nn.Linear(in_features, 1)``.
    """
    return nn.Sequential(nn.Linear(in_features, 1))

In [34]:
def log_rmse(net, features, labels, loss):
    """Root-mean-squared error between log-predictions and log-labels.

    Args:
        net: Callable mapping ``features`` to predictions.
        features: Input tensor passed to ``net``.
        labels: Target tensor; must be positive for the logarithm to be
            finite.
        loss: Two-argument criterion ``loss(input, target)``; assumed to be
            a mean-squared-error loss so the square root yields an RMSE.

    Returns:
        The error as a Python float.
    """
    # Evaluation only: no need to record a computation graph.
    with torch.no_grad():
        # Clamp predictions to >= 1 so the logarithm stays finite and
        # non-negative.
        y_hat = torch.clamp(net(features), 1, float('inf'))
        return torch.sqrt(loss(torch.log(y_hat), torch.log(labels))).item()

In [36]:
def train(net, train_features, train_labels, test_features, test_labels, num_epochs, loss, batch_size, weight_decay, lr):
    """Train ``net`` with Adam on mini-batches and track log-RMSE per epoch.

    Args:
        net: Model to optimise in place.
        train_features, train_labels: Training tensors with matching first
            dimension.
        test_features, test_labels: Held-out tensors; pass ``test_labels=None``
            to skip evaluation on them.
        num_epochs: Number of passes over the training data.
        loss: Criterion called as ``loss(prediction, target)``.
        batch_size: Mini-batch size.
        weight_decay: Weight-decay coefficient passed to Adam.
        lr: Learning rate passed to Adam.

    Returns:
        ``(train_ls, test_ls)``: per-epoch log-RMSE lists. ``test_ls`` is
        empty when ``test_labels`` is None.
    """
    train_dataset = torch.utils.data.TensorDataset(train_features, train_labels)
    train_iter = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

    train_ls = []
    test_ls = []

    optimizer = torch.optim.Adam(net.parameters(), lr=lr, weight_decay=weight_decay)

    for epoch in range(num_epochs):
        for X, y in train_iter:
            y_hat = net(X)
            l = loss(y_hat, y)

            optimizer.zero_grad()
            # Backpropagate through the computed loss tensor, not the
            # criterion module itself.
            l.backward()
            optimizer.step()

        # Evaluate on the full training set once per epoch.
        train_error = log_rmse(net, train_features, train_labels, loss)
        train_ls.append(train_error)
        print(f"The rmse loss for the epoch {epoch} is {train_error}")

        if test_labels is not None:
            test_error = log_rmse(net, test_features, test_labels, loss)
            test_ls.append(test_error)
            # Labelled as "test" so it can be told apart from the line above.
            print(f"The test rmse loss for the epoch {epoch} is {test_error}")

    return train_ls, test_ls
        

In [1]:
def get_K_fold(k, i, X, y):
    """Split ``X``/``y`` into training and validation parts for fold ``i``.

    The rows are cut into ``k`` contiguous folds of ``X.shape[0] // k`` rows
    each (any remainder rows at the end are not used). Fold ``i`` becomes the
    validation set; the other folds are concatenated, in order, into the
    training set.

    Args:
        k: Number of folds; must be greater than 1.
        i: Index of the validation fold, ``0 <= i < k``.
        X: 2-D feature tensor.
        y: 2-D label tensor with the same number of rows as ``X``.

    Returns:
        ``(X_train, y_train, X_valid, y_valid)``.

    Raises:
        AssertionError: If ``k <= 1`` or ``i`` is outside ``[0, k)``.
    """
    assert k > 1
    assert 0 <= i < k, f"fold index {i} is out of range for k={k}"

    fold_size = X.shape[0] // k

    X_train, y_train = None, None

    for j in range(k):
        current_index = slice(j * fold_size, (j + 1) * fold_size)
        X_part, y_part = X[current_index, :], y[current_index, :]

        if j == i:
            val_train = X_part
            val_label = y_part
        elif X_train is None:
            # First training fold: nothing to concatenate with yet.
            X_train = X_part
            y_train = y_part
        else:
            X_train = torch.cat([X_train, X_part], 0)
            y_train = torch.cat([y_train, y_part], 0)

    return X_train, y_train, val_train, val_label