In [None]:
import pandas as pd
import numpy as np
import torch
import torchvision
from torch import nn,optim
import matplotlib.pyplot as plt
%matplotlib auto

In [2]:
train_data=pd.read_csv(r'F:\study\ml\california-house-prices\train.csv')
test_data=pd.read_csv(r'F:\study\ml\california-house-prices\test.csv')

In [3]:
train_data.shape

(47439, 41)

In [4]:
test_data.shape

(31626, 40)

In [5]:
train_data.head()

Unnamed: 0,Id,Address,Sold Price,Summary,Type,Year built,Heating,Cooling,Parking,Lot,...,Parking features,Tax assessed value,Annual tax amount,Listed On,Listed Price,Last Sold On,Last Sold Price,City,Zip,State
0,0,540 Pine Ln,3825000.0,"540 Pine Ln, Los Altos, CA 94022 is a single f...",SingleFamily,1969.0,"Heating - 2+ Zones, Central Forced Air - Gas","Multi-Zone, Central AC, Whole House / Attic Fan","Garage, Garage - Attached, Covered",1.0,...,"Garage, Garage - Attached, Covered",886486.0,12580.0,2019-10-24,4198000.0,,,Los Altos,94022,CA
1,1,1727 W 67th St,505000.0,"HURRY, HURRY.......Great house 3 bed and 2 bat...",SingleFamily,1926.0,Combination,"Wall/Window Unit(s), Evaporative Cooling, See ...","Detached Carport, Garage",4047.0,...,"Detached Carport, Garage",505000.0,6253.0,2019-10-16,525000.0,2019-08-30,328000.0,Los Angeles,90047,CA
2,2,28093 Pine Ave,140000.0,'THE PERFECT CABIN TO FLIP! Strawberry deligh...,SingleFamily,1958.0,Forced air,,0 spaces,9147.0,...,,49627.0,468.0,2019-08-25,180000.0,,,Strawberry,95375,CA
3,3,10750 Braddock Dr,1775000.0,Rare 2-story Gated 5 bedroom Modern Mediterran...,SingleFamily,1947.0,Central,Central Air,"Detached Carport, Driveway, Garage - Two Door",,...,"Detached Carport, Driveway, Garage - Two Door",1775000.0,20787.0,2019-10-24,1895000.0,2016-08-30,1500000.0,Culver City,90230,CA
4,4,7415 O Donovan Rd,1175000.0,Beautiful 200 acre ranch land with several pas...,VacantLand,,,,0 spaces,,...,,,,2019-06-07,1595000.0,2016-06-27,900000.0,Creston,93432,CA


In [6]:
all_features=pd.concat((train_data.iloc[:,1:-1],test_data.iloc[:,1]))

In [7]:
numeric_features=all_features.dtypes[all_features.dtypes != 'object'].index

In [8]:
numeric_features

Index(['Annual tax amount', 'Bathrooms', 'Elementary School Distance',
       'Elementary School Score', 'Full bathrooms', 'Garage spaces',
       'High School Distance', 'High School Score', 'Last Sold Price',
       'Listed Price', 'Lot', 'Middle School Distance', 'Middle School Score',
       'Sold Price', 'Tax assessed value', 'Total interior livable area',
       'Total spaces', 'Year built', 'Zip'],
      dtype='object')

In [9]:
numeric_features_name=numeric_features.tolist()
numeric_features_name

['Annual tax amount',
 'Bathrooms',
 'Elementary School Distance',
 'Elementary School Score',
 'Full bathrooms',
 'Garage spaces',
 'High School Distance',
 'High School Score',
 'Last Sold Price',
 'Listed Price',
 'Lot',
 'Middle School Distance',
 'Middle School Score',
 'Sold Price',
 'Tax assessed value',
 'Total interior livable area',
 'Total spaces',
 'Year built',
 'Zip']

In [10]:
numeric_features_name.remove('Sold Price')

In [11]:
numeric_features_name

['Annual tax amount',
 'Bathrooms',
 'Elementary School Distance',
 'Elementary School Score',
 'Full bathrooms',
 'Garage spaces',
 'High School Distance',
 'High School Score',
 'Last Sold Price',
 'Listed Price',
 'Lot',
 'Middle School Distance',
 'Middle School Score',
 'Tax assessed value',
 'Total interior livable area',
 'Total spaces',
 'Year built',
 'Zip']

In [12]:
all_features[numeric_features].isna().sum()

Annual tax amount              35936
Bathrooms                      35091
Elementary School Distance     36368
Elementary School Score        36522
Full bathrooms                 39491
Garage spaces                  32543
High School Distance           36627
High School Score              36845
Last Sold Price                49392
Listed Price                   31626
Lot                            45807
Middle School Distance         48330
Middle School Score            48331
Sold Price                     31626
Tax assessed value             35278
Total interior livable area    34152
Total spaces                   32542
Year built                     32671
Zip                            31626
dtype: int64

In [13]:
all_features[numeric_features_name]=all_features[numeric_features_name].apply(lambda x : (x-x.mean()) / x.std())

In [14]:
all_features[numeric_features_name].mean()

Annual tax amount              1.154834e-16
Bathrooms                     -7.522674e-16
Elementary School Distance    -1.295063e-14
Elementary School Score       -1.860748e-15
Full bathrooms                 5.441759e-16
Garage spaces                 -6.411489e-18
High School Distance          -5.113973e-15
High School Score             -2.770104e-15
Last Sold Price               -4.286326e-15
Listed Price                   7.066880e-17
Lot                           -1.141437e-16
Middle School Distance        -1.888407e-15
Middle School Score            1.745464e-15
Tax assessed value             1.565043e-16
Total interior livable area    3.484298e-18
Total spaces                  -3.626773e-16
Year built                     1.295279e-15
Zip                            9.063299e-16
dtype: float64

In [15]:
all_features[numeric_features_name]=all_features[numeric_features_name].fillna(all_features[numeric_features_name].mean())

In [16]:
n_train=train_data.shape[0]

In [17]:
all_features.head()

Unnamed: 0,0,Address,Annual tax amount,Appliances included,Bathrooms,Bedrooms,City,Cooling,Cooling features,Elementary School,...,Parking features,Region,Sold Price,Summary,Tax assessed value,Total interior livable area,Total spaces,Type,Year built,Zip
0,,540 Pine Ln,0.1889303,"Dishwasher, Dryer, Garbage disposal, Microwave...",-1.981522,"Ground Floor Bedroom, Master Bedroom on Ground...",Los Altos,"Multi-Zone, Central AC, Whole House / Attic Fan",Central,Santa Rita Elementary School,...,"Garage, Garage - Attached, Covered",Los Altos,3825000.0,"540 Pine Ln, Los Altos, CA 94022 is a single f...",0.08652149,-0.006935771,-0.1738998,SingleFamily,0.0848073,0.32818
1,,1727 W 67th St,-0.2667658,,-0.2991593,3,Los Angeles,"Wall/Window Unit(s), Evaporative Cooling, See ...","Wall/Window Unit(s), Evaporative Cooling, See ...",Raymond Avenue Elementary School,...,"Detached Carport, Garage",Los Angeles,505000.0,"HURRY, HURRY.......Great house 3 bed and 2 bat...",-0.2429718,-0.005889444,-0.06293187,SingleFamily,-0.2101123,-1.427982
2,,28093 Pine Ave,-0.6834248,,0.5420218,2,Strawberry,,,,...,,Strawberry,140000.0,'THE PERFECT CABIN TO FLIP! Strawberry deligh...,-0.636282,-0.005553082,-0.1738998,SingleFamily,0.009362749,0.925937
3,,10750 Braddock Dr,0.7800315,Dishwasher,0.5420218,5,Culver City,Central Air,Central Air,Farragut Elementary School,...,"Detached Carport, Driveway, Garage - Two Door",Culver City,1775000.0,Rare 2-story Gated 5 bedroom Modern Mediterran...,0.8539399,-0.003799194,-0.1738998,SingleFamily,-0.0660818,-1.347132
4,,7415 O Donovan Rd,1.154834e-16,,-7.522674e-16,,Creston,,,Santa Margarita Elementary School,...,,Creston,1175000.0,Beautiful 200 acre ranch land with several pas...,1.565043e-16,3.484298e-18,-3.626773e-16,VacantLand,1.295279e-15,0.067517


In [19]:
train_features=torch.Tensor(all_features[numeric_features_name].iloc[:n_train].values)
test_features=torch.Tensor(all_features[numeric_features_name].iloc[n_train].values)
train_labels=torch.Tensor(train_data['Sold Price'])

In [47]:
loss=nn.MSELoss()

In [48]:
def log_rmse(net,features,labels):
    with torch.no_grad():
        clipped_pred=torch.max(net(features),torch.Tensor([1.]))
        rmse=torch.sqrt(2*(loss(clipped_pred.log(),labels.log())).mean())
    return rmse.item()

In [49]:
def train(net,train_features,train_labels,test_features,
         test_labels,num_epochs,lr,weight_decay,batch_size):
    train_l,test_l=[],[]
    dataset=torch.utils.data.TensorDataset(train_features,train_labels)
    train_iter=torch.utils.data.DataLoader(dataset,batch_size,shuffle=True)
    optimizer=optim.Adam(net.parameters(),lr,weight_decay=weight_decay)
    loss=nn.MSELoss()
    
    for i in range(num_epochs):
        for x,y in train_iter:
            l=loss(net(x),y)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
        train_l.append(log_rmse(net,train_features,train_labels))
        if test_labels is not None:
            test_l.append(log_rmse(net,test_features,test_labels))
    return train_l,test_labels

In [50]:
def get_k_fold_data(k,i,x,y):
    assert k>1
    fold_size=x.shape[0]//k
    x_train,y_train=None,None
    data_idx=list(range(x.shape[0]))
    np.random.shuffle(data_idx)
    for j in range(k):
        idx=slice(j*fold_size,(j+1) * fold_size)
        x_part,y_part=x[data_idx[idx],:],y[data_idx[idx]]
        
        if j==i:
            x_valid,y_valid=x_part,y_part
        elif x_train is None:
            x_train,y_train=x_part,y_part
        else:
            x_train=torch.cat((x_train,x_part))
            y_train=torch.cat((y_train,y_part))
    return x_train,y_train,x_valid,y_valid

In [51]:
def k_fold(k,x_train,y_train,num_epochs,lr,weight_decay,batch_size):
    train_l_sum,valid_l_sum=0,0
    for i in range(k):
        data=get_k_fold_data(k,i,x_train,y_train)
        net=get_net(x_train.shape[1])
        train_l,valid_l=train(net,*data,num_epochs,lr,weight_decay,batch_size)
        train_l_sum+=train_l[-1]
        valid_l_sum+=valid_l[-1]
        
        print('fold %d,train rmse %f,valid %f'  % (
        i+1,train_l[-1],valid_l[-1]))
    return train_l_sum,valid_l_sum

In [52]:
k,num_epochs,lr,weight_decay,batch_size=5,100,5,0,64

In [53]:
def get_net(feature_num):
    net=nn.Sequential(nn.Linear(feature_num,128),nn.ReLU(),nn.Linear(128,1))
    return net

In [54]:
# net=nn.Sequential(nn.Linear(train_features.shape[1],128),nn.ReLU(),nn.Linear(128,1))
train_l_sum,valid_l_sum=k_fold(k,train_features,train_labels,num_epochs,
                          lr,weight_decay,batch_size)
print('%d fold validation : avg train rmse %f, avg valid rmse %f' % (
k,train_l_sum/k,valid_l_sum/k))

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


RuntimeError: [enforce fail at ..\c10\core\CPUAllocator.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 5760202816 bytes. Buy new RAM!