In [None]:
import pandas as pd
import numpy as np
import torch
import torchvision
from torch import nn,optim
import matplotlib.pyplot as plt
%matplotlib auto

In [2]:
train_data=pd.read_csv(r'F:\study\ml\california-house-prices\train.csv')
test_data=pd.read_csv(r'F:\study\ml\california-house-prices\test.csv')

In [3]:
train_data.shape

(47439, 41)

In [4]:
test_data.shape

(31626, 40)

In [5]:
train_data.head()

Unnamed: 0,Id,Address,Sold Price,Summary,Type,Year built,Heating,Cooling,Parking,Lot,...,Parking features,Tax assessed value,Annual tax amount,Listed On,Listed Price,Last Sold On,Last Sold Price,City,Zip,State
0,0,540 Pine Ln,3825000.0,"540 Pine Ln, Los Altos, CA 94022 is a single f...",SingleFamily,1969.0,"Heating - 2+ Zones, Central Forced Air - Gas","Multi-Zone, Central AC, Whole House / Attic Fan","Garage, Garage - Attached, Covered",1.0,...,"Garage, Garage - Attached, Covered",886486.0,12580.0,2019-10-24,4198000.0,,,Los Altos,94022,CA
1,1,1727 W 67th St,505000.0,"HURRY, HURRY.......Great house 3 bed and 2 bat...",SingleFamily,1926.0,Combination,"Wall/Window Unit(s), Evaporative Cooling, See ...","Detached Carport, Garage",4047.0,...,"Detached Carport, Garage",505000.0,6253.0,2019-10-16,525000.0,2019-08-30,328000.0,Los Angeles,90047,CA
2,2,28093 Pine Ave,140000.0,'THE PERFECT CABIN TO FLIP! Strawberry deligh...,SingleFamily,1958.0,Forced air,,0 spaces,9147.0,...,,49627.0,468.0,2019-08-25,180000.0,,,Strawberry,95375,CA
3,3,10750 Braddock Dr,1775000.0,Rare 2-story Gated 5 bedroom Modern Mediterran...,SingleFamily,1947.0,Central,Central Air,"Detached Carport, Driveway, Garage - Two Door",,...,"Detached Carport, Driveway, Garage - Two Door",1775000.0,20787.0,2019-10-24,1895000.0,2016-08-30,1500000.0,Culver City,90230,CA
4,4,7415 O Donovan Rd,1175000.0,Beautiful 200 acre ranch land with several pas...,VacantLand,,,,0 spaces,,...,,,,2019-06-07,1595000.0,2016-06-27,900000.0,Creston,93432,CA


In [6]:
all_features=pd.concat((train_data.iloc[:,1:-1],test_data.iloc[:,1]))

In [7]:
numeric_features=all_features.dtypes[all_features.dtypes != 'object'].index

In [8]:
numeric_features

Index(['Annual tax amount', 'Bathrooms', 'Elementary School Distance',
       'Elementary School Score', 'Full bathrooms', 'Garage spaces',
       'High School Distance', 'High School Score', 'Last Sold Price',
       'Listed Price', 'Lot', 'Middle School Distance', 'Middle School Score',
       'Sold Price', 'Tax assessed value', 'Total interior livable area',
       'Total spaces', 'Year built', 'Zip'],
      dtype='object')

In [9]:
numeric_features_name=numeric_features.tolist()
numeric_features_name

['Annual tax amount',
 'Bathrooms',
 'Elementary School Distance',
 'Elementary School Score',
 'Full bathrooms',
 'Garage spaces',
 'High School Distance',
 'High School Score',
 'Last Sold Price',
 'Listed Price',
 'Lot',
 'Middle School Distance',
 'Middle School Score',
 'Sold Price',
 'Tax assessed value',
 'Total interior livable area',
 'Total spaces',
 'Year built',
 'Zip']

In [10]:
numeric_features_name.remove('Sold Price')

In [11]:
numeric_features_name

['Annual tax amount',
 'Bathrooms',
 'Elementary School Distance',
 'Elementary School Score',
 'Full bathrooms',
 'Garage spaces',
 'High School Distance',
 'High School Score',
 'Last Sold Price',
 'Listed Price',
 'Lot',
 'Middle School Distance',
 'Middle School Score',
 'Tax assessed value',
 'Total interior livable area',
 'Total spaces',
 'Year built',
 'Zip']

In [12]:
all_features[numeric_features].isna().sum()

Annual tax amount              35936
Bathrooms                      35091
Elementary School Distance     36368
Elementary School Score        36522
Full bathrooms                 39491
Garage spaces                  32543
High School Distance           36627
High School Score              36845
Last Sold Price                49392
Listed Price                   31626
Lot                            45807
Middle School Distance         48330
Middle School Score            48331
Sold Price                     31626
Tax assessed value             35278
Total interior livable area    34152
Total spaces                   32542
Year built                     32671
Zip                            31626
dtype: int64

In [13]:
all_features[numeric_features_name]=all_features[numeric_features_name].apply(lambda x : (x-x.mean()) / x.std())

In [14]:
all_features[numeric_features_name].mean()

Annual tax amount              1.154834e-16
Bathrooms                     -7.522674e-16
Elementary School Distance    -1.295063e-14
Elementary School Score       -1.860748e-15
Full bathrooms                 5.441759e-16
Garage spaces                 -6.411489e-18
High School Distance          -5.113973e-15
High School Score             -2.770104e-15
Last Sold Price               -4.286326e-15
Listed Price                   7.066880e-17
Lot                           -1.141437e-16
Middle School Distance        -1.888407e-15
Middle School Score            1.745464e-15
Tax assessed value             1.565043e-16
Total interior livable area    3.484298e-18
Total spaces                  -3.626773e-16
Year built                     1.295279e-15
Zip                            9.063299e-16
dtype: float64

In [15]:
all_features[numeric_features_name]=all_features[numeric_features_name].fillna(all_features[numeric_features_name].mean())

In [16]:
n_train=train_data.shape[0]

In [17]:
all_features.head()

Unnamed: 0,0,Address,Annual tax amount,Appliances included,Bathrooms,Bedrooms,City,Cooling,Cooling features,Elementary School,...,Parking features,Region,Sold Price,Summary,Tax assessed value,Total interior livable area,Total spaces,Type,Year built,Zip
0,,540 Pine Ln,0.1889303,"Dishwasher, Dryer, Garbage disposal, Microwave...",-1.981522,"Ground Floor Bedroom, Master Bedroom on Ground...",Los Altos,"Multi-Zone, Central AC, Whole House / Attic Fan",Central,Santa Rita Elementary School,...,"Garage, Garage - Attached, Covered",Los Altos,3825000.0,"540 Pine Ln, Los Altos, CA 94022 is a single f...",0.08652149,-0.006935771,-0.1738998,SingleFamily,0.0848073,0.32818
1,,1727 W 67th St,-0.2667658,,-0.2991593,3,Los Angeles,"Wall/Window Unit(s), Evaporative Cooling, See ...","Wall/Window Unit(s), Evaporative Cooling, See ...",Raymond Avenue Elementary School,...,"Detached Carport, Garage",Los Angeles,505000.0,"HURRY, HURRY.......Great house 3 bed and 2 bat...",-0.2429718,-0.005889444,-0.06293187,SingleFamily,-0.2101123,-1.427982
2,,28093 Pine Ave,-0.6834248,,0.5420218,2,Strawberry,,,,...,,Strawberry,140000.0,'THE PERFECT CABIN TO FLIP! Strawberry deligh...,-0.636282,-0.005553082,-0.1738998,SingleFamily,0.009362749,0.925937
3,,10750 Braddock Dr,0.7800315,Dishwasher,0.5420218,5,Culver City,Central Air,Central Air,Farragut Elementary School,...,"Detached Carport, Driveway, Garage - Two Door",Culver City,1775000.0,Rare 2-story Gated 5 bedroom Modern Mediterran...,0.8539399,-0.003799194,-0.1738998,SingleFamily,-0.0660818,-1.347132
4,,7415 O Donovan Rd,1.154834e-16,,-7.522674e-16,,Creston,,,Santa Margarita Elementary School,...,,Creston,1175000.0,Beautiful 200 acre ranch land with several pas...,1.565043e-16,3.484298e-18,-3.626773e-16,VacantLand,1.295279e-15,0.067517


In [18]:
all_features[numeric_features_name].head()

Unnamed: 0,Annual tax amount,Bathrooms,Elementary School Distance,Elementary School Score,Full bathrooms,Garage spaces,High School Distance,High School Score,Last Sold Price,Listed Price,Lot,Middle School Distance,Middle School Score,Tax assessed value,Total interior livable area,Total spaces,Year built,Zip
0,0.1889303,-1.981522,-0.3225956,0.6081615,5.441759e-16,-0.1664093,-0.308768,0.9400138,-4.286326e-15,1.096403,-0.01973466,-1.888407e-15,1.745464e-15,0.08652149,-0.006935771,-0.1738998,0.0848073,0.32818
1,-0.2667658,-0.2991593,-0.151096,-1.293567,-0.09857719,-0.05485591,-0.308768,-2.083096,-0.4073798,-0.300868,-0.01939537,-0.2402037,-1.65631,-0.2429718,-0.005889444,-0.06293187,-0.2101123,-1.427982
2,-0.6834248,0.5420218,-1.295063e-14,-1.860748e-15,-1.136654,-0.1664093,2.138314,-2.770104e-15,-4.286326e-15,-0.432112,-0.0189677,-1.888407e-15,1.745464e-15,-0.636282,-0.005553082,-0.1738998,0.009362749,0.925937
3,0.7800315,0.5420218,-0.4083454,1.559026,0.9394999,-0.1664093,-0.614653,0.9400138,0.5876091,0.220303,-1.141437e-16,-0.6056297,0.8402341,0.8539399,-0.003799194,-0.1738998,-0.0660818,-1.347132
4,1.154834e-16,-7.522674e-16,3.150272,0.1327294,5.441759e-16,-6.411489e-18,2.277353,-0.06768939,0.07822912,0.106178,-1.141437e-16,3.454659,-0.1583836,1.565043e-16,3.484298e-18,-3.626773e-16,1.295279e-15,0.067517


In [19]:
train_features=torch.Tensor(all_features[numeric_features_name].iloc[:n_train,:].values)
test_features=torch.Tensor(all_features[numeric_features_name].iloc[n_train:,:].values)
train_labels=torch.Tensor(train_data['Sold Price'].values).reshape(-1,1)

In [20]:
loss=nn.MSELoss()

In [21]:
train_features.shape

torch.Size([47439, 18])

In [22]:
test_features.shape

torch.Size([31626, 18])

In [23]:
train_labels.shape

torch.Size([47439, 1])

In [24]:
def log_rmse(net,features,labels):
    clipped_preds=torch.clamp(net(features),1,float('inf'))
    rmse=torch.sqrt(loss(torch.log(clipped_preds),torch.log(labels)))
    return rmse.item()

In [25]:
def train_k_fold_data(net,num_epochs,lr,train_features,train_labels,test_features,test_labels,batch_size,montum,wd):
#     net=nn.Linear(train_features.shape[1],1)
#     net=nn.Sequential(nn.Linear(train_features.shape[1],1))
    loss=nn.MSELoss()
#     optimizer=optim.SGD(net.parameters(),lr=lr,momentum=montum,weight_decay=wd)
    optimizer=optim.Adam(net.parameters(),lr=lr,weight_decay=wd)
    dataset=torch.utils.data.TensorDataset(train_features,train_labels)
    data_iter=torch.utils.data.DataLoader(dataset,batch_size,shuffle=True)
    train_l,test_l=[],[]
    
    min_test_loss=10000
    early_stop_cnt=0
    train_loss,test_loss=0,0
    
    for e in range(num_epochs):
        net.train()
        for x,y in data_iter:
            l=loss(net(x),y)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
#         if (e+1) %1000==0 and test_features is not  None:
        net.eval()
        with torch.no_grad():
            test_loss=log_rmse(net,test_features,test_labels)
            if test_loss<min_test_loss:
                min_test_loss=test_loss
#                 test_l.append(test_loss)
                train_loss=log_rmse(net,train_features,train_labels)
                test_l.append(test_loss)
                train_l.append(train_loss)
                print('epoch = %d train_loss : %f , test loss : %f' % (e+1,train_loss,test_loss))
                early_stop_cnt=0
            else:
                early_stop_cnt+=1
        if early_stop_cnt > 500:
            
            break
                
#             net.eval()
#             with torch.no_grad():
#                 train_l.append(loss(net(train_features),train_labels).item())
#                 test_l.append(loss(net(test_features),test_labels).item())
# #                 print('epoch ',(e+1),'train loss : ',train_l[-1],'test loss : ',test_l[-1])
    print('train log loss: ',train_loss)
    print('test log loss: ',test_loss)
    return train_l,test_l

In [26]:
def get_kfold_data(k,j,x,y,random_state=13):
    assert k>=1, 'k must >=1'
    fold_size = x.shape[0] // k
    x_train,y_train=None,None
    row_list=list(range(x.shape[0]))
    np.random.seed(random_state)
    np.random.shuffle(row_list)
    for i in range(k):
        idx=slice(fold_size*i,fold_size*(i+1))
        x_part,y_part=x[row_list[idx],:],y[row_list[idx],:]
        if i==j:
            x_val,y_val=x_part,y_part
        elif x_train is None:
            x_train,y_train=x_part,y_part
        else:
            x_train=torch.cat((x_train,x_part))
            y_train=torch.cat((y_train,y_part))
    return x_train,y_train,x_val,y_val

In [27]:
def train_kfold(net1,num_epochs,lr,k,x_data,y_data,batch_size,montum,wd):
    train_sum_l,test_sum_l=[],[]
    train_l,test_l=[],[]
#     net=nn.Linear(x_data.shape[1])
    for j in range(k):
        net1=get_net()
#         for p in net1.parameters():
#             torch.nn.init.normal_(p)
        net=net1
        data=get_kfold_data(k,j,x_data,y_data)
        train_l,test_l=train_k_fold_data(net,num_epochs,lr,*data,batch_size,montum,wd)
        train_sum_l.append(train_l[-1])
        test_sum_l.append(test_l[-1])
        print('--------------------------------------------------------------------------')
        print('fold %d,train loss mean : %f,test loss : %f' % (j+1,train_l[-1],test_l[-1]))
        print('-------------------------------------------------------------------------')
    print('%d fold ,total train loss mean : %f,total test loss mean : %f ' % (k,np.mean(train_sum_l),np.mean(test_sum_l)))
    print('-----------------------------------------------------------------------------')

In [None]:
def get_net():
    return nn.Sequential(nn.Linear(train_features.shape[1],256),nn.ReLU(),nn.Linear(256,128),nn.ReLU(),nn.Linear(128,64),nn.ReLU(),nn.Linear(64,32),
                        nn.ReLU(),nn.Linear(32,1))
net1=get_net()
train_kfold(net1,100000,0.0001,5,train_features,train_labels,256,0,0)

epoch = 1 train_loss : 12.633021 , test loss : 12.640697
epoch = 2 train_loss : 9.657457 , test loss : 9.664833
epoch = 3 train_loss : 7.819559 , test loss : 7.827017
epoch = 4 train_loss : 6.505435 , test loss : 6.513237
epoch = 5 train_loss : 5.475049 , test loss : 5.483235
epoch = 6 train_loss : 4.666697 , test loss : 4.675470
epoch = 7 train_loss : 3.986810 , test loss : 3.996228
epoch = 8 train_loss : 3.424311 , test loss : 3.434443
epoch = 9 train_loss : 2.939010 , test loss : 2.949781
epoch = 10 train_loss : 2.526063 , test loss : 2.537444
epoch = 11 train_loss : 2.173831 , test loss : 2.185898
epoch = 12 train_loss : 1.869921 , test loss : 1.882700
epoch = 13 train_loss : 1.612511 , test loss : 1.625983
epoch = 14 train_loss : 1.401232 , test loss : 1.415331
epoch = 15 train_loss : 1.228104 , test loss : 1.242701
epoch = 16 train_loss : 1.093461 , test loss : 1.108346
epoch = 17 train_loss : 0.990293 , test loss : 1.005203
epoch = 18 train_loss : 0.913469 , test loss : 0.928120

epoch = 196 train_loss : 0.487265 , test loss : 0.482373
epoch = 203 train_loss : 0.487244 , test loss : 0.482298
epoch = 204 train_loss : 0.487166 , test loss : 0.482213
epoch = 205 train_loss : 0.486206 , test loss : 0.481279
epoch = 213 train_loss : 0.485964 , test loss : 0.480988
epoch = 224 train_loss : 0.485491 , test loss : 0.480500
epoch = 225 train_loss : 0.485089 , test loss : 0.480115
epoch = 226 train_loss : 0.484773 , test loss : 0.479817
epoch = 233 train_loss : 0.484576 , test loss : 0.479609
epoch = 234 train_loss : 0.484477 , test loss : 0.479520
epoch = 238 train_loss : 0.484391 , test loss : 0.479406
epoch = 239 train_loss : 0.484168 , test loss : 0.479191
epoch = 240 train_loss : 0.483495 , test loss : 0.478564
epoch = 248 train_loss : 0.483442 , test loss : 0.478484
epoch = 253 train_loss : 0.483156 , test loss : 0.478211
epoch = 254 train_loss : 0.483028 , test loss : 0.478067
epoch = 255 train_loss : 0.482886 , test loss : 0.477919
epoch = 258 train_loss : 0.4828

epoch = 739 train_loss : 0.448533 , test loss : 0.443278
epoch = 740 train_loss : 0.448439 , test loss : 0.443187
epoch = 742 train_loss : 0.447812 , test loss : 0.442603
epoch = 746 train_loss : 0.447755 , test loss : 0.442514
epoch = 747 train_loss : 0.446978 , test loss : 0.441787
epoch = 748 train_loss : 0.446423 , test loss : 0.441201
epoch = 751 train_loss : 0.446101 , test loss : 0.440872
epoch = 752 train_loss : 0.445452 , test loss : 0.440244
epoch = 759 train_loss : 0.444356 , test loss : 0.439129
epoch = 761 train_loss : 0.444111 , test loss : 0.438923
epoch = 764 train_loss : 0.443127 , test loss : 0.437947
epoch = 765 train_loss : 0.443140 , test loss : 0.437873
epoch = 769 train_loss : 0.442202 , test loss : 0.436981
epoch = 774 train_loss : 0.441591 , test loss : 0.436345
epoch = 775 train_loss : 0.441123 , test loss : 0.435901
epoch = 776 train_loss : 0.440905 , test loss : 0.435673
epoch = 777 train_loss : 0.440539 , test loss : 0.435321
epoch = 781 train_loss : 0.4393

epoch = 1059 train_loss : 0.343911 , test loss : 0.332690
epoch = 1062 train_loss : 0.342255 , test loss : 0.331335
epoch = 1063 train_loss : 0.339983 , test loss : 0.328916
epoch = 1067 train_loss : 0.339766 , test loss : 0.328781
epoch = 1068 train_loss : 0.338222 , test loss : 0.327217
epoch = 1074 train_loss : 0.336790 , test loss : 0.325734
epoch = 1078 train_loss : 0.334059 , test loss : 0.323243
epoch = 1086 train_loss : 0.333504 , test loss : 0.322646
epoch = 1088 train_loss : 0.332274 , test loss : 0.321420
epoch = 1089 train_loss : 0.331835 , test loss : 0.321000
epoch = 1092 train_loss : 0.331462 , test loss : 0.320774
epoch = 1093 train_loss : 0.330812 , test loss : 0.320107
epoch = 1094 train_loss : 0.328942 , test loss : 0.318117
epoch = 1099 train_loss : 0.327502 , test loss : 0.316716
epoch = 1106 train_loss : 0.324489 , test loss : 0.314129
epoch = 1110 train_loss : 0.324107 , test loss : 0.313596
epoch = 1111 train_loss : 0.322604 , test loss : 0.312153
epoch = 1123 t

In [None]:
def get_net():
    return nn.Sequential(nn.Linear(train_features.shape[1],128),nn.ReLU(),nn.Linear(128,64),nn.ReLU(),nn.Linear(64,32),nn.ReLU(),nn.Linear(32,1))
net1=get_net()
train_kfold(net1,100000,0.0002,5,train_features,train_labels,128,0,0)