In [1]:
import pandas as pd
import numpy as np
import sklearn.preprocessing
import sklearn.metrics
import torch
from torch.autograd import Variable

In [3]:
### read in 1st X rows of big data
data = pd.read_csv('../data/assembled_data.csv', nrows = 2000000-380)
data.head()

KeyboardInterrupt: 

In [3]:
### read in census data as example static variables
census = pd.read_csv('../data/sensor_locations_with_census.csv')
site_lat_long = census.loc[:, ('Site_ID', 'Lat', 'Lon')]
census_reduced = census.iloc[:, 8:-1]
census_reduced = pd.concat([site_lat_long, census_reduced], axis = 1)
census_reduced.head()

Unnamed: 0,Site_ID,Lat,Lon,Population,Land_Sq_Mi,Population_Density,White,Black,Native,Asian,...,Age_60_p,Age_70_p,Income_0_p,Income_25_p,Income_50_p,Income_75_p,Income_100_p,Income_150_p,Income_200_p,Family_p
0,1,30.49748,-87.88026,32285.0,73.738,437.8,28378.0,1901.0,43.0,213.0,...,0.142,0.142,0.061,0.084,0.073,0.047,0.054,0.022,0.03,0.716
1,2,33.28493,-85.80361,5195.0,144.453,36.0,4457.0,504.0,42.0,0.0,...,0.114,0.136,0.171,0.124,0.061,0.03,0.025,0.002,0.002,0.687
2,3,34.76262,-87.6381,16861.0,55.916,301.5,13570.0,2666.0,110.0,64.0,...,0.107,0.134,0.095,0.098,0.084,0.052,0.05,0.018,0.005,0.706
3,4,34.28857,-85.96986,9691.0,78.702,123.1,6789.0,4.0,52.0,22.0,...,0.054,0.111,0.109,0.086,0.056,0.022,0.017,0.002,0.002,0.738
4,5,33.99149,-85.99265,17106.0,67.416,253.7,10507.0,5395.0,9.0,16.0,...,0.134,0.102,0.154,0.114,0.069,0.026,0.024,0.007,0.008,0.607


In [4]:
data_final = data.merge(right = census_reduced, left_on = 'site', right_on = 'Site_ID', how = 'left')
data_final = data_final.drop(['Site_ID'], axis = 1)
data_final.head()

Unnamed: 0,site,year,date,MonitorData,GFEDFireCarbon,USElevation_dsc10000,USElevation_max100,USElevation_max10000,USElevation_mea100,USElevation_mea10000,...,Age_60_p,Age_70_p,Income_0_p,Income_25_p,Income_50_p,Income_75_p,Income_100_p,Income_150_p,Income_200_p,Family_p
0,1,2000,2000-01-01,,0.001167,26.790501,43,30.143499,36.0,26.504299,...,0.142,0.142,0.061,0.084,0.073,0.047,0.054,0.022,0.03,0.716
1,1,2000,2000-01-02,,0.001236,26.790501,43,30.143499,36.0,26.504299,...,0.142,0.142,0.061,0.084,0.073,0.047,0.054,0.022,0.03,0.716
2,1,2000,2000-01-03,,0.001305,26.790501,43,30.143499,36.0,26.504299,...,0.142,0.142,0.061,0.084,0.073,0.047,0.054,0.022,0.03,0.716
3,1,2000,2000-01-04,,0.001373,26.790501,43,30.143499,36.0,26.504299,...,0.142,0.142,0.061,0.084,0.073,0.047,0.054,0.022,0.03,0.716
4,1,2000,2000-01-05,,0.001442,26.790501,43,30.143499,36.0,26.504299,...,0.142,0.142,0.061,0.084,0.073,0.047,0.054,0.022,0.03,0.716


In [5]:
def split_sizes_site(sites):
    """Gets the split sizes to split dataset by site for a dataset with multiple sites.
    
    Arguments:
        sites (array): array indicating the site of each row 
    """
    split_sizes = []
    for i in range(len(sites)):
        if i == 0:
            site = sites[i]
            split_sizes.append(i)
        elif site != sites[i]:
            site = sites[i]
            split_sizes.append(i - (len(split_sizes)-1)*split_sizes[len(split_sizes)-1])
        elif i == len(sites)-1:
            split_sizes.append((i+1) - (len(split_sizes)-1)*split_sizes[len(split_sizes)-1])
    
    split_sizes = split_sizes[1:]
    return split_sizes


def split_data(tensor, split_sizes, dim=0):
    """Splits the tensor according to chunks of split_sizes.
    
    Arguments:
        tensor (Tensor): tensor to split.
        split_sizes (list(int)): sizes of chunks
        dim (int): dimension along which to split the tensor.
    """
    if dim < 0:
        dim += tensor.dim()
    
    dim_size = tensor.size(dim)
    if dim_size != torch.sum(torch.Tensor(split_sizes)):
        raise KeyError("Sum of split sizes exceeds tensor dim")
    
    splits = torch.cumsum(torch.Tensor([0] + split_sizes), dim=0)[:-1]
    return tuple(tensor.narrow(int(dim), int(start), int(length)) 
        for start, length in zip(splits, split_sizes))


def pad_stack_splits(site_tuple, split_sizes, x_or_y):
    """Zero (x) or nan (y) pads site data sequences and stacks them into a matrix.
    
    Arguments:
        site_tuple (tuple): tuple of site data sequences to pad and stack
        split_sizes (array): lengths of site data sequences
        x_or_y (string): 'x' or 'y' indicating whether to pad and stack x or y
    """
    data_padded_list = []
    for sequence in site_tuple:
        max_sequence_length = torch.max(torch.from_numpy(split_sizes))

        if x_or_y == 'x':
            zero_padding_rows = torch.zeros(max_sequence_length - sequence.size()[0], sequence.size()[1])
            data_padded_list.append(torch.cat((sequence, zero_padding_rows), dim = 0))
            
        elif x_or_y == 'y':
            nan_padding = torch.zeros(max_sequence_length - sequence.size()[0]).double() * np.nan
            data_padded_list.append(torch.cat((sequence, nan_padding), dim = 0))
            
    return torch.stack(data_padded_list, dim = 0)


def get_monitorData_indices(sequence):
    """Gets indices for a site data sequence for which there is an output for MonitorData.
    
    Arguments:
        sequence (tensor): sequence of MonitorData outputs for a given site, including NaNs
    """
    response_indicator_vec = sequence == sequence
    num_responses = torch.sum(response_indicator_vec)
    response_indices = torch.sort(response_indicator_vec, dim = 0, descending = True)[1][:num_responses]
    ordered_response_indices = torch.sort(response_indices)[0]
    return ordered_response_indices


def r2(model, batch_size, x_stack_vary, x_tuple_static, y_tuple):
    """Computes R-squared
    
    Arguments:
        model (torch): model to test
        batch_size (int): to determine how many sequences to read in at a time
        x_stack (tensor): stack of site data sequences
        y_tuple (tuple): tuple of true y values by sequence, including NaNs
    
    """
    y = []
    pred = []
    
    # get number of batches
    if x_stack_vary.size()[0] % batch_size != 0:
        num_batches = int(np.floor(x_stack_vary.size()[0]/batch_size) + 1)
    else:
        num_batches = int(x_stack_vary.size()[0]/batch_size)
        
    for batch in range(num_batches):
        # get x and y for this batch
        x_stack_batch_vary = x_stack_vary[batch_size * batch:batch_size * (batch+1)]
        x_tuple_batch_static = x_tuple_static[batch_size * batch:batch_size * (batch+1)]
        y_tuple_nans = y_tuple[batch_size * batch:batch_size * (batch+1)]
        
        # get indices for monitor data and actual monitor data
        y_by_site = []
        x_static_by_site = []
        y_ind_by_site = []
        for i in range(len(y_tuple_nans)):
            y_ind = get_monitorData_indices(y_tuple_nans[i])
            y_by_site.append(y_tuple_nans[i][y_ind])
            y_ind_by_site.append(y_ind)
            x_static_by_site.append(x_tuple_batch_static[i][y_ind])
        y_batch = list(Variable(torch.cat(y_by_site, dim=0)).data.numpy())
        x_batch_static = Variable(torch.cat(x_static_by_site, dim=0)).float()
        
        # get model output
        pred_batch = list(cnn(x_stack_batch_vary, x_batch_static, y_ind_by_site).data.numpy())
        
        # concatenate new predictions with ones from previous batches
        y += y_batch
        pred += pred_batch
        
    return sklearn.metrics.r2_score(y, pred)

In [19]:
### CNN model architecture
class CNN(torch.nn.Module):
    def __init__(self, input_size, hidden_size, num_static, kernel_size, padding):
        super(CNN, self).__init__()
        
        self.conv1d = torch.nn.Conv1d(in_channels=input_size, out_channels=hidden_size, kernel_size=kernel_size, padding=padding, bias=bias)
        self.norm1 = torch.nn.BatchNorm1d(num_features = hidden_size)
        self.tanh = torch.nn.Tanh()
        #self.norm2 = torch.nn.BatchNorm1d(num_features = hidden_size + num_static)
        self.linear = torch.nn.Linear(in_features = hidden_size + num_static, out_features = 1, bias = True)
        
    def forward(self, input_vary, input_static, y_ind_by_site):
        hidden = self.conv1d(input_vary)
        hidden = self.norm1(hidden)
        hidden = self.tanh(hidden)
        
        hidden_w_response = []
        
        for i in range(hidden.size()[0]):
            hidden_w_response.append(torch.transpose(hidden[i][:, y_ind_by_site[i]], 0, 1)) 
        hidden_w_response = torch.cat(hidden_w_response, dim = 0)
        
        hidden_w_response__input_static = torch.cat([hidden_w_response, input_static], dim = 1)
        
        #hidden = self.norm2(hidden_w_response__input_static)
        output = self.linear(hidden_w_response__input_static)

        return output

In [7]:
### train/val/test split by site id
np.random.seed(1)

# get sites for val/test data
val_test_sites = np.random.choice(np.unique(data_final['site'].values), round(len(np.unique(data_final['site'].values))/4), replace = False)

# get sites for test data
test_sites = np.random.choice(np.unique(val_test_sites), round(len(np.unique(val_test_sites))/2), replace = False)

# train sites/rows and x/y split
train = data_final[~data_final['site'].isin(val_test_sites)]
train_x = train.iloc[:, 4:]
train_y = train.loc[:, 'MonitorData']
train_sites = train.loc[:, 'site']

# val sites/rows and x/y split
val = data_final[(data_final['site'].isin(val_test_sites)) & (~data_final['site'].isin(test_sites))]
val_x = val.iloc[:, 4:]
val_y = val.loc[:, 'MonitorData']
val_sites = val.loc[:, 'site']

# test sites/rows and x/y split
test = data_final[data_final['site'].isin(test_sites)]
test_x = test.iloc[:, 4:]
test_y = test.loc[:, 'MonitorData']
test_sites = test.loc[:, 'site']

In [8]:
### impute mean
mean_imputer = sklearn.preprocessing.Imputer(strategy = 'mean')
train_x_imp = mean_imputer.fit_transform(train_x)
val_x_imp = mean_imputer.transform(val_x)
test_x_imp = mean_imputer.transform(test_x)

In [9]:
### standardize features
standardizer = sklearn.preprocessing.StandardScaler(with_mean = True, with_std = True)
train_x_imp_std = standardizer.fit_transform(train_x_imp)
val_x_imp_std = standardizer.transform(val_x_imp)
test_x_imp_std = standardizer.transform(test_x_imp)

In [10]:
### break x into varying and static variables
train_x_imp_std_vary = train_x_imp_std[:, :112]
train_x_imp_std_static = train_x_imp_std[:, 112:]

val_x_imp_std_vary = val_x_imp_std[:, :112]
val_x_imp_std_static = val_x_imp_std[:, 112:]

test_x_imp_std_vary = test_x_imp_std[:, :112]
test_x_imp_std_static = test_x_imp_std[:, 112:]

In [11]:
### get split sizes for TRAIN data (splitting by site)
train_split_sizes = split_sizes_site(train_sites.values)

### get tuples by site
train_x_std_tuple_vary = split_data(torch.from_numpy(train_x_imp_std_vary).float(), train_split_sizes, dim = 0)
train_x_std_tuple_static = split_data(torch.from_numpy(train_x_imp_std_static).float(), train_split_sizes, dim = 0)
train_y_tuple = split_data(torch.from_numpy(train_y.values), train_split_sizes, dim = 0)


### get site sequences stacked into matrix to go through CNN
train_x_std_stack_vary = pad_stack_splits(train_x_std_tuple_vary, np.array(train_split_sizes), 'x')
train_x_std_stack_vary = Variable(torch.transpose(train_x_std_stack_vary, 1, 2))


### get split sizes for VALIDATION data (splitting by site)
val_split_sizes = split_sizes_site(val_sites.values)

### get tuples by site
val_x_std_tuple_vary = split_data(torch.from_numpy(val_x_imp_std_vary).float(), val_split_sizes, dim = 0)
val_x_std_tuple_static = split_data(torch.from_numpy(val_x_imp_std_static).float(), val_split_sizes, dim = 0)
val_y_tuple = split_data(torch.from_numpy(val_y.values), val_split_sizes, dim = 0)

### get site sequences stacked into matrix to go through CNN
val_x_std_stack_vary = pad_stack_splits(val_x_std_tuple_vary, np.array(val_split_sizes), 'x')
val_x_std_stack_vary = Variable(torch.transpose(val_x_std_stack_vary, 1, 2))


### get split sizes for TEST data (splitting by site)
test_split_sizes = split_sizes_site(test_sites.values)

### get tuples by site
test_x_std_tuple_vary = split_data(torch.from_numpy(test_x_imp_std_vary).float(), test_split_sizes, dim = 0)
test_x_std_tuple_static = split_data(torch.from_numpy(test_x_imp_std_static).float(), test_split_sizes, dim = 0)
test_y_tuple = split_data(torch.from_numpy(test_y.values), test_split_sizes, dim = 0)

### get site sequences stacked into matrix to go through CNN
test_x_std_stack_vary = pad_stack_splits(test_x_std_tuple_vary, np.array(test_split_sizes), 'x')
test_x_std_stack_vary = Variable(torch.transpose(test_x_std_stack_vary, 1, 2))

In [22]:
# CNN parameters
input_size = train_x_imp_std_vary.shape[1]
hidden_size = 25
kernel_size = 3
num_static = train_x_imp_std_static.shape[1]
padding = 1
bias = True

# instantiate model
cnn = CNN(input_size, hidden_size, num_static, kernel_size, padding)

# Loss function
mse_loss = torch.nn.MSELoss(size_average=True)

# Optimizer
lr = 0.0000001
weight_decay = 0.000001
optimizer = torch.optim.SGD(cnn.parameters(), lr=lr, weight_decay=weight_decay)

In [23]:
num_epochs = 10000
batch_size = 15

# get number of batches
if train_x_std_stack_vary.size()[0] % batch_size != 0:
    num_batches = int(np.floor(train_x_std_stack_vary.size()[0]/batch_size) + 1)
else:
    num_batches = int(train_x_std_stack_vary.size()[0]/batch_size)
    
    
for epoch in range(num_epochs):
    epoch_loss = 0
    
    for batch in range(num_batches):
        # get x and y for this batch
        x_stack_batch_vary = train_x_std_stack_vary[batch_size * batch:batch_size * (batch+1)]
        x_tuple_batch_static = train_x_std_tuple_static[batch_size * batch:batch_size * (batch+1)]
        y_tuple_nans = train_y_tuple[batch_size * batch:batch_size * (batch+1)]
        
        # get indices for monitor data and actual monitor data
        y_by_site = []
        x_static_by_site = []
        y_ind_by_site = []
        for i in range(len(y_tuple_nans)):
            y_ind = get_monitorData_indices(y_tuple_nans[i])
            y_by_site.append(y_tuple_nans[i][y_ind])
            y_ind_by_site.append(y_ind)
            x_static_by_site.append(x_tuple_batch_static[i][y_ind])
        y_batch = Variable(torch.cat(y_by_site, dim=0)).float()
        x_batch_static = Variable(torch.cat(x_static_by_site, dim=0)).float()
        
        # get model output
        pred_batch = cnn(x_stack_batch_vary, x_batch_static, y_ind_by_site)
        
        # compute loss, backprop, and update parameters
        loss_batch = mse_loss(pred_batch, y_batch)
        loss_batch.backward()
        optimizer.step()
        
        # accumulate loss over epoch
        epoch_loss += loss_batch.data[0]
        
    print(r2(cnn, batch_size, val_x_std_stack_vary, val_x_std_tuple_static, val_y_tuple))
    print(epoch_loss)

-1.69481926025
2908.594757080078
-1.6943237561
2908.1898880004883
-1.69351382731
2907.3789291381836
-1.69239009562
2906.16357421875
-1.69095339252
2904.550163269043
-1.68920479359
2902.529571533203
-1.68714561185
2900.118309020996
-1.68477738276
2897.3028259277344
-1.68210189601
2894.0870666503906
-1.67912116219
2890.486671447754
-1.67583742446
2886.4955444335938
-1.6722532228
2882.1145401000977
-1.66837127985
2877.3556747436523
-1.66419461763
2872.213668823242
-1.65972648052
2866.7030868530273
-1.65497034021
2860.8148498535156
-1.64992994235
2854.5731201171875
-1.64460922955
2847.966537475586
-1.63901236758
2841.006446838379
-1.63314373044
2833.7011795043945
-1.62700785403
2826.05379486084
-1.62060941297
2818.0746536254883
-1.61395319563
2809.76171875
-1.60704405077
2801.1268768310547
-1.59988692087
2792.1767349243164
-1.59248668915
2782.9169845581055
-1.58484827492
2773.357681274414
-1.57697645767
2763.505500793457
-1.56887594687
2753.368453979492
-1.5605513148
2742.947090148926
-1.5

0.0559215188366
1068.2627754211426
0.0781977376728
1044.1876792907715
0.0998410163751
1020.5391273498535
0.12081149772
997.3627395629883
0.141073066363
974.6974105834961
0.160592982262
952.5884132385254
0.179341371299
931.0674571990967
0.197292404768
910.1729602813721
0.214422585974
889.9318923950195
0.23071211272
870.3746013641357
0.246144349788
851.5278396606445
0.260705688265
833.4106101989746
0.27438503201
816.046558380127
0.287174913554
799.4483985900879
0.299070056917
783.6330642700195
0.310068236807
768.6131210327148
0.32016994889
754.3951969146729
0.329377593975
740.9857368469238
0.337696271253
728.3894100189209
0.345133315432
716.6086730957031
0.351698180116
705.6421127319336
0.357402161285
695.4885158538818
0.362258531082
686.144947052002
0.366282247869
677.599853515625
0.369489947011
669.8493614196777
0.371899557936
662.8813953399658
0.373530641742
656.6858291625977
0.374403811984
651.2485790252686
0.374540949344
646.5565567016602
0.373964893532
642.5930309295654
0.372699450

0.406840539664
693.637861251831
0.402567270091
698.6285743713379
0.398187599372
703.6648693084717
0.393716764618
708.7292709350586
0.389169468214
713.8081150054932
0.384561060867
718.8838634490967
0.37990642891
723.9401702880859
0.375220616727
728.96364402771
0.370518222202
733.936653137207
0.365813733979
738.8446159362793
0.36112147599
743.6722621917725
0.356455222977
748.4061660766602
0.351828467118
753.0295791625977
0.347254510946
757.5324172973633
0.342745829038
761.8997421264648
0.338314445592
766.1174755096436
0.333971730881
770.1754894256592
0.329728826736
774.0616149902344
0.325595632138
777.7654094696045
0.321581615312
781.278980255127
0.317695316799
784.5894794464111
0.31394478388
787.6921787261963
0.310336834291
790.578218460083
0.306878290236
793.2406520843506
0.303574218934
795.6752433776855
0.300428970108
797.8769664764404
0.297446419177
799.8393249511719
0.294628977664
801.564432144165
0.291978715679
803.0477085113525
0.289496652558
804.2892818450928
0.287182576163
805.2

0.346443526785
634.0768947601318
0.343826086238
633.6427783966064
0.341268333001
633.1950340270996
0.338781625275
632.7229080200195
0.336377433013
632.2237319946289
0.334066491059
631.6909217834473
0.331858629363
631.1198425292969
0.329763761821
630.5030403137207
0.327790742582
629.8402576446533
0.325947980437
629.1244430541992
0.324242457478
628.3553485870361
0.322681139547
627.5307769775391
0.321269635371
626.6478290557861
0.320012708937
625.7084369659424
0.318914301328
624.7085800170898
0.317977165704
623.6540298461914
0.317202990273
622.5446929931641
0.316592569286
621.3839244842529
0.316145889152
620.1758556365967
0.315861681904
618.9219989776611
0.315737454107
617.633279800415
0.315770305453
616.3106460571289
0.315956289921
614.9628372192383
0.316289892032
613.5957317352295
0.31676524076
612.2188148498535
0.317375116522
610.8415813446045
0.318111845493
609.4724941253662
0.318967347917
608.1196956634521
0.319932383212
606.7963047027588
0.320997000862
605.5092754364014
0.3221506458

0.399492033583
626.490083694458
0.397300896281
629.6771793365479
0.395263571608
632.726095199585
0.393386152873
635.626392364502
0.39167463813
638.3729343414307
0.390131493324
640.953239440918
0.388759160142
643.3627796173096
0.387559831121
645.5941352844238
0.386534898428
647.6431751251221
0.385682575648
649.5061702728271
0.385001261118
651.1823291778564
0.38448887042
652.669319152832
0.384142602196
653.966926574707
0.383957213422
655.0774478912354
0.383927389424
656.0019607543945
0.384046766541
656.7458477020264
0.38430854043
657.3120975494385
0.384705147578
657.7070121765137
0.385228242041
657.9371948242188
0.385868784673
658.0093250274658
0.386617542325
657.9318408966064
0.387464479762
657.7136344909668
0.388399336116
657.365514755249
0.389411287834
656.8954677581787
0.390489734678
656.3151569366455
0.391623961864
655.6366729736328
0.392802544408
654.8683815002441
0.394014868365
654.0260639190674
0.395250224598
653.1184406280518
0.396497527008
652.1580448150635
0.397746601241
651.1

0.420060361542
639.8546314239502
0.425762195948
633.9043636322021
0.431319891657
628.0777549743652
0.436694998543
622.4160995483398
0.441850965844
616.9561309814453
0.446752519
611.7356414794922
0.451365919363
606.7921333312988
0.455661094045
602.1569423675537
0.459608782728
597.8601360321045
0.463184351337
593.9315376281738
0.466364702438
590.3916511535645
0.469131468213
587.2653789520264
0.471468121849
584.5680065155029
0.47336252923
582.3148937225342
0.474806275531
580.5171604156494
0.47579454869
579.1780872344971
0.476326123643
578.3033809661865
0.47640354108
577.891809463501
0.476033401689
577.9364471435547
0.475225572529
578.4299716949463
0.473993944576
579.3598136901855
0.472355495699
580.7096176147461
0.470330853681
582.4591121673584
0.467943530223
584.5869197845459
0.465220109676
587.0658626556396
0.462190030748
589.8687191009521
0.458884624786
592.9638538360596
0.455338034993
596.3165493011475
0.451586451874
599.8940124511719
0.447666693181
603.6561508178711
0.443617520313
60

0.388761944172
529.5723075866699
0.394904173041
525.2239284515381
0.400859784338
521.1417293548584
0.4065958165
517.3533325195312
0.412080441331
513.8867416381836
0.417284005776
510.7658386230469
0.422179417388
508.01086616516113
0.426742031704
505.6437530517578
0.430949027071
503.6807460784912
0.434781372157
502.1361026763916
0.43822263044
501.02125358581543
0.441258743285
500.3447799682617
0.443879045967
500.11300468444824
0.446076855738
500.3271255493164
0.44784609258
500.9878158569336
0.449185345256
502.0934028625488
0.450095666547
503.635311126709
0.450580894427
505.6065845489502
0.45064668652
507.995153427124
0.450304028443
510.7860698699951
0.449564061531
513.9640731811523
0.448440546927
517.5094451904297
0.446950113777
521.399974822998
0.445111758853
525.6129417419434
0.442945392381
530.1234931945801
0.440473282199
534.9037914276123
0.437719132042
539.9240589141846
0.434707877151
545.1582355499268
0.431465594469
550.5718383789062
0.428019191791
556.1345272064209
0.42439611744
5

0.432772423452
595.0142688751221
0.429989643293
596.735912322998
0.427024742332
598.5876407623291
0.423886775896
600.5644569396973
0.420585492105
602.6547193527222
0.417132747551
604.8483467102051
0.413540834953
607.1381120681763
0.409822974714
609.5071725845337
0.405993973892
611.9443130493164
0.402068825202
614.4353380203247
0.398062826784
616.9687728881836
0.393992494928
619.5276126861572
0.389875267012
622.0972099304199
0.385728660465
624.6595773696899
0.381569729341
627.2029552459717
0.377416884556
629.7092638015747
0.37328857001
632.1625213623047
0.369202211407
634.5451345443726
0.365175504305
636.8448905944824
0.361228184516
639.0436878204346
0.357376326886
641.1244239807129
0.35363806778
643.0765533447266
0.350029492061
644.8842306137085
0.34656706557
646.5332355499268
0.34326622412
648.0118217468262
0.340141341906
649.305643081665
0.337206912307
650.4070510864258
0.334475490182
651.3039150238037
0.331960014705
651.9870719909668
0.329670149547
652.4494018554688
0.327615570152
6

0.343394693841
595.1516036987305
0.339723964158
597.2352724075317
0.336012937028
599.3911724090576
0.332285700252
601.5979948043823
0.32856457618
603.8323078155518
0.324869073513
606.0761833190918
0.321220055176
608.3051929473877
0.317635927507
610.499584197998
0.314138381122
612.639570236206
0.310745236936
614.7034692764282
0.307473195711
616.6755933761597
0.304338511506
618.5368022918701
0.301357130107
620.2689819335938
0.298543276133
621.8583993911743
0.295909069097
623.2878332138062
0.293465008672
624.5471000671387
0.291220367194
625.6227741241455
0.289183864334
626.5056581497192
0.287360910845
627.1868371963501
0.285755669492
627.661602973938
0.284371232378
627.9236717224121
0.283209169711
627.969612121582
0.28226999884
627.7989864349365
0.281551313853
627.4101486206055
0.281049172821
626.8054866790771
0.280758573348
625.9891471862793
0.280672100505
624.966178894043
0.280781872952
623.7444877624512
0.281077731255
622.3334045410156
0.28155045468
620.7391471862793
0.282186754131
618

0.336551552855
601.8184661865234
0.334633690189
603.5866413116455
0.33309586206
605.0364017486572
0.331942385525
606.1632614135742
0.331174711883
606.9665622711182
0.330789708024
607.4485111236572
0.330784017258
607.6112422943115
0.331151442414
607.4606189727783
0.331885055251
607.0012607574463
0.332975371044
606.2425651550293
0.334409835078
605.1950855255127
0.336175813728
603.8719749450684
0.338257663656
602.2855033874512
0.340639927273
600.4506645202637
0.343303723248
598.382848739624
0.346230449082
596.1020832061768
0.349398727159
593.6233291625977
0.352786903724
590.9693794250488
0.356371708687
588.1613140106201
0.360129886373
585.2202072143555
0.364033138325
582.1693019866943
0.368059690742
579.0299053192139
0.372183493825
575.8263568878174
0.376380061884
572.5795764923096
0.380623400817
569.3146553039551
0.384885282341
566.0552711486816
0.389140603695
562.8232612609863
0.393367785479
559.6409816741943
0.397537188591
556.5324401855469
0.401626654771
553.5172538757324
0.4056128765

0.415314998873
612.444314956665
0.417812827386
609.9189758300781
0.420317789658
607.283332824707
0.422821359059
604.5523662567139
0.425314719208
601.733455657959
0.427787865552
598.8404083251953
0.430230293671
595.8880786895752
0.432631689017
592.8884048461914
0.434981625175
589.8566055297852
0.437268712153
586.8072547912598
0.439481471117
583.7520294189453
0.441608614671
580.709451675415
0.443638882701
577.6938896179199
0.445560320702
574.718469619751
0.447362162473
571.8004722595215
0.449032862915
568.9526863098145
0.450561231513
566.1920757293701
0.451936494368
563.5308322906494
0.453148429498
560.9854316711426
0.454186859429
558.5688781738281
0.455042274324
556.2933979034424
0.455705886817
554.1708641052246
0.456169728441
552.2153720855713
0.456425452843
550.436466217041
0.456466794917
548.8446960449219
0.45628634039
547.4502239227295
0.455880301176
546.2595405578613
0.455244268446
545.2835292816162
0.454372645792
544.5252742767334
0.453265221222
543.9920310974121
0.451920397873
54

0.39366601937
600.1837120056152
0.394230418009
600.5990581512451
0.394919205798
600.7861995697021
0.395727810422
600.7424488067627
0.396648086487
600.4767761230469
0.397674504774
599.9935970306396
0.398799749866
599.2969303131104
0.400014156748
598.3940696716309
0.401309949022
597.2962446212769
0.402678295283
596.0105514526367
0.404108786634
594.545693397522
0.405591463688
592.9161005020142
0.407115325246
591.1289358139038
0.408670312778
589.2012605667114
0.410245188743
587.1408271789551
0.411829629034
584.9644927978516
0.413411568089
582.6836824417114
0.414981039707
580.3159551620483
0.41652598323
577.8708353042603
0.418035514221
575.3657207489014
0.419499725162
572.8141794204712
0.42090673206
570.2321720123291
0.422246833442
567.6336641311646
0.423510927642
565.0326242446899
0.424688516941
562.4435834884644
0.425770677
559.8814153671265
0.426749266282
557.3602313995361
0.427616711384
554.8913288116455
0.428365297438
552.4874143600464
0.428988213659
550.1625108718872
0.429479608783
54

0.336553347519
599.6074466705322
0.337210202692
596.9518871307373
0.338009385394
594.1581363677979
0.338944593684
591.2342510223389
0.340009388284
588.1826324462891
0.341196649394
585.0152072906494
0.342497798707
581.7400245666504
0.343903780692
578.3693504333496
0.345404376729
574.9119577407837
0.346988880148
571.3817911148071
0.348648560446
567.7886228561401
0.350370970252
564.1462297439575
0.352145442483
560.463376045227
0.353959024995
556.7571315765381
0.355798046622
553.0395994186401
0.357650973743
549.3245067596436
0.359504620827
545.6284332275391
0.361345889121
541.9609670639038
0.363158786492
538.3390913009644
0.364935151056
534.7786159515381
0.366658436067
531.2913513183594
0.368314724567
527.8893117904663
0.369894187564
524.5911636352539
0.371381497182
521.4077806472778
0.372767063279
518.3526458740234
0.374037141316
515.4390325546265
0.375180989848
512.6767673492432
0.376187753243
510.0837240219116
0.377046871037
507.66470527648926
0.377747790308
505.43588638305664
0.3782843

0.317047717868
556.6903285980225
0.310508854779
561.2270584106445
0.304195487031
565.6541690826416
0.298127210694
569.9550933837891
0.292322729488
574.1171817779541
0.286798112416
578.124885559082
0.28157099322
581.9673690795898
0.276653973398
585.6319847106934
0.272061818346
589.1073017120361
0.267807228012
592.3836822509766
0.263902441928
595.4523181915283
0.260358308916
598.3020858764648
0.257181259479
600.9307460784912
0.25438050543
603.3292236328125
0.251960419564
605.494270324707
0.249926211569
607.4207916259766
0.248280064122
609.1094417572021
0.247024026038
610.554573059082
0.246160182453
611.7595500946045
0.245684518581
612.7238330841064
0.245597862181
613.4496955871582
0.245893544826
613.938985824585
0.246567871076
614.1964569091797
0.247613988027
614.2277660369873
0.249024484241
614.0386142730713
0.250788924387
613.6383838653564
0.252899076677
613.0307769775391
0.25534392972
612.2276592254639
0.258110687021
611.237060546875
0.261185208425
610.0715312957764
0.264552919709
608

0.376328153604
636.3499279022217
0.378442095115
635.5934085845947
0.380550205668
634.7505388259888
0.382657232622
633.8107089996338
0.384767195438
632.772910118103
0.386884402946
631.6285638809204
0.389013126085
630.3767862319946
0.391156513659
629.0121898651123
0.393318978139
627.5300254821777
0.39550426356
625.9282503128052
0.397715750225
624.2039222717285
0.399954884417
622.354082107544
0.402225281124
620.3811635971069
0.404529408406
618.278694152832
0.406868646578
616.0507516860962
0.409243539402
613.6969747543335
0.411655545675
611.2182731628418
0.414104395541
608.615797996521
0.41659089245
605.8939580917358
0.419114538475
603.0528812408447
0.421673007507
600.1002740859985
0.424264754563
597.0391473770142
0.426888073525
593.8749923706055
0.429539553748
590.6134443283081
0.432217102927
587.2608337402344
0.43491588864
583.8242444992065
0.437632122206
580.3137483596802
0.440360390881
576.7359743118286
0.44309543024
573.1006717681885
0.445831917808
569.418683052063
0.448563768826
565.

0.523021294837
502.8356046676636
0.522393398191
506.427773475647
0.52159814137
510.1982822418213
0.520643324937
514.1308374404907
0.519537010691
518.2117214202881
0.518287174373
522.4198026657104
0.516902774517
526.7417812347412
0.515392518318
531.1581735610962
0.513765883723
535.6509027481079
0.512032535742
540.2022504806519
0.51020158754
544.7943649291992
0.508283104974
549.407527923584
0.506286551794
554.0247421264648
0.504221699531
558.6259784698486
0.502098300452
563.1972017288208
0.499925311716
567.7158899307251
0.497712998765
572.1680335998535
0.495469088489
576.5368204116821
0.493203025252
580.802981376648
0.490923267697
584.9540557861328
0.488637755748
588.9743041992188
0.486353936319
592.8471584320068
0.484078686685
596.5626029968262
0.481818766794
600.1077785491943
0.479580318918
603.467170715332
0.477368757925
606.6338272094727
0.475188895167
609.5980167388916
0.473044920796
612.3508949279785
0.47094024116
614.8825340270996
0.468878305223
617.1920757293701
0.466860921114
61

0.401178298312
590.2386808395386
0.400284025593
589.7558650970459
0.399382196598
589.3114585876465
0.398468682655
588.8963270187378
0.397548589145
588.5157632827759
0.396629109278
588.1635799407959
0.395716784533
587.8379926681519
0.394829216644
587.5303268432617
0.393966319679
587.2327175140381
0.393132555747
586.9443531036377
0.392335320758
586.6618089675903
0.391595112697
586.3753576278687
0.390909801442
586.0738334655762
0.39028474691
585.7571868896484
0.389735291237
585.4199123382568
0.389268356202
585.0479011535645
0.388883975424
584.6382274627686
0.388597051122
584.188404083252
0.38841418571
583.6817588806152
0.388333358107
583.1190023422241
0.38837245092
582.4938144683838
0.388531234453
581.7931594848633
0.38880829165
581.0176668167114
0.389224124872
580.1616077423096
0.389769051524
579.2127494812012
0.390451195393
578.1764583587646
0.391277202962
577.0356140136719
0.392237165581
575.7969579696655
0.393348680203
574.4575252532959
0.394599493134
573.0029211044312
0.395994268915


0.350348265012
552.7344074249268
0.342581985858
556.6521759033203
0.334954858195
560.5181407928467
0.32752305538
564.3045444488525
0.320298660356
567.9811668395996
0.313327228405
571.5354442596436
0.306626102373
574.9342269897461
0.300234307568
578.1665210723877
0.294166453167
581.204122543335
0.288455864569
584.041597366333
0.283114187018
586.6506633758545
0.278170502794
589.0306644439697
0.273631906165
591.1582984924316
0.26952448131
593.032751083374
0.265843410158
594.6414413452148
0.262616974138
595.9848117828369
0.259831666922
597.0538692474365
0.257512840239
597.853120803833
0.255638572504
598.3814525604248
0.254228864102
598.6418914794922
0.253262205346
598.6456069946289
0.252743718219
598.3919296264648
0.252652837791
597.9019145965576
0.252989876798
597.1773052215576
0.25372758267
596.2408828735352
0.254865840945
595.1009159088135
0.256365073537
593.7782154083252
0.258233039227
592.2917098999023
0.260422670083
590.6574459075928
0.262931055132
588.9001178741455
0.265720123126
58

0.445110352361
579.1131210327148
0.447384889248
574.7436256408691
0.449681843863
570.28928565979
0.451978883194
565.7788276672363
0.454251917954
561.2351779937744
0.456473526732
556.6982879638672
0.458620416433
552.1997375488281
0.460669535509
547.767972946167
0.462600635544
543.4276809692383
0.464389482047
539.2116565704346
0.466017112503
535.1487064361572
0.467464919598
531.2640247344971
0.468716996861
527.5821704864502
0.469758271528
524.1226596832275
0.470576622887
520.9091730117798
0.471161533022
517.9549264907837
0.471504701298
515.2783555984497
0.471600576856
512.890417098999
0.471445481733
510.80183029174805
0.471038353936
509.0208044052124
0.470381558927
507.5523509979248
0.469478092073
506.3951225280762
0.468334720369
505.551474571228
0.466959616434
505.0162305831909
0.465364652739
504.78464221954346
0.463563549481
504.84607696533203
0.461576278967
505.1891784667969
0.459421755717
505.79373359680176
0.457113558401
506.6480550765991
0.454672231303
507.73412895202637
0.45212000

0.471992881232
493.3701992034912
0.469324835447
495.6852207183838
0.466485611324
498.17792797088623
0.463488575999
500.84019947052
0.460347477889
503.6585626602173
0.457090353296
506.61863708496094
0.453726351706
509.69741916656494
0.450267522514
512.8898906707764
0.446731199246
516.1802101135254
0.443144350746
519.5521392822266
0.439516288487
522.9854345321655
0.43585674004
526.4714994430542
0.432200377404
529.9952087402344
0.428550269518
533.5340347290039
0.424918133844
537.0810928344727
0.421335041281
540.6228342056274
0.417805767095
544.1343603134155
0.414343464755
547.6154260635376
0.410976105614
551.0411243438721
0.407697916243
554.4038734436035
0.404537319316
557.6960802078247
0.401499617755
560.8944425582886
0.398592021251
563.999345779419
0.395840392115
566.9933738708496
0.393234153507
569.8683776855469
0.390799954527
572.6196575164795
0.388535319763
575.2318859100342
0.386449050586
577.706916809082
0.384552531377
580.026050567627
0.382834787331
582.1955795288086
0.38132288617

0.49908542086
584.9869766235352
0.498079152716
586.6188945770264
0.496999032951
588.1871719360352
0.49585126896
589.6839752197266
0.494640590978
591.1018905639648
0.493372712433
592.4369926452637
0.492054341742
593.6793403625488
0.490690382261
594.8210258483887
0.489287849616
595.8601970672607
0.4878530623
596.7855052947998
0.486391789472
597.5934352874756
0.484913592371
598.2779140472412
0.483427048946
598.8273658752441
0.48193415978
599.2403964996338
0.480443055473
599.5137462615967
0.478958880519
599.643762588501
0.477489357208
599.6237316131592
0.476047971369
599.4473285675049
0.474633105842
599.1129703521729
0.473250866457
598.6194038391113
0.471907620195
597.9662170410156
0.470614155152
597.149393081665
0.469375139762
596.1635780334473
0.468189639283
595.0140113830566
0.467063090144
593.70263671875
0.466007778362
592.2275218963623
0.465024575489
590.5853290557861
0.464111324293
588.7842712402344
0.463277399966
586.8286762237549
0.462528267862
584.7128658294678
0.461858304065
582.

0.478228077967
491.2125463485718
0.479074710813
488.68009090423584
0.47974785723
486.3906316757202
0.480243864077
484.3497486114502
0.480558876186
482.56701278686523
0.480691674448
481.0421886444092
0.480639708585
479.78339862823486
0.480403038259
478.7904510498047
0.479981270459
478.0656051635742
0.479373914644
477.6091070175171
0.478584300768
477.4214162826538
0.477612330748
477.49949073791504
0.476462457214
477.8416919708252
0.475137680853
478.44432735443115
0.473640997905
479.3009376525879
0.471970936654
480.4033031463623
0.470141279086
481.74785900115967
0.468156972326
483.32631397247314
0.466025877188
485.1314105987549
0.463754798548
487.1536560058594
0.461351113278
489.3822717666626
0.458822624494
491.8065137863159
0.456172841589
494.4117126464844
0.453416712442
497.18743991851807
0.450563155403
500.1211271286011
0.447622059704
503.20112133026123
0.444602743569
506.4124364852905
0.441512510512
509.7401342391968
0.438363660736
513.1662855148315
0.43516732215
516.6806535720825
0.4

0.371329602208
581.3083629608154
0.370079615463
582.5199184417725
0.368888487039
583.6504802703857
0.367755480189
584.705436706543
0.366680042963
585.6854858398438
0.365660204174
586.5970439910889
0.364695591651
587.4384956359863
0.36378460893
588.2148494720459
0.36292647634
588.9257183074951
0.362118792743
589.5774173736572
0.361360648003
590.1700191497803
0.360651479629
590.7086029052734
0.35998897307
591.1946697235107
0.359372484743
591.6309661865234
0.3588015247
592.0223751068115
0.358274764075
592.3690013885498
0.357791364474
592.6744556427002
0.357348957304
592.9431324005127
0.356948420256
593.1759052276611
0.35658955436
593.3741836547852
0.35627044957
593.5438537597656
0.355991865819
593.6830463409424
0.355752709556
593.7975921630859
0.355553083707
593.886962890625
0.35539338164
593.953685760498
0.355280202069
593.9974708557129
0.355213772413
594.0185775756836
0.35518704158
594.0184192657471
0.355203195078
593.999345779419
0.355260544992
593.963680267334
0.355360999095
593.91063

KeyboardInterrupt: 

In [736]:
train_x = train_x.loc[train_y.dropna(axis = 0).index, :]
test_x = test_x.loc[test_y.dropna(axis = 0).index, :]

### impute mean
mean_imputer = sklearn.preprocessing.Imputer(strategy = 'mean')
train_x_imp = mean_imputer.fit_transform(train_x)
test_x_imp = mean_imputer.transform(test_x)

train_x_imp_std = standardizer.fit_transform(train_x_imp)
train_y = train_y.dropna(axis = 0).values

test_x_imp_std = standardizer.transform(test_x_imp)
test_y = test_y.dropna(axis = 0).values

In [738]:
import sklearn.preprocessing
import sklearn.linear_model
import sklearn.model_selection
import sklearn.metrics
import sklearn.ensemble

# get ridge coefficients
ridge = sklearn.linear_model.Ridge(alpha = 1000, random_state = 1)
ridge.fit(train_x_imp_std, train_y)
ridge.score(test_x_imp_std, test_y)

0.67064558617874481