In [134]:
import pandas as pd
import numpy as np
import sklearn.preprocessing
import torch
from torch.autograd import Variable

In [10]:
### read in only the first 50000 rows of the (large) assembled dataset
# Capping the row count keeps the load fast while the pipeline is developed.
data = pd.read_csv(filepath_or_buffer='assembled_data.csv', nrows=50000)
data.head(5)

Unnamed: 0,site,year,date,MonitorData,GFEDFireCarbon,USElevation_dsc10000,USElevation_max100,USElevation_max10000,USElevation_mea100,USElevation_mea10000,...,Nearby_Peak2Lag3_MeanTemperature,Nearby_Peak2Lag3_MinTemperature,OMAEROe_UVAerosolIndex_Mean,OMAEROe_VISAerosolIndex_Mean,OMAERUVd_UVAerosolIndex_Mean,OMNO2d_ColumnAmountNO2StratoCloudScreened_Mean,OMO3PR,OMSO2e_ColumnAmountSO2_PBL_Mean,OMTO3e_ColumnAmountO3,OMUVBd_UVindex_Mean
0,1,2000,2000-01-01,,0.001167,26.790501,43,30.143499,36,26.504299,...,286.112711,280.293551,,,,,,,,
1,1,2000,2000-01-02,,0.001236,26.790501,43,30.143499,36,26.504299,...,286.112711,280.293551,,,,,,,,
2,1,2000,2000-01-03,,0.001305,26.790501,43,30.143499,36,26.504299,...,286.112711,280.293551,,,,,,,,
3,1,2000,2000-01-04,,0.001373,26.790501,43,30.143499,36,26.504299,...,286.112711,280.293551,,,,,,,,
4,1,2000,2000-01-05,,0.001442,26.790501,43,30.143499,36,26.504299,...,290.424271,286.541158,,,,,,,,


In [18]:
### train/val/test split by site id
# Sites (not individual rows) are partitioned, so every row of a given site
# ends up in exactly one of train / val / test. Half of the sites are held
# out, and half of those held-out sites form the test set.
np.random.seed(1)

# get sites for val/test data
all_sites = np.unique(data['site'].values)
val_test_sites = np.random.choice(all_sites, round(len(all_sites)/2), replace = False)

# get sites for test data
held_out_sites = np.unique(val_test_sites)
test_sites = np.random.choice(held_out_sites, round(len(held_out_sites)/2), replace = False)

# Features are every column to the right of the response column. Selecting
# from position 4 would include 'MonitorData' itself (columns 0-3 are
# 'Unnamed: 0', 'site', 'year', 'date'), leaking the target into x.
feature_start = data.columns.get_loc('MonitorData') + 1

# row masks, computed once and reused for the three subsets
in_val_test = data['site'].isin(val_test_sites)
in_test = data['site'].isin(test_sites)

# train sites/rows and x/y split
train = data[~in_val_test]
train_x = train.iloc[:, feature_start:]
train_y = train.loc[:, 'MonitorData']
train_sites = train.loc[:, 'site']

# val sites/rows and x/y split
val = data[in_val_test & ~in_test]
val_x = val.iloc[:, feature_start:]
val_y = val.loc[:, 'MonitorData']
val_sites = val.loc[:, 'site']

# test sites/rows and x/y split
test = data[in_test]
test_x = test.iloc[:, feature_start:]
test_y = test.loc[:, 'MonitorData']
# Per-row site labels of the test set. Stored under their own name so they
# neither overwrite `val_sites` nor the `test_sites` array of site ids.
test_row_sites = test.loc[:, 'site']

In [23]:
### impute mean
# Column means are learned from the training rows only and then applied
# unchanged to the training, validation and test feature matrices.
mean_imputer = sklearn.preprocessing.Imputer(strategy = 'mean')
mean_imputer.fit(train_x)
train_x_imp, val_x_imp, test_x_imp = (
    mean_imputer.transform(features) for features in (train_x, val_x, test_x)
)

In [24]:
### standardize features
# Centering and scaling statistics are fitted on the imputed training
# features only and reused for the validation and test features.
standardizer = sklearn.preprocessing.StandardScaler(with_mean = True, with_std = True)
standardizer.fit(train_x_imp)
train_x_imp_std, val_x_imp_std, test_x_imp_std = (
    standardizer.transform(features) for features in (train_x_imp, val_x_imp, test_x_imp)
)

In [320]:
def split_sizes_site(sites):
    """Gets the split sizes to split dataset by site for a dataset with multiple sites.

    The rows of one site are expected to be contiguous; each run of
    consecutive equal site values yields one split size. Runs may have
    different lengths, and a run of length one (including one that starts on
    the very last row) is counted like any other.

    Arguments:
        sites (array): array indicating the site of each row

    Returns:
        list(int): length of each consecutive run of identical site values,
            in order of appearance. Empty input gives an empty list.
    """
    split_sizes = []
    run_length = 0
    previous_site = None

    for position, current_site in enumerate(sites):
        # A change of site closes the run that has been accumulating so far.
        if position > 0 and current_site != previous_site:
            split_sizes.append(run_length)
            run_length = 0
        previous_site = current_site
        run_length += 1

    # Flush the final run (absent only when `sites` is empty).
    if run_length > 0:
        split_sizes.append(run_length)

    return split_sizes


def split_data(tensor, split_sizes, dim=0):
    """Splits the tensor according to chunks of split_sizes.

    Offsets are accumulated with plain Python integers, so the split points
    stay exact for any tensor length and `split_sizes` may be any iterable of
    integers (list, tuple, array).

    Arguments:
        tensor (Tensor): tensor to split.
        split_sizes (list(int)): sizes of chunks
        dim (int): dimension along which to split the tensor.

    Returns:
        tuple(Tensor): one narrowed view of `tensor` per entry of
            `split_sizes`, in order.

    Raises:
        KeyError: if the split sizes do not add up to the size of `tensor`
            along `dim`.
    """
    if dim < 0:
        dim += tensor.dim()

    sizes = [int(size) for size in split_sizes]

    dim_size = tensor.size(dim)
    if dim_size != sum(sizes):
        # Both too-small and too-large totals are rejected.
        raise KeyError("Sum of split sizes does not match tensor dim")

    chunks = []
    start = 0
    for length in sizes:
        chunks.append(tensor.narrow(int(dim), start, length))
        start += length
    return tuple(chunks)


def pad_stack_splits(site_tuple, split_sizes, x_or_y):
    """Zero (x) or nan (y) pads site data sequences and stacks them into a matrix.

    Every sequence is padded at its end up to the longest length found in
    `split_sizes`, then all padded sequences are stacked along a new leading
    dimension.

    Arguments:
        site_tuple (tuple): tuple of site data sequences to pad and stack
        split_sizes (array): lengths of site data sequences (list or array)
        x_or_y (string): 'x' or 'y' indicating whether to pad and stack x or y

    Returns:
        Tensor: stacked, padded sequences with one leading entry per site.

    Raises:
        ValueError: if `x_or_y` is neither 'x' nor 'y', or if `split_sizes`
            is empty.
    """
    if x_or_y not in ('x', 'y'):
        raise ValueError("x_or_y must be 'x' or 'y', got {!r}".format(x_or_y))

    # The target length is the same for every sequence, so compute it once.
    max_sequence_length = int(max(split_sizes))

    data_padded_list = []
    for sequence in site_tuple:
        n_missing = max_sequence_length - sequence.size()[0]

        if x_or_y == 'x':
            # feature rows are padded with zeros, one column per feature
            padding = torch.zeros(n_missing, sequence.size()[1])
        else:
            # responses are padded with NaN (as doubles)
            padding = torch.zeros(n_missing).double() * np.nan

        data_padded_list.append(torch.cat((sequence, padding), dim = 0))

    return torch.stack(data_padded_list, dim = 0)


def get_monitorData_indices(sequence):
    """Gets indices for a site data sequence for which there is an output for MonitorData.

    Arguments:
        sequence (tensor): sequence of MonitorData outputs for a given site, including NaNs

    Returns:
        tensor: 1-D tensor of the positions whose value is not NaN, in
            increasing order (empty when every value is NaN).
    """
    # NaN is the only value that is not equal to itself, so this mask is
    # true exactly where a response is present.
    response_indicator_vec = sequence == sequence
    # nonzero lists the true positions in increasing order, one per row;
    # flatten that column into a plain index vector.
    return torch.nonzero(response_indicator_vec).view(-1)

In [402]:
### get split sizes for training data (splitting by site)
# NOTE(review): the last two training rows are dropped here and, to stay
# aligned, from x and y below. Presumably this was needed to make the split
# sizes add up to the row count -- confirm whether it is still required.
split_sizes = split_sizes_site(train_sites.values[:-2])

### get tuples by site
# `split_data` is the splitting helper defined in this notebook.
train_x_std_tuple = split_data(torch.from_numpy(train_x_imp_std).float()[:-2, :], split_sizes, dim = 0)
train_y_tuple = split_data(torch.from_numpy(train_y.values[:-2]), split_sizes, dim = 0)

### get site sequences stacked into matrix to go through CNN
train_x_std_padded = pad_stack_splits(train_x_std_tuple, np.array(split_sizes), 'x')
# swap the last two axes so features become the channel axis expected by Conv1d
train_x_std_stack = Variable(torch.transpose(train_x_std_padded, 1, 2))

### get indices for which there is a monitorData response for each site
monitorData_ind_by_site = [get_monitorData_indices(site_y) for site_y in train_y_tuple]

In [417]:
# CNN parameters
input_size = train_x_imp_std.shape[-1]  # one input channel per standardized feature column
hidden_size = 100                       # number of convolution output channels
kernel_size = 3                         # width of the convolution window along the sequence
padding = 1                             # with kernel_size 3 this keeps the sequence length unchanged
bias = True                             # whether the convolution layer has a bias term

class CNN(torch.nn.Module):
    """1-D convolutional regressor applied along each site's sequence.

    A single Conv1d layer (followed by batch normalization and tanh) produces
    a hidden vector per sequence position; a linear layer then maps the hidden
    vectors of the positions that have a response to one prediction each.

    Arguments:
        input_size (int): number of input channels (features per position)
        hidden_size (int): number of convolution output channels
        kernel_size (int): width of the convolution window
        padding (int): zero padding added to both ends of each sequence
        bias (bool): whether the convolution layer has a bias term. Taken as
            an argument so the layer does not depend on a module-level global.
    """
    def __init__(self, input_size, hidden_size, kernel_size, padding, bias=True):
        super(CNN, self).__init__()

        self.conv1d = torch.nn.Conv1d(in_channels=input_size, out_channels=hidden_size, kernel_size=kernel_size, padding=padding, bias=bias)
        self.norm1 = torch.nn.BatchNorm1d(num_features = hidden_size)
        self.tanh = torch.nn.Tanh()
        self.linear = torch.nn.Linear(in_features = hidden_size, out_features = 1, bias = True)

    def forward(self, input, monitorData_ind_by_site):
        """Predicts one value for every sequence position that has a response.

        Arguments:
            input (tensor): stacked site sequences, laid out as
                (site, feature channel, position) as required by Conv1d
            monitorData_ind_by_site (list): for each site, the positions
                along the sequence for which a response exists

        Returns:
            tensor: one row per selected position (sites concatenated in
                order), with a single output column.
        """
        hidden = self.conv1d(input)
        hidden = self.norm1(hidden)
        hidden = self.tanh(hidden)

        # Keep only the positions with a response and put positions on the
        # first axis so the linear layer sees one hidden vector per row.
        hidden_w_response = [
            torch.transpose(hidden[i][:, monitorData_ind_by_site[i]], 0, 1)
            for i in range(hidden.size()[0])
        ]
        hidden_w_response = torch.cat(hidden_w_response, dim = 0)

        output = self.linear(hidden_w_response)

        return output


# Loss function
# Mean-reduced squared error is the default, so the deprecated
# `size_average=True` argument is not needed.
loss_function = torch.nn.MSELoss()

cnn = CNN(input_size, hidden_size, kernel_size, padding, bias=bias)

# Optimizer
lr = 0.01
weight_decay = 0.000001
optimizer = torch.optim.Adam(cnn.parameters(), lr=lr, weight_decay=weight_decay)

In [416]:
# Call the module itself rather than `.forward` so that any registered hooks run.
cnn(train_x_std_stack, monitorData_ind_by_site)

Variable containing:
-0.3727
-0.2432
-0.3099
   ⋮    
-0.1612
-0.1302
 0.1693
[torch.FloatTensor of size 9922x1]

AttributeError: 'tuple' object has no attribute 'size'