In [195]:
import pandas as pd
import json
import numpy as np
from sklearn import ensemble

import torch
from torch import nn, optim, sigmoid
from torch.utils.data import Dataset, DataLoader
import random

In [196]:
PATH = '/Users/garethdavies/Development/machinelearning/kaggle/taxi/pkdd-15-predict-taxi-service-trajectory-i/'
#train = pd.read_csv(PATH + 'train.csv')
train = pd.read_csv(PATH + 'train.csv', 
                 sep = ",",
                 chunksize = 100000,
                 iterator = True,
                 usecols = ['TRIP_ID','POLYLINE'],
                 converters={'POLYLINE': lambda x: json.loads(x)})

#sample = pd.read_csv(PATH + 'sampleSubmission.csv')

#train = train.head(100)

In [197]:
lat_mean = 41.15731
lat_std = 0.074120656
long_mean = -8.6161413
long_std = 0.057200309

In [198]:
def generate_targets(row):
    coord = row['POLYLINE'][-1]
    return [coord[1], coord[0]] if len(row['POLYLINE']) > 0 else None

def normalise(row):
    return [[(coord[1] - lat_mean)/lat_std, (coord[0] - long_mean)/long_std] for coord in row['POLYLINE'][:-1]]

In [28]:
trn = pd.DataFrame()
val = pd.DataFrame()

for chunk in train:
    chunk = chunk[chunk['POLYLINE'].map(lambda d: len(d)) > 0]    
    chunk['TARGET'] = chunk.apply(generate_targets, axis=1)
    chunk['POLYLINE'] = chunk.apply(normalise, axis=1)
    chunk = chunk.dropna()
    
    if random.uniform(0,1) < 0.8:
        trn = trn.append(chunk)
    else:
        val = val.append(chunk)
    
trn.to_csv(PATH + 'trn.csv')
val.to_csv(PATH + 'val.csv')


In [199]:
rearth = 6371
deg2rad = (3.141592653589793 / 180)

class ERDist_Loss(torch.nn.Module):
    def __init__(self):
        super(ERDist_Loss,self).__init__()
        
    def forward(self, a, b):
        lat1 = a[:, 0] * deg2rad
        lon1 = a[:, 1] * deg2rad
        lat2 = b[:, 0] * deg2rad
        lon2 = b[:, 1] * deg2rad
        x = (lon2-lon1) * torch.cos((lat1+lat2)/2)
        y = (lat2-lat1)
        ret = torch.sqrt(x**2 + y**2) * rearth
        return torch.sum(ret)

In [200]:
y_hat = torch.Tensor([[41.1489, -8.6166],[41.1517, -8.6155]])
y = torch.Tensor([[41.1442, -8.6421],[41.1455, -8.6075]])

In [201]:
class SequenceDataset(Dataset):
    def __init__(self, df, target, cont_vars):
        #df = df.dropna()
        df = df[df['POLYLINE'].map(lambda d: len(d)) > 0]
        self.df, self.target, self.cont_vars = df, target, cont_vars

    def __getitem__(self, index):
        record = self.df.iloc[index]
        conts = record[self.cont_vars]
        target = json.loads(record[self.target])
        return torch.Tensor(conts), target, index

    def __len__(self):
        return len(self.df)

In [202]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
class KEncoder(nn.Module):
    def __init__(self, cont_count, hidden_dim, n_layers, bidirectional, dropout,bs, directions):
        super().__init__()
        self.n_layers = n_layers
        self.cont_count = cont_count
        self.bs = bs
        self.hidden_dim = hidden_dim
        self.rnn = nn.LSTM(cont_count, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout)
        self.directions = directions
        self.initHidden()
        print(self)

    def forward(self, seq, seq_lengths):
        #seq = seq.transpose(1,0)
        #print(seq_lengths)
        packed_input = pack_padded_sequence(seq, seq_lengths)
        packed_output, (hidden, cell)= self.rnn(packed_input)
        output, _ = pad_packed_sequence(packed_output)
        
        idx = torch.arange(0, len(seq_lengths)).long()
        last_cell_idx = torch.LongTensor(seq_lengths) -1
        output = output[last_cell_idx, idx, :]
        
        return output, (hidden, cell)

    def initHidden(self):
        self.state = (torch.zeros(self.directions*self.n_layers,self.bs, self.cont_count),
                  torch.zeros(self.directions*self.n_layers,self.bs, self.cont_count))

    def repackage_var(self, state):
        return state.data if type(state) == torch.Tensor else tuple(self.repackage_var(v) for v in state)
    
    @property
    def output_dim(self):
        return self.hidden_dim * self.directions
    
class KRNN(nn.Module):
    
    def __init__(self, encoder, output_dim):
        super().__init__()
        self.encoder = encoder
        self.fc = nn.Linear(encoder.output_dim, output_dim)
        
    def forward(self, seq, seq_lengths): 
        output, (hidden, cell) = self.encoder(seq, seq_lengths)
        output = self.fc(output)
        return output
    
class Cluster(nn.Module):
    def __init__(self, encoder, output_dim):
        super().__init__()
        self.encoder = encoder
        self.fc = nn.Linear(encoder.output_dim, output_dim)
        self.softmax = nn.Softmax()

    def forward(self, seq, seq_lengths): 
        output, (hidden, cell) = self.encoder(seq, seq_lengths)
        output = self.fc(output)
        output = self.softmax(output)
        return output

In [203]:
cont_count = 2
hidden_dim = 500
output_dim = 2
n_layers = 2
bidirectional=False
dropout=0.0
bs=200
directions=1

In [204]:
encoder = KEncoder(cont_count, hidden_dim, n_layers, bidirectional, dropout, bs, directions)
model = KRNN(encoder, output_dim)
optimizer = optim.Adam(model.parameters(), lr=0.03)
criterion = nn.MSELoss()

def train(model, iterator, optimizer, criterion):
    model.train()

    count = len(iterator)
    epoch_loss = 0
    epoch_acc = 0

    for sequence, target, seq_length in iterator:
        optimizer.zero_grad()
        last_pred = model(sequence, seq_length)
        print('LAST PRED', last_pred.size())
        loss = criterion(last_pred, target)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / count

KEncoder(
  (rnn): LSTM(2, 500, num_layers=2)
)


In [205]:
CLUSTER_PATH = '/Users/garethdavies/Development/machinelearning/kaggle/taxi/pkdd-15-predict-taxi-service-trajectory-i/'
ARRIVALS = CLUSTER_PATH + 'arrivals.pkl'
ARRIVAL_CLUSTERS = CLUSTER_PATH + 'arrival-clusters.pkl'

import pickle

with open(ARRIVALS, 'rb') as f:
    arr = pickle.load(f, encoding='latin1')
    
with open(ARRIVAL_CLUSTERS, 'rb') as f:
    arr_clusters = pickle.load(f, encoding='latin1')

In [222]:
encoder = KEncoder(cont_count, hidden_dim, n_layers, bidirectional, dropout, bs, directions)
model = Cluster(encoder, arr_clusters.shape[0])
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = ERDist_Loss()

arr_clusters.shape[0]
torch_clusters = torch.from_numpy(arr_clusters).transpose(1,0)
torch_clusters = torch_clusters[:, None, :]

def train(model, iterator, optimizer, criterion):
    model.train()

    count = len(iterator)
    epoch_loss = 0
    epoch_acc = 0

    for sequence, target, seq_length in iterator:
        optimizer.zero_grad()
        last_pred = model(sequence, seq_length)
        probable_locations = last_pred*torch_clusters
        probable_locations = probable_locations.transpose(1,0)
        weighted_avg = probable_locations.sum(dim=2)
        
        #print('PRED', weighted_avg)
        #print('Y', target)
        loss = criterion(weighted_avg, target)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / count

KEncoder(
  (rnn): LSTM(2, 500, num_layers=2)
)


In [207]:
def pad_sequence(batch):
    batch.sort(key=lambda x: len(x[0]), reverse=True)
    sequences, labels, indexes = zip(*batch)    
    lengths = torch.LongTensor([sequence.size()[0] for sequence in sequences])
    sequences = torch.nn.utils.rnn.pad_sequence(sequences)
    labels = torch.Tensor(labels)
    return sequences, labels, lengths

In [209]:
BATCH_SIZE = 200
seq = pd.read_csv(PATH + 'trn.csv', 
                 sep = ",",
                 chunksize = 1000,
                 iterator = True,
                 usecols = ['TRIP_ID','POLYLINE', 'TARGET'],
                 converters={'POLYLINE': lambda x: json.loads(x)})


In [210]:
for chunk in seq:   
    seq_ds = SequenceDataset(chunk, 'TARGET','POLYLINE')
    seq_dl = DataLoader(seq_ds, batch_size=BATCH_SIZE, collate_fn=pad_sequence, num_workers=4, shuffle=False)
    train_loss = train(model, seq_dl, optimizer, criterion)    
    print(train_loss)



745.5295532226562
587.167236328125
512.1308959960937
403.3185119628906
439.5468017578125
434.0119140625
307.8872955322266
243.89559936523438
216.7292236328125
225.18474731445312
137.63194732666017
113.88238525390625
162.26631622314454
162.06320190429688
108.5855484008789
102.09848022460938
70.36513671875
64.49977416992188
73.20283966064453
97.33077087402344
83.84671783447266
87.09133911132812


Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/garethdavies/anaconda2/envs/fastai/lib/python3.6/multiprocessing/queues.py", line 240, in _feed
    send_bytes(obj)
  File "/Users/garethdavies/anaconda2/envs/fastai/lib/python3.6/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/Users/garethdavies/anaconda2/envs/fastai/lib/python3.6/multiprocessing/connection.py", line 404, in _send_bytes
    self._send(header + buf)
  File "/Users/garethdavies/anaconda2/envs/fastai/lib/python3.6/multiprocessing/connection.py", line 368, in _send
    n = write(self._handle, buf)
BrokenPipeError: [Errno 32] Broken pipe
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/garethdavies/anaconda2/envs/fastai/lib/python3.6/multiprocessing/queues.py", line 240, in _feed
    send_bytes(obj)
  File "/Users/garethdavies/anaconda2/envs/fastai/lib/python3.6/multiprocessing/connection.py"

KeyboardInterrupt: 

In [211]:
torch.save(model, PATH + 'knn.pt')

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


In [212]:
test = pd.read_csv(PATH + 'test.csv', converters={'POLYLINE': lambda x: json.loads(x)})
test['POLYLINE'] = test.apply(normalise, axis=1)
test.to_csv(PATH + 'test_sequence.csv')
test['TARGET'] = '[0,0]'
test.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE,TARGET
0,T1,B,,15.0,20000542,1408039037,A,False,"[[-0.11856344066899088, 0.5326072626635855], [...","[0,0]"
1,T2,B,,57.0,20000108,1408038611,A,False,"[[-0.15839039524964799, 0.09205020203651978], ...","[0,0]"
2,T3,B,,15.0,20000370,1408038568,A,False,"[[-0.11807774610091422, 0.5315058700120036], [...","[0,0]"
3,T4,B,,53.0,20000492,1408039090,A,False,"[[-0.21776655619460014, 0.038081962109691274],...","[0,0]"
4,T5,B,,18.0,20000621,1408039177,A,False,"[[-0.12512031733778625, -0.06576363075241587]]","[0,0]"


In [213]:
test_seq = test.copy()
test_seq = test_seq[['TARGET','POLYLINE']]

In [214]:
test_ds = SequenceDataset(test_seq, 'TARGET','POLYLINE')
test_dl = DataLoader(test_ds, batch_size=BATCH_SIZE, collate_fn=pad_sequence, num_workers=1, shuffle=False)

print(len(test_ds))
print(len(test_dl))

model.eval()
pred_list = []
for sequence, target, seq_length in test_dl:
    last_pred = model(sequence, seq_length)
    probable_locations = last_pred*torch_clusters
    probable_locations = probable_locations.transpose(1,0)
    weighted_avg = probable_locations.sum(dim=2)
    pred_list.append(weighted_avg)
    
    
preds = torch.cat(pred_list,dim=0)
preds

318
2




tensor([[41.1937, -8.6076],
        [41.4080, -8.5176],
        [41.2360, -7.7537],
        [41.1922, -8.5916],
        [41.1671, -8.5982],
        [41.1819, -8.6786],
        [41.1630, -8.6462],
        [41.0698, -8.5802],
        [41.1591, -8.6402],
        [41.1543, -8.6031],
        [41.2052, -8.6364],
        [41.1590, -8.6652],
        [41.1583, -8.6293],
        [41.1711, -8.6253],
        [41.1699, -8.6471],
        [41.2563, -8.6876],
        [41.1734, -8.6066],
        [41.2045, -8.6502],
        [41.1810, -8.5775],
        [41.1719, -8.5897],
        [41.1671, -8.6570],
        [41.1500, -8.6038],
        [41.1890, -8.5934],
        [41.1527, -8.6483],
        [41.1697, -8.5738],
        [41.2351, -8.6694],
        [41.1730, -8.6599],
        [41.1513, -8.6249],
        [41.1475, -8.5858],
        [41.1773, -8.6871],
        [41.2228, -8.6623],
        [41.1571, -8.6255],
        [41.1755, -8.6613],
        [41.1572, -8.6286],
        [41.1441, -8.6038],
        [41.1659, -8

In [215]:
preds = preds.detach().numpy()

In [216]:
latitude = preds[:,0]
longitude = preds[:,1]

In [217]:
submission = test.copy()
blanks = submission[submission['POLYLINE'].map(lambda d: len(d)) == 0]
submission = test.copy()
nonblanks = submission[submission['POLYLINE'].map(lambda d: len(d)) > 0]

In [218]:
longitude.shape

(318,)

In [219]:
nonblanks['LATITUDE'] = latitude.tolist()
nonblanks['LONGITUDE'] = longitude.tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [220]:
blanks['LATITUDE'] = lat_mean
blanks['LONGITUDE'] = long_mean
blanks

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE,TARGET,LATITUDE,LONGITUDE
33,T34,C,,,20000129,1408039193,A,False,[],"[0,0]",41.15731,-8.616141
121,T127,A,27031.0,,20000591,1412065621,A,False,[],"[0,0]",41.15731,-8.616141


In [221]:
submission = pd.concat([blanks, nonblanks])
submission = submission[['TRIP_ID', 'LATITUDE', 'LONGITUDE']]
submission = submission.sort_index(axis=0)
submission.to_csv(PATH + 'test_submission.csv', index=False)
submission

Unnamed: 0,TRIP_ID,LATITUDE,LONGITUDE
0,T1,41.193680,-8.607610
1,T2,41.407967,-8.517629
2,T3,41.235992,-7.753664
3,T4,41.192196,-8.591563
4,T5,41.167091,-8.598186
5,T6,41.181942,-8.678622
6,T7,41.162994,-8.646189
7,T8,41.069767,-8.580234
8,T9,41.159058,-8.640213
9,T10,41.154259,-8.603063


In [28]:
arr

array([[41.154488, -8.630838],
       [41.17067 , -8.66574 ],
       [41.14053 , -8.61597 ],
       ...,
       [41.158756, -8.627454],
       [41.173523, -8.587026],
       [41.14988 , -8.620893]], dtype=float32)

In [106]:
arr_clusters.shape

(3455, 2)

In [159]:
x = [0,0,0.9,0.1]
x = torch.Tensor(x)
x = x[:,None]

y = [[2.1, -1.1],[5.1, -4.1],[3.1, -0.1],[2.1, -1.1]]
y= torch.Tensor(y)

In [160]:
z = x*y

In [161]:
z

tensor([[ 0.0000, -0.0000],
        [ 0.0000, -0.0000],
        [ 2.7900, -0.0900],
        [ 0.2100, -0.1100]])

In [162]:
torch.sum(z, dim=0)

tensor([ 3.0000, -0.2000])

In [120]:
t = torch.from_numpy(arr_clusters)

In [207]:
arr_clusters

array([[41.156216, -8.602068],
       [41.156197, -8.629091],
       [41.128174, -8.620415],
       ...,
       [41.039318, -8.579821],
       [41.24707 , -8.671032],
       [41.244434, -8.67218 ]], dtype=float32)

In [184]:
torch_clusters = torch.from_numpy(arr_clusters).transpose(1, 0)
torch_clusters = torch_clusters[:,None,:]

In [186]:
torch_clusters

tensor([[[41.1562, 41.1562, 41.1282,  ..., 41.0393, 41.2471, 41.2444]],

        [[-8.6021, -8.6291, -8.6204,  ..., -8.5798, -8.6710, -8.6722]]])

In [185]:
torch_clusters.size()

torch.Size([2, 1, 3455])

In [178]:
p = torch.randn(200,3455)

In [192]:
z = p*torch_clusters
z = z.transpose(1,0)
z = z.sum(dim=2)
z.size()
z

tensor([[ 6.8934e+02, -1.4326e+02],
        [ 2.0475e+03, -4.3042e+02],
        [ 6.2265e+02, -1.2454e+02],
        [-1.4497e+03,  3.0379e+02],
        [-6.5770e+01,  2.2140e+01],
        [-4.7916e+02,  9.9629e+01],
        [-4.0277e+03,  8.4315e+02],
        [-6.7619e+02,  1.3748e+02],
        [ 3.0047e+03, -6.2531e+02],
        [ 4.6786e+01, -6.7022e+00],
        [ 4.8381e+02, -1.0250e+02],
        [-5.5575e+02,  1.1833e+02],
        [ 5.0116e+02, -1.0655e+02],
        [ 2.5105e+03, -5.2482e+02],
        [-2.1037e+03,  4.3999e+02],
        [ 4.3317e+03, -9.0588e+02],
        [-4.5663e+03,  9.5332e+02],
        [-2.7796e+03,  5.8399e+02],
        [-4.6671e+00, -8.8321e-01],
        [ 3.3654e+02, -6.7556e+01],
        [ 4.5539e+03, -9.4621e+02],
        [ 4.3752e+02, -9.2560e+01],
        [-9.4544e+02,  1.9556e+02],
        [-2.3099e+03,  4.8694e+02],
        [ 2.0635e+03, -4.2289e+02],
        [ 1.9247e+03, -4.0416e+02],
        [ 6.0203e+01, -1.4184e+01],
        [-6.4491e+02,  1.286

In [None]:
200,2