In [1]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import torch
!pip install tqdm
from tqdm import tqdm
import torch.nn as nn



In [2]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [3]:
# Load the preprocessed training trips from the working directory.
df = pd.read_csv(filepath_or_buffer="processed_train.csv")

In [4]:
# Preview the first five rows of the raw training frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,TRIP_ID,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,POLYLINE,LEN,MON,DAY,HR,WK,CALL_TYPE_A,CALL_TYPE_B,CALL_TYPE_C,DAY_TYPE_A,DAY_TYPE_B,DAY_TYPE_C
0,0,1372636858620000589,0.0,0.0,20000589,1372636858,"[[-8.618643,41.141412],[-8.618499,41.141376],[...",330,7,1,0,0,0,0,1,1,0,0
1,1,1372637303620000596,0.0,7.0,20000596,1372637303,"[[-8.639847,41.159826],[-8.640351,41.159871],[...",270,7,1,0,0,0,1,0,1,0,0
2,2,1372636951620000320,0.0,0.0,20000320,1372636951,"[[-8.612964,41.140359],[-8.613378,41.14035],[-...",960,7,1,0,0,0,0,1,1,0,0
3,3,1372636854620000520,0.0,0.0,20000520,1372636854,"[[-8.574678,41.151951],[-8.574705,41.151942],[...",630,7,1,0,0,0,0,1,1,0,0
4,4,1372637091620000337,0.0,0.0,20000337,1372637091,"[[-8.645994,41.18049],[-8.645949,41.180517],[-...",420,7,1,0,0,0,0,1,1,0,0


In [5]:
# Columns excluded from the feature set for this MLP model only.
# Earlier variants of the list, kept for reference:
#   ["Unnamed: 0", "TRIP_ID", "ORIGIN_CALL", "TAXI_ID", "TIMESTAMP", "POLYLINE"]
#   ["Unnamed: 0", "ORIGIN_CALL", "ORIGIN_STAND", "TAXI_ID", "TIMESTAMP", "YR_2014", "YR"]
# NOTE(review): the df.head() preview shows a string-valued POLYLINE column that
# this list keeps; it would have to be removed before the frame is cast to a
# float tensor, and the network's first layer is sized for 11 inputs — confirm
# the intended feature set against the current CSV.
df = df.drop(
    ["Unnamed: 0", "TRIP_ID", "ORIGIN_CALL", "ORIGIN_STAND", "TAXI_ID", "TIMESTAMP"],
    axis="columns",
)

In [6]:
# Working frame for this experiment. Currently the full frame (same object as
# `df`); the trailing comment is a toggle for training on a 50% random sample.
df_sample = df#.sample(frac=0.5)

In [7]:
# Take the regression target (LEN) from the same frame the features come from.
# Reading it from `df` only works while `df_sample` is the full frame: as soon
# as `df_sample` is a subsample, targets taken from `df` would no longer line
# up row-for-row with the features.
df_len = df_sample["LEN"]
# Everything that remains in `df_sample` is treated as a model input.
df_sample = df_sample.drop(columns=["LEN"])

In [8]:
# Number of rows available for training/validation/testing.
df_sample.shape[0]

855330

In [9]:
# Preview the first five rows of the feature frame (target already removed).
df_sample.head(n=5)

Unnamed: 0,ORIGIN_STAND,MON,DAY,HR,WK,CALL_TYPE_A,CALL_TYPE_B,CALL_TYPE_C,DAY_TYPE_A,DAY_TYPE_B,DAY_TYPE_C
1425543,57.0,5,6,8,1,0,1,0,1,0,0
1080244,15.0,2,21,0,4,0,1,0,1,0,0
481169,21.0,10,12,20,5,0,1,0,1,0,0
826555,36.0,12,24,11,1,0,1,0,1,0,0
1637544,10.0,6,16,12,0,0,1,0,1,0,0


In [10]:
# One row array per sample. A single `to_numpy()` call replaces the per-row
# `iloc` lookup, which builds a new Series for every row and is very slow on a
# frame of this size; the resulting list of row arrays is the same.
features = list(df_sample.to_numpy())

In [11]:
# Features and targets as float32 tensors on the selected device.
x = torch.tensor(np.array(features), dtype = torch.float).to(device)
y = torch.tensor(np.array(df_len.values), dtype = torch.float).to(device)

# TensorDataset yields the same (x_i, y_i) pairs as zip(x, y) but indexes the
# tensors lazily instead of materialising one Python tuple per row, and it
# rejects feature/target tensors of different length instead of truncating.
data = torch.utils.data.TensorDataset(x, y)

# 80% train / 10% validation / remainder test.
train_size = int(0.8 * len(data))
val_size = int(0.1 * len(data))
test_size = len(data) - train_size - val_size

# Fixed generator seed so the split is identical on every Restart & Run All.
train_data, test_data, val_data = torch.utils.data.random_split(
    data,
    [train_size, test_size, val_size],
    generator=torch.Generator().manual_seed(0),
)

In [12]:
# Mini-batch size shared by all three loaders.
batch_size = 5

# Build the loaders in train / validation / test order; only the training
# split is reshuffled each epoch.
train_loader, valid_loader, test_loader = (
    torch.utils.data.DataLoader(split, batch_size=batch_size, shuffle=reshuffle)
    for split, reshuffle in (
        (train_data, True),
        (val_data, False),
        (test_data, False),
    )
)

In [13]:
# Sizes of the train / test / validation splits, in that order.
tuple(len(split) for split in (train_data, test_data, val_data))

(684264, 85533, 85533)

In [14]:
# Step size handed to the optimizer.
learning_rate = 0.00001

In [15]:
# Fully connected regressor: input -> 5 hidden ReLU layers of width 1024 -> 1 output.
# The input width is read from the feature frame instead of being hard-coded,
# so the network stays in sync with whichever columns were kept as features
# (with the 11-column feature set this is the same network as before).
n_features = df_sample.shape[1]
model = torch.nn.Sequential(
    torch.nn.Linear(n_features, 1024),
    torch.nn.ReLU(),
    torch.nn.Linear(1024, 1024),
    torch.nn.ReLU(),
    torch.nn.Linear(1024, 1024),
    torch.nn.ReLU(),
    torch.nn.Linear(1024, 1024),
    torch.nn.ReLU(),
    torch.nn.Linear(1024, 1024),
    torch.nn.ReLU(),
    torch.nn.Linear(1024, 1),
).to(device)

In [16]:
# Criterion: mean squared error (the train/eval helpers take its square root).
loss_fn = torch.nn.MSELoss().to(device)

# Plain SGD over every model parameter.
# Alternatives tried earlier: Adam(lr=learning_rate), Adagrad() with defaults.
optimizer = torch.optim.SGD(params=model.parameters(), lr=learning_rate)

# Multiply the learning rate by 0.1 after every 5 scheduler steps.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=5, gamma=0.1)

In [17]:
from tqdm import trange
def train_epoch(train_data, model, optimizer, loss_fn, batch_size):
    """Run one optimisation pass over ``train_data`` and return the mean batch loss.

    Args:
        train_data: Iterable yielding ``(x, y)`` batches.
        model: Callable applied to ``x`` to produce predictions.
        optimizer: Optimizer whose ``zero_grad()`` / ``step()`` update the model.
        loss_fn: Criterion called as ``loss_fn(prediction, target)``; the
            quantity that is back-propagated is its square root (RMSE when
            ``loss_fn`` is an MSE criterion).
        batch_size: Unused; kept so existing callers keep working.

    Returns:
        ``np.mean`` of the per-batch ``sqrt(loss_fn(...))`` values. This is the
        mean of batch-level values, not a single value over all samples.
    """
    losses = []

    for x, y in tqdm(train_data):
        # Give the prediction the target's shape before computing the loss.
        # A (batch, 1) prediction against a (batch,) target would otherwise be
        # broadcast to (batch, batch) inside the criterion, so every prediction
        # would be compared with every target in the batch.
        y_pred = model(x).reshape(y.shape)

        loss = torch.sqrt(loss_fn(y_pred, y))

        # Clear gradients left over from the previous batch, back-propagate,
        # then apply the parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        losses.append(loss.item())

    return np.mean(losses)

def eval_epoch(valid_loader, model, loss_function):
    """Evaluate ``model`` on ``valid_loader`` without tracking gradients.

    Args:
        valid_loader: Iterable yielding ``(x, y)`` batches.
        model: Callable applied to ``x`` to produce predictions.
        loss_function: Criterion called as ``loss_function(prediction, target)``;
            its square root is recorded for each batch.

    Returns:
        ``np.mean`` of the per-batch ``sqrt(loss_function(...))`` values. This
        is the mean of batch-level values, not a single value over all samples.
    """
    losses = []
    with torch.no_grad():
        for x, y in tqdm(valid_loader):
            # Match the target's shape so a (batch, 1) prediction is not
            # broadcast against a (batch,) target inside the criterion.
            y_pred = model(x).reshape(y.shape)

            # Use the criterion that was passed in rather than a notebook global.
            loss = torch.sqrt(loss_function(y_pred, y))
            losses.append(loss.item())

    return np.mean(losses)
# Per-epoch loss histories, filled in by the training loop.
train_losses, val_losses = [], []

In [18]:
# Run configuration and bookkeeping.
total_epochs = 5
train_accs = []
valid_accs = []
max_acc = 0
val_loss = 0

print(f"Training on {len(train_data)} data with batch size of {batch_size}!")

for epoch in range(total_epochs):
    # Optimisation pass with the model in training mode.
    model.train()
    train_loss = train_epoch(train_loader, model, optimizer, loss_fn, batch_size)
    train_losses.append(train_loss)

    # Validation pass with the model in evaluation mode.
    model.eval()
    val_loss = eval_epoch(valid_loader, model, loss_fn)
    val_losses.append(val_loss)

    # Advance the learning-rate schedule once per epoch, then report.
    scheduler.step()
    print(f"Epoch: {epoch+1}, Train Loss: {train_loss:>0.4f}, Validation Loss {val_loss:>0.4f}\n")

Training on 684264 data with batch size of 5!


  0%|          | 0/136853 [00:00<?, ?it/s]


RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.cuda.FloatTensor instead (while checking arguments for embedding)

In [None]:
# Score the held-out test split; `model.eval()` switches the model to
# evaluation mode and returns the model itself.
eval_epoch(test_loader, model.eval(), loss_fn)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
# Loss curves per epoch, drawn on an explicit Axes.
fig, ax = plt.subplots()
e = len(train_losses)
x_axis = np.arange(1, e + 1, 1)
ax.plot(x_axis, train_losses, label = "Train")
# The validation history gets its own x-range: if training was interrupted
# between the two appends, the histories differ in length and a shared axis
# would make the plot call fail.
ax.plot(np.arange(1, len(val_losses) + 1, 1), val_losses, label = "Validation")
ax.legend()
ax.set_title('Training losses')
ax.set_xlabel("Epochs")
# Both helpers record the square root of the MSE criterion per batch.
ax.set_ylabel("Mean batch RMSE")
plt.show()

In [None]:
# Re-check the feature frame that was fed to the model.
df_sample.head(n=5)

In [None]:
# count = 0
# loss_dict = {}
# with torch.no_grad(): 
#     for index, row in df_sample.iterrows():
#         x = torch.tensor(row.values, dtype=torch.float).to(device)
#         y_pred = model(x)

#         loss = torch.sqrt(loss_fn(y_pred, y[count]))
#         count += 1

#         loss_dict[index] = loss

In [None]:
# indexes = list(sorted(loss_dict))
# indexes[-11:-1]

In [None]:
# Fresh, unmodified copy of the training data (all original columns).
df_orig = pd.read_csv(filepath_or_buffer="processed_train.csv")

In [None]:
# Serialise the whole model object to the file "MLP3" in the working directory.
torch.save(obj=model, f="MLP3")

In [None]:
# Load the held-out trips and keep their identifiers for the submission file.
df_test = pd.read_csv("processed_test.csv")
df_out = df_test["TRIP_ID"].to_frame()
# Strip the same non-feature columns that are removed from the training frame.
df_test = df_test.drop(
    ["Unnamed: 0", "TRIP_ID", "ORIGIN_CALL", "ORIGIN_STAND", "TAXI_ID", "TIMESTAMP"],
    axis="columns",
)

In [None]:
# One float32 row tensor per test trip, on the selected device. The frame is
# converted and transferred once and then split into rows, instead of building
# and moving a separate tensor for every row via `iloc`.
x_test = list(torch.tensor(df_test.to_numpy(), dtype=torch.float).to(device))

In [None]:
model.eval()
with torch.no_grad():
    # One batched forward pass instead of one model call per row; `tolist()`
    # yields plain Python floats, as the per-row `float(...)` conversion did.
    if len(x_test):
        travel_times = model(torch.stack(list(x_test))).reshape(-1).cpu().tolist()
    else:
        travel_times = []
df_out["TRAVEL_TIME"] = travel_times
# # mean(716.43) -> 792.73593
# # median(600) -> 784.74219
df_out.to_csv("my_pred.csv", index=False)
# Preview goes last so the notebook actually displays it.
df_out.head()

In [None]:
# TODO: implement an accuracy (or similar) metric for training
# TODO: do something with validation (make a validation set)