# GJ Project: Training the Siamese Network

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from scipy.spatial import distance
import torch.nn as nn
import torchvision.datasets as dsets
from torch.autograd import Variable
import gjnn.model
import gjnn.loss
import gjnn.dataloader


# Load the pairwise dataset. sep=None lets the python engine sniff the
# delimiter; both user-id columns are read as categoricals.
dataset = pd.read_csv(
    "data/ds_medium.csv",
    sep=None,
    engine="python",
    dtype={"user_id_1": "category", "user_id_2": "category"},
)

# Discard the "ifp_id" column.
dataset = dataset.drop(["ifp_id"], axis=1)


In [None]:
# Quick look at the first rows.
dataset.head()

# Positional column indices selected for each user. Positions 0-5 appear in
# both selections; the remaining positions differ per user.
# NOTE: these are tied to the current column order of the dataset and must be
# updated whenever that order changes.
features_user_1 = list(range(12)) + [15]
features_user_2 = list(range(6)) + list(range(16, 23))

# Show which column names the two selections resolve to.
print(dataset.columns[features_user_1])
print(dataset.columns[features_user_2])

# Slice out one frame per user and show both.
user_1 = dataset.iloc[:, features_user_1]
user_2 = dataset.iloc[:, features_user_2]
print(user_1)
print(user_2)

# Total number of rows (pairs) in the dataset.
print(len(dataset))

In [None]:
# Pull the "distance_1" column out of the first user's frame for inspection.
user_1_dist = user_1.loc[:, "distance_1"]
user_1_dist

In [None]:
# Run pd.to_numeric over every column of the dataset.
dataset = dataset.apply(pd.to_numeric)

# Hold-out split: a reproducible (random_state=200) 95% random sample of the
# rows is used for training; the rows that were not sampled (the remaining 5%)
# are used for validation.
train = dataset.sample(frac=0.95, random_state=200)
test = dataset.drop(train.index)

# Wrap both frames in the project's custom Dataset class. There are no output
# labels in the classical sense, so a plain TensorDataset is not used here.
train, test = gjnn.dataloader.Dataset(train), gjnn.dataloader.Dataset(test)

In [None]:
# Derive the number of epochs from a fixed iteration budget:
#   epochs = n_iters / (batches per epoch), truncated to an integer.
batch_size = 64
n_iters = 50000
num_epochs = int(n_iters / (len(train) / batch_size))
print("The number of epochs is: " + str(num_epochs))

In [None]:


# Mini-batch iterators over the two splits: the training loader shuffles its
# samples, the validation loader keeps them in a fixed order.
train_loader = torch.utils.data.DataLoader(
    dataset=train,
    batch_size=batch_size,
    shuffle=True,
)
test_loader = torch.utils.data.DataLoader(
    dataset=test,
    batch_size=batch_size,
    shuffle=False,
)

print("After data loading")

# Network and optimiser hyperparameters.
hidden_layer_size = 20
siamese_layer_size = 20
output_layer_size = 1
# Matches the 13 column indices listed per user earlier in the notebook.
num_features_per_branch = 13
lr = 0.01
# NOTE(review): `momentum` is not passed to the Adam optimizer built below.
momentum = 0.9
# NOTE(review): the training loop iterates over `num_epochs` (computed from
# n_iters); `num_epoch` does not appear to be used anywhere - confirm and drop.
num_epoch = 5

# Build the Siamese model from the sizes defined above.
model = gjnn.model.SiameseNetwork(
    num_features_per_branch,
    siamese_layer_size,
    hidden_layer_size,
    output_layer_size,
)
print("Model correctly initialized...")

# Project-specific distance-based loss.
criterion = gjnn.loss.DistanceLoss()
print("Distance Loss Correctly Initialized...")

# The optimizer in use is Adam, driven by the learning rate `lr`.
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
print("Optimizer Instantiated...")


# Global step counter, incremented once per batch by the training loop.
# NOTE: this name shadows the builtin `iter` for the rest of the notebook, so
# `iter(...)` / `next(iter(...))` cannot be used after this line.
iter = 0

# Sanity check on ONE batch of the training loader: print its index and the
# four tensors it yields. Stopping after the first batch avoids iterating and
# printing the entire training set.
for i, (user1, user2, user1_dist, user2_dist) in enumerate(train_loader):
    print(i)
    print(user1, user2, user1_dist, user2_dist)
    break

In [None]:
# Train the Siamese network for `num_epochs` passes over the training loader,
# recording the loss of every batch in `losses`.
losses = []
for epoch in range(num_epochs):
    print("Epoch " + str(epoch))
    print(train_loader)
    for i, (user_1, user_2, user_1_dist, user_2_dist) in enumerate(train_loader):
        # Each batch yields the features of both users plus their distances.
        features_u1 = Variable(user_1)
        features_u2 = Variable(user_2)
        dist_u1 = Variable(user_1_dist)
        dist_u2 = Variable(user_2_dist)

        # Clear gradients accumulated by the previous step.
        optimizer.zero_grad()

        # Branch 1 receives the first user's features, branch 2 the second's.
        outputs = model(features_u1, features_u2)

        # Feed the wrapped distances (not the raw batch tensors) to the loss so
        # that every input of the criterion is wrapped the same way.
        loss = criterion(dist_u1, dist_u2, outputs)
        # Keep only a detached copy: appending `loss` itself would retain the
        # autograd history of every step and make memory grow with the number
        # of iterations.
        losses.append(loss.detach())
        print("loss for i {} is equal to: {}".format(i, loss))

        # Backpropagate and update the parameters.
        loss.backward()
        optimizer.step()

        iter += 1
        print(iter)
        # TODO: periodically (e.g. every 500 iterations) evaluate the model on
        # `test_loader` and report the validation loss. A smaller interval
        # gives more feedback but spends more time on evaluation.

In [None]:
# Dump every recorded training loss, one per line, in recording order.
for recorded_loss in losses:
    print(recorded_loss)