In [123]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.autograd import Variable


In [31]:
# Read the clustered school records, keep only the identifying columns, and
# cast Zip and Phone to str in a single chained expression.
# NOTE(review): the displayed Phone values end in '.0', so the column was
# presumably parsed as float before the cast — confirm against the CSV.
raw = (
    pd.read_csv('../dat/schools_w_clusters.csv')
    .loc[:, ['Cluster ID', 'Id', 'Site name', 'Address', 'Zip', 'Phone']]
    .astype({'Zip': str, 'Phone': str})
)
raw.head()

Unnamed: 0,Cluster ID,Id,Site name,Address,Zip,Phone
0,0,0,salvation army - temple / salvation army,1 n ogden ave,,2262649.0
1,0,1,salvation army - temple / salvation army,1 n ogden ave,,2262649.0
2,0,215,salvation army temple,1 n. ogden,,2262649.0
3,0,509,salvation army - temple / salvation army,1 n ogden ave,,2262649.0
4,0,510,salvation army - temple / salvation army,1 n ogden ave,,2262649.0


In [32]:
# Report the longest value found in each text column.
for message, column in (
    ('name max len =', 'Site name'),
    ('address max len =', 'Address'),
    ('Zip max len =', 'Zip'),
    ('phone max len =', 'Phone'),
):
    print(message, raw[column].str.len().max())

name max len = 95
address max len = 43
Zip max len = 7
phone max len = 9


Together these give a maximum combined record length of 95 + 43 + 7 + 9 = 154 characters.

The following (rather convoluted) chain is what it takes to turn a NumPy array into a usable float Torch tensor of the correct size, wrapped in a `Variable`.

In [137]:
# NumPy array -> torch tensor -> float tensor -> Variable, reshaped to one row of 10.
Variable(
    torch.from_numpy(np.random.rand(10)).float()
).view(1, 10)

Variable containing:
 0.8311  0.6418  0.2959  0.9816  0.4597  0.6524  0.2534  0.1469  0.5648  0.7182
[torch.FloatTensor of size 1x10]

In [166]:
def extend_to_length(string_to_expand, length):
    """Right-pad ``string_to_expand`` with ``'~'`` up to ``length`` characters.

    A string that is already ``length`` characters or longer comes back
    unchanged, because a non-positive repeat count yields an empty pad.
    """
    shortfall = length - len(string_to_expand)
    return ''.join((string_to_expand, '~' * shortfall))

def record_formatter(
    record,
    field_widths=(('Site name', 95), ('Address', 43), ('Zip', 7), ('Phone', 9)),
    pad_char='~',
):
    """Encode one record as a fixed-width ``1 x N`` float tensor of character codes.

    Each field named in ``field_widths`` is read from ``record``, converted to
    ``str``, right-padded with ``pad_char`` and cut to exactly its width. The
    fields are concatenated in order and every character is replaced by its
    ``ord`` value, so ``N`` is always the sum of the widths (154 by default).

    Parameters
    ----------
    record : mapping-like (e.g. a DataFrame row or dict)
        Must support ``record[column]`` for every column in ``field_widths``.
    field_widths : sequence of (column, width) pairs
        Fields to encode and the number of characters reserved for each.
        The defaults are the widths this function has always used.
    pad_char : str
        Single character used to fill short fields.

    Returns
    -------
    Variable wrapping a float tensor of size ``(1, sum of widths)``.

    Raises
    ------
    KeyError
        If ``record`` lacks one of the requested columns (for dict/Series input).
    """
    padded_fields = []
    for column, width in field_widths:
        # str() lets non-string values (e.g. a float NaN) be encoded instead of
        # failing on len(); real strings pass through unchanged.
        text = str(record[column])
        # Pad short values and truncate long ones so every field occupies
        # exactly `width` characters and the tensor width never varies.
        padded_fields.append(text.ljust(width, pad_char)[:width])

    encoded = ''.join(padded_fields)
    characters = np.array([ord(ch) for ch in encoded], dtype=np.int64)

    return Variable(torch.from_numpy(characters).float()).view(1, len(characters))

In [175]:
class SiameseNetwork(nn.Module):
    """Siamese network: both inputs pass through the same layers and weights.

    Each input is mapped ``input_size -> 100 -> 50`` by ``dense1`` and then
    ``50 -> 40 -> 10 -> 2`` by ``fc1``, giving a 2-dimensional output per row.

    Parameters
    ----------
    input_size : int
        Number of features in each input row. Defaults to 154, the width
        that was previously hard-coded.
    """

    def __init__(self, input_size=154):
        super(SiameseNetwork, self).__init__()
        self.dense1 = nn.Sequential(
            nn.Linear(input_size, 100),
            nn.ReLU(inplace=True),
            nn.Linear(100, 50))

        self.fc1 = nn.Sequential(
            nn.Linear(50, 40),
            nn.ReLU(inplace=True),
            nn.Linear(40, 10),
            nn.Linear(10, 2))

    def forward_once(self, x):
        """Run a single input batch through ``dense1`` and ``fc1``."""
        output = self.dense1(x)
        # Flatten everything after the first (batch) dimension before fc1.
        output = output.view(output.size()[0], -1)
        output = self.fc1(output)
        return output

    def forward(self, input1, input2):
        """Return the pair ``(output1, output2)`` computed with shared weights."""
        output1 = self.forward_once(input1)
        output2 = self.forward_once(input2)
        return output1, output2
    
class ContrastiveLoss(torch.nn.Module):
    """Contrastive loss over pairs of row vectors.

    For each row the squared Euclidean distance ``d^2`` between ``x0`` and
    ``x1`` is computed. The per-row loss is
    ``y * d^2 + (1 - y) * max(margin - d, 0)^2``; the result is the sum over
    rows divided by 2 and by the number of rows in ``x0``.

    Parameters
    ----------
    margin : float
        Distance beyond which a ``y = 0`` pair contributes no loss.
    eps : float
        Lower bound applied to ``d^2`` before the square root. Without it the
        derivative of ``sqrt`` at 0 is infinite and identical inputs yield
        NaN gradients.
    """

    def __init__(self, margin=1.0, eps=1e-12):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin
        self.eps = eps

    def forward(self, x0, x1, y):
        """Return the scalar loss for the pairs ``(x0, x1)`` with labels ``y``.

        ``y`` may be a scalar or a per-row tensor; it multiplies the ``d^2``
        term while ``1 - y`` multiplies the margin term.
        """
        # Euclidean distance, row by row
        diff = x0 - x1
        dist_sq = torch.sum(torch.pow(diff, 2), 1)
        # Clamp before sqrt so the gradient stays finite when x0 == x1.
        dist = torch.sqrt(torch.clamp(dist_sq, min=self.eps))

        # Only pairs closer than the margin are penalised by the (1 - y) term.
        hinge = torch.clamp(self.margin - dist, min=0.0)
        loss = y * dist_sq + (1 - y) * torch.pow(hinge, 2)
        loss = torch.sum(loss) / 2.0 / x0.size()[0]
        return loss

In [172]:
# Encode the first two rows of the table as input tensors.
inpt1, inpt2 = (record_formatter(raw.iloc[row]) for row in (0, 1))

In [173]:
# Build the siamese network with its default-initialised layers.
model = SiameseNetwork()

In [174]:
# Call the module itself rather than .forward() so any registered hooks run.
model(inpt1, inpt2)

(Variable containing:
 -1.5019  0.5559
 [torch.FloatTensor of size 1x2], Variable containing:
 -1.5019  0.5559
 [torch.FloatTensor of size 1x2])

In [176]:
# Contrastive loss with the default margin of 1.0.
loss = ContrastiveLoss()

In [177]:
# Call the module itself rather than .forward() so any registered hooks run.
# NOTE(review): the loss is applied to the raw encoded records here, not to
# the network's outputs — presumably a sanity check of the loss; confirm.
loss(inpt1, inpt2, 1)

Variable containing:
 0
[torch.FloatTensor of size 1]

In [192]:
# Encode the third row of the table.
inpt3 = record_formatter(raw.iloc[2])

# Call the module itself rather than .forward() so any registered hooks run.
# NOTE(review): as above, this compares raw encoded records rather than the
# network's outputs — confirm that is intended.
loss(inpt1, inpt3, 1)

Variable containing:
 31255
[torch.FloatTensor of size 1]

In [193]:
# SGD with momentum over every parameter of the siamese network.
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=0.001,
    momentum=0.9,
)