In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.autograd import Variable
import matplotlib.pyplot as plt

In [2]:
# Load the clustered school records, keep only the columns used below, and
# cast Zip and Phone to strings so they can be padded and encoded per character.
raw = (
    pd.read_csv('../dat/schools_w_clusters.csv')
    [['Cluster ID', 'Id', 'Site name', 'Address', 'Zip', 'Phone']]
    .astype({'Zip': str, 'Phone': str})
)
raw.head()

Unnamed: 0,Cluster ID,Id,Site name,Address,Zip,Phone
0,0,0,salvation army - temple / salvation army,1 n ogden ave,,2262649.0
1,0,1,salvation army - temple / salvation army,1 n ogden ave,,2262649.0
2,0,215,salvation army temple,1 n. ogden,,2262649.0
3,0,509,salvation army - temple / salvation army,1 n ogden ave,,2262649.0
4,0,510,salvation army - temple / salvation army,1 n ogden ave,,2262649.0


In [3]:
# Report the longest value found in each text column; record_formatter pads
# every field out to these widths.
for caption, column in (
    ('name max len =', 'Site name'),
    ('address max len =', 'Address'),
    ('Zip max len =', 'Zip'),
    ('phone max len =', 'Phone'),
):
    print(caption, raw[column].str.len().max())

name max len = 95
address max len = 43
Zip max len = 7
phone max len = 9


Together these give a fixed record length of 95 + 43 + 7 + 9 = 154 characters.

## defs
The following chain is how we convert a NumPy array into a usable Torch tensor: cast it to float, wrap it in a `Variable`, and reshape it to a single row of the correct size.

In [4]:
# Demo of the conversion chain on 10 random values:
# NumPy array -> torch tensor -> float tensor -> autograd Variable -> 1 x 10 row.
Variable(torch.from_numpy(np.random.rand(10)).float()).view(1,10)

Variable containing:
 0.0555  0.6770  0.7945  0.2175  0.6378  0.7369  0.5262  0.1299  0.3192  0.6213
[torch.FloatTensor of size 1x10]

In [5]:
def extend_to_length(string_to_expand, length):
    """Right-pad ``string_to_expand`` with '~' up to ``length`` characters.

    A string that already has ``length`` characters or more is returned
    unchanged; it is never truncated.
    """
    return string_to_expand.ljust(length, '~')

# Padded width of each text field, in the order the fields are concatenated.
# These are the per-column maximum lengths printed earlier in the notebook;
# they sum to 154, the default ``v_size`` of ae_net.
FIELD_WIDTHS = (
    ('Site name', 95),
    ('Address', 43),
    ('Zip', 7),
    ('Phone', 9),
)


def record_formatter(record, field_widths=FIELD_WIDTHS):
    """Encode one record as a fixed-width 1 x N float tensor of character codes.

    Each field is right-padded with '~' to its configured width, the padded
    fields are concatenated, and every character is replaced by its code
    point (``ord``).

    Parameters
    ----------
    record : mapping (e.g. a DataFrame row)
        Must provide a string value for every field named in ``field_widths``.
    field_widths : sequence of (field name, width) pairs
        Fields to encode, in order, with their padded widths. Defaults to the
        four text columns of the schools data set (154 characters in total).

    Returns
    -------
    Variable wrapping a float tensor of shape (1, sum of widths).

    Raises
    ------
    ValueError
        If a field is longer than its width. Padding never truncates, so the
        vector would otherwise come out with the wrong size and only fail
        later inside the network with an opaque shape error.
    """
    padded = ''.join(
        extend_to_length(record[field], width) for field, width in field_widths
    )

    expected_len = sum(width for _, width in field_widths)
    if len(padded) != expected_len:
        raise ValueError(
            'encoded record has {} characters, expected {}; '
            'a field exceeds its configured width'.format(len(padded), expected_len)
        )

    characters = np.array([ord(ch) for ch in padded])

    return Variable(torch.from_numpy(characters).float()).view(1, len(characters))

In [10]:
class ae_net(nn.Module):
    """Fully connected autoencoder over fixed-width record vectors.

    The encoder maps a ``v_size``-wide input down to an ``enc_size``-wide
    code through layers of 400, 200 and 100 units; the decoder maps the code
    back up to ``v_size`` through layers of 75, 100 and 125 units.

    Parameters
    ----------
    v_size : int
        Width of the input (and reconstructed) vector. Defaults to 154.
    enc_size : int
        Width of the encoded representation. Defaults to 50.
    """

    def __init__(self, v_size=154, enc_size=50):
        super(ae_net, self).__init__()

        # nn.ReLU(True) applies the activation in place.
        self.encoder = nn.Sequential(
            nn.Linear(v_size, 400),
            nn.ReLU(True),
            nn.Linear(400, 200),
            nn.ReLU(True),
            nn.Linear(200, 100),
            nn.ReLU(True),
            nn.Linear(100, enc_size)
        )

        self.decoder = nn.Sequential(
            nn.Linear(enc_size, 75),
            nn.ReLU(True),
            nn.Linear(75, 100),
            nn.ReLU(True),
            nn.Linear(100, 125),
            nn.ReLU(True),
            nn.Linear(125, v_size)
        )

    def forward(self, vector):
        """Standard nn.Module entry point; same as ``autoencode``.

        Defined so that calling the module directly (``model(x)``) works
        instead of raising because no forward pass is implemented.
        """
        return self.autoencode(vector)

    def autoencode(self, vector):
        """Encode ``vector`` and decode it again; returns the reconstruction."""
        return self.decoder(self.encoder(vector))

    def encode(self, vector):
        """Return the ``enc_size``-wide code for ``vector``."""
        return self.encoder(vector)
        
class disc_net(nn.Module):
    """Pairwise discriminator over two encoded records.

    The two ``enc_size``-wide codes are concatenated along dim 1 and passed
    through layers of 100 and 50 units to a 2-way output. The output is
    log-probabilities (LogSoftmax), so it pairs with ``nn.NLLLoss``; apply
    ``torch.exp`` to obtain probabilities.

    Parameters
    ----------
    enc_size : int
        Width of each encoded input. Defaults to 50.
    """

    def __init__(self, enc_size=50):
        super(disc_net, self).__init__()

        self.discriminator = nn.Sequential(
            nn.Linear(enc_size*2, 100),
            nn.ReLU(True),
            nn.Linear(100, 50),
            nn.ReLU(True),
            nn.Linear(50,2),
            # Normalise over the class dimension explicitly; leaving dim out
            # relies on a deprecated implicit choice and emits a warning.
            nn.LogSoftmax(dim=1)
        )

    def forward(self, input1, input2):
        """Standard nn.Module entry point; same as ``discriminate``."""
        return self.discriminate(input1, input2)

    def discriminate(self, input1, input2):
        """Return (batch, 2) log-probabilities for the pair of encodings.

        ``input1`` and ``input2`` must both be 2-D with the same batch size,
        since they are concatenated along dim 1.
        """
        output = self.discriminator(torch.cat([input1, input2], dim=1))
        return output
    

## data characteristics

In [7]:
# (number of records, number of columns) of the trimmed frame.
raw.shape

(2693, 6)

In [8]:
# Number of distinct 'Cluster ID' values, shown as a 1-tuple shape.
raw['Cluster ID'].unique().shape

(740,)

## training

In [12]:
# Seed torch's global RNG before any weights are created so that weight
# initialisation is repeatable from one run of the notebook to the next.
torch.manual_seed(0)

learning_rate = 0.001

# Autoencoder, trained to reproduce its input under mean-squared error.
model1 = ae_net()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model1.parameters(), lr=learning_rate)

In [13]:
%%time
ae_loss = []

model1.train()

# train autoencoder
for epoch in range(1):
    temp_loss = 0
    
    for i in range(raw.shape[0]):
        # build data pairs
        inpt = record_formatter(raw.iloc[i])

        # forward
        otpt = model1.autoencode(inpt)
        loss = criterion(otpt, inpt)
        
        # backprop
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # logging
        temp_loss += loss

    # logging
    ae_loss.append(temp_loss.data[0]/raw.shape[0])
    
model1.eval()

CPU times: user 2min 31s, sys: 6.67 s, total: 2min 38s
Wall time: 23.1 s


In [14]:
# Same learning rate value as used for the autoencoder.
learning_rate = 0.001

# NOTE: `criterion` and `optimizer` are rebound here, replacing the
# autoencoder's MSE loss and optimizer created in an earlier cell.
model2 = disc_net()
# NLLLoss takes log-probabilities, which disc_net's final LogSoftmax produces.
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(model2.parameters(), lr=learning_rate)

In [18]:
%%time

disc_loss = []
diff = 1
model2.train()

# train discriminator
for epoch in range(1):
    temp_loss = 0
    
    for i in range(raw.shape[0]-diff):
        # build data pairs
        inpt1 = model1.encode(record_formatter(raw.iloc[i]))
        inpt2 = model1.encode(record_formatter(raw.iloc[i+diff]))
        label = 1 if (raw.iloc[i]['Cluster ID'] == raw.iloc[i+diff]['Cluster ID']) else 0
        label = Variable(torch.LongTensor([label]))
        
        # forward
        otpt = model2.discriminate(inpt1, inpt2)
        loss = criterion(otpt, label)
        
        # backprop
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # logging
        temp_loss += loss

    # logging
    disc_loss.append(temp_loss.data[0]/raw.shape[0])
    
model2.eval()

CPU times: user 40.3 s, sys: 1min 29s, total: 2min 9s
Wall time: 16.8 s


Don't forget to exponentiate the output: the discriminator ends in `LogSoftmax`, so it returns log-probabilities rather than probabilities.

In [16]:
# Convert the log-probabilities back to probabilities for one pair.
# NOTE: inpt1/inpt2 are the last pair left over from the discriminator
# training loop, so this cell only works after that loop has run.
torch.exp(model2.discriminate(inpt1, inpt2))

Variable containing:
 0.4712  0.5288
[torch.FloatTensor of size 1x2]

In [19]:
# Per-epoch mean losses recorded by the two training loops.
print('ae loss =', ae_loss)
print('disc ls =', disc_loss)

ae loss = [384.9202098031935]
disc ls = [0.5623177784417935]
