In [1]:
#  ! pip install --quiet seaborn toolz fuzzywuzzy
#  ! pip install -e 'git://github.com/nandanrao/embed-software.git#egg=embed_software'

In [2]:
import time
import logging
import math
import random
import json
from collections import Counter

import numpy as np
import pandas as pd

from gcsfs import GCSFileSystem

import torch
import torch.nn as nn
from torch import optim
#import torchtext

In [3]:
from src.model import InnerProductSimilarity, MarginRankingLoss, StarSpace

In [4]:
logger = logging.getLogger(__name__)

In [5]:
logging.basicConfig(filename='bcohen1.log',format='%(asctime)s %(message)s', datefmt='%Y-%m-%d %H:%M:%S %p')

In [6]:
SAMPLE_SIZE = 100000
SOC_LEVEL = 2
BUBBLE_UP = 2

In [7]:
def get_indeed_texts(path, use_gcs = False, **kwargs):
    """Reads csv with indeed data that turns into test set"""
    if use_gcs:
        fs = GCSFileSystem(project='labor-market-data')
        path = path.replace('..','lmd-classify-dot',1)
        with fs.open(path) as f:
            indeed = pd.read_csv(f, **kwargs)
    else:
        indeed = pd.read_csv(path, **kwargs)

    indeed['title'] = indeed.title.str.lower()
    return indeed

def indeed_test_data(texts, lim, soc_n, use_gcs = False):
    """Make test data from indeed (pre-embedded)"""
    indeed = get_indeed_texts(texts, use_gcs, nrows=lim)
    matcher = make_matcher()
    matches = matcher(indeed.reset_index()).set_index('index')
    return matches.content, get_soc_n(matches.code, soc_n), matches.index

In [8]:
indeed = get_indeed_texts('../data/us/everything.csv',use_gcs=True,nrows=SAMPLE_SIZE)
indeed = indeed['content']

In [9]:
idx = random.sample(range(len(indeed)),int(.8*len(indeed)))

In [10]:
train = indeed[idx]
train.reset_index(drop=True,inplace=True)

valid = indeed.drop(idx)
valid.reset_index(drop=True,inplace=True)

train.head()

0    description of duties the client services spec...
1    part time must be graduate of an accredited pr...
2    part time personal care assistants grow with u...
3    scientist building product formulations lookin...
4    qualified applicants must be at least 21 years...
Name: content, dtype: object

In [11]:
from collections import Counter

def build_vocab(train, min_ct = 2):
    ''' build vocabulary for an array/list/series of text '''
    # To do: smaller groups before aggregating to improve performance
    def wordcount_df(doc):
        tok = doc.split()
        d = pd.DataFrame.from_dict(Counter(tok),orient='index').reset_index().rename(columns={'index':'word'})
        return d

    d_list = [wordcount_df(x) for x in train]

    d = pd.concat(d_list,axis=0)

    d = d.groupby(['word'])[0].sum().sort_values(ascending=False)
    d = d[d >= min_ct]
    
    d = dict(zip(d.index.values, range(len(d))))
    
    return d

In [12]:
lr = .01

train_vocab = build_vocab(indeed)

pd.DataFrame.from_dict(train_vocab,orient='index').to_csv('train_vocab_%s.csv' % SAMPLE_SIZE)

model = StarSpace(
    d_embed=100,
    vocabulary=train_vocab,
    n_input=len(train_vocab),
    k_neg = 10,
    max_norm=20)

opt = torch.optim.Adam(model.parameters(), lr=lr)

batch_size = 100
#for epoch in range(epochs):
#shuffle order of training data and validation data

epochs = 5
iterations = 0
start = time.time()
best_val_acc = -1
print_every = 100
losses = []

In [13]:
for epoch in range(epochs):
    permutation = torch.randperm(len(train)).numpy()

    for i in range(0,len(train), batch_size):
        indices = permutation[i:i+batch_size]
        batch = train[indices]

        model.train(); opt.zero_grad()

        lhs = batch.values

        l_batch, r_batch, neg_batch = model(lhs)

        positive_similarity = torch.bmm(l_batch,r_batch.transpose(2,1)) #this is the same as dot product by row

        negative_similarity = torch.bmm(l_batch, neg_batch.transpose(2,1)).squeeze(1)

        loss = torch.sum(torch.clamp(.1 - positive_similarity + negative_similarity, min=0))
        losses.append(loss.detach().numpy())

        loss.backward(); opt.step()

        accuracy_check = positive_similarity.squeeze(1) > negative_similarity[:,0].unsqueeze(1)
        acc = np.mean(accuracy_check.detach().numpy())

        if i % (print_every*batch_size) == 0:
            print(loss)
    
    print('Finished epoch %s at %s.' % (epoch,time.ctime()))

tensor(1770820.2500, grad_fn=<SumBackward0>)
tensor(832375.3750, grad_fn=<SumBackward0>)
tensor(52969.1641, grad_fn=<SumBackward0>)
tensor(117691.8828, grad_fn=<SumBackward0>)
tensor(194385.6406, grad_fn=<SumBackward0>)
tensor(166289.3438, grad_fn=<SumBackward0>)
tensor(59307.4297, grad_fn=<SumBackward0>)
tensor(3915.5503, grad_fn=<SumBackward0>)
Finished epoch 0 at Mon Jun  8 09:41:11 2020.
tensor(31230.6992, grad_fn=<SumBackward0>)
tensor(689107.1875, grad_fn=<SumBackward0>)
tensor(41807.2656, grad_fn=<SumBackward0>)
tensor(111490.8984, grad_fn=<SumBackward0>)
tensor(15529.7617, grad_fn=<SumBackward0>)
tensor(86387.2109, grad_fn=<SumBackward0>)
tensor(290320.3438, grad_fn=<SumBackward0>)
tensor(42863.6562, grad_fn=<SumBackward0>)
Finished epoch 1 at Mon Jun  8 10:44:40 2020.
tensor(46118.9414, grad_fn=<SumBackward0>)
tensor(3284.4058, grad_fn=<SumBackward0>)
tensor(3693.9802, grad_fn=<SumBackward0>)
tensor(26267.3203, grad_fn=<SumBackward0>)
tensor(31527.9551, grad_fn=<SumBackward0>)

In [14]:
#Save the weights to CSV
weights = model.input_embedding.weight
weights = weights.data.detach().numpy()
np.savetxt("weights_%s.csv" % SAMPLE_SIZE, weights, delimiter=",")

In [None]:
#Validate

In [15]:
model.eval()
val_acc = 0

In [16]:
for j in range(0,len(valid), batch_size):
    valperm = torch.randperm(len(valid)).numpy()
    val_indices = valperm[j:j+batch_size]
    val_batch = valid[val_indices]

    val_lhs = val_batch.values

    val_l_batch, val_r_batch, val_neg_batch = model(val_lhs)

    val_positive_similarity = torch.bmm(val_l_batch,val_r_batch.transpose(2,1))
    val_negative_similarity = torch.bmm(val_l_batch, val_neg_batch.transpose(2,1)).squeeze(1)

    val_loss = torch.sum(torch.clamp(.1 - val_positive_similarity + val_negative_similarity, min=0))
    if j % (print_every*batch_size) == 0:
        print(val_loss)
    
    val_accuracy_check = val_positive_similarity.squeeze(1) > val_negative_similarity[:,0].unsqueeze(1)
    val_acc += np.sum(val_accuracy_check.detach().numpy())

tensor(5317.2021, grad_fn=<SumBackward0>)
tensor(1226.6842, grad_fn=<SumBackward0>)


In [17]:
print(val_acc/len(valid))

0.99935


### Old stuff

### Begin the training loop

In [None]:
# batch = train[0:100]
# validation = train[100:150]
# batch_size = 100

# model.train()
# opt.zero_grad()

# lhs = batch.values

# l_batch, r_batch, neg_batch = model(lhs)

# positive_similarity = torch.bmm(l_batch,r_batch.transpose(2,1)) #this is the same as dot product by row

# negative_similarity = torch.bmm(l_batch, neg_batch.transpose(2,1)).squeeze(1)

# loss = torch.mean(torch.clamp(.1 - positive_similarity + negative_similarity, min=0))
# loss

# loss.backward(); opt.step()

### Calculate accuracy of predictions in current batch

In [None]:
accuracy_check = positive_similarity.squeeze(1) > negative_similarity[:,0].unsqueeze(1)
acc = np.mean(similarity_check.detach().numpy())
print(acc)

In [None]:
# def embed_doc(d,vocab,embedding,normalize=False):
#     positions = []
#     for t in d:
#         try:
#             positions.append(vocab[t])
#         except KeyError:
#             pass
#     output = torch.sum(embedding(torch.LongTensor(positions)),dim=0)
#     if normalize:
#         output = output / output.norm()
#     return output

In [None]:
# # similarity
# l_batch = []
# r_batch = []
# neg_batch = []

# for i in range(len(batch)):
#     #Positive similarity
#     s = batch.values[i].split('\t') #sentences
#     if type(s) == str: #only one sentence in s
#         a = s
#         b = s
#     else:
#         a, b = np.random.choice(s, 2, False)
    
#     a = a.split()
#     b = b.split()
    
#     a_emb = embed_doc(a,train_vocab,input_embedding,normalize=True)
#     b_emb = embed_doc(b,train_vocab,input_embedding,normalize=True)
    
#     l_batch.append(a_emb)
#     r_batch.append(b_emb)

#     #Negative similarity
#     negs = []
#     for _i in range(k * 3):
#         index = np.random.choice(len(batch))
#         if not index == i: #if it's not from the same document
#             c = batch.values[index].split('\t')
#             c = np.random.choice(c, 1)[0].split()
#             c_emb = embed_doc(c,train_vocab,input_embedding,normalize=True)
#             negs.append(c_emb)
#             if(len(negs) >= k):
#                 break
    
#     neg_batch.append(torch.stack(negs))