In [1]:
#  %pip install --quiet seaborn toolz fuzzywuzzy
#  %pip install -e 'git+https://github.com/nandanrao/embed-software.git#egg=embed_software'

In [2]:
import time
import math
import random
import json
from collections import Counter

import numpy as np
import pandas as pd

from gcsfs import GCSFileSystem

import torch
import torch.nn as nn
from torch import optim
#import torchtext

In [3]:
from src.model import InnerProductSimilarity, MarginRankingLoss, StarSpace

In [4]:
# Notebook-wide settings.
# NOTE(review): none of these three names is read by the cells visible in this
# notebook (the load cell hard-codes nrows=1000) — confirm they are still needed.
SAMPLE_SIZE = 100_000  # presumably the number of postings to sample — TODO confirm
SOC_LEVEL = 2          # presumably the SOC code granularity (cf. `soc_n`) — TODO confirm
BUBBLE_UP = 2          # meaning not evident from the visible code — TODO confirm

In [5]:
def get_indeed_texts(path, use_gcs = False, **kwargs):
    """Load the Indeed postings CSV and lower-case its ``title`` column.

    Parameters
    ----------
    path : str or file-like
        Location of the CSV. With ``use_gcs=True`` the first ``'..'`` in the
        path is swapped for the ``lmd-classify-dot`` bucket name before opening.
    use_gcs : bool
        Read through ``GCSFileSystem`` (project ``labor-market-data``) instead
        of the local filesystem.
    **kwargs
        Forwarded unchanged to ``pandas.read_csv`` (e.g. ``nrows``).

    Returns
    -------
    pandas.DataFrame
        The parsed frame, with ``title`` replaced by its lower-cased version.
    """
    if not use_gcs:
        postings = pd.read_csv(path, **kwargs)
    else:
        gcs = GCSFileSystem(project='labor-market-data')
        # Only the first '..' is rewritten, so a relative local path maps onto the bucket.
        bucket_path = path.replace('..','lmd-classify-dot',1)
        with gcs.open(bucket_path) as handle:
            postings = pd.read_csv(handle, **kwargs)

    lowered_titles = postings.title.str.lower()
    postings['title'] = lowered_titles
    return postings

def indeed_test_data(texts, lim, soc_n, use_gcs = False):
    """Build a labelled test set from the Indeed postings.

    Reads at most ``lim`` rows from ``texts`` via ``get_indeed_texts``, runs
    the matcher returned by ``make_matcher()`` over them, and returns a
    3-tuple: the matched ``content`` column, ``get_soc_n(code, soc_n)`` applied
    to the matched ``code`` column, and the index of the matched rows (the
    original row positions, restored via ``set_index('index')``).

    NOTE(review): ``make_matcher`` and ``get_soc_n`` are neither defined nor
    imported in the visible cells — they must already be in scope for this
    function to run.
    """
    postings = get_indeed_texts(texts, use_gcs, nrows=lim)
    match_fn = make_matcher()
    # reset_index() exposes the row position as an 'index' column so it survives matching.
    matched = match_fn(postings.reset_index()).set_index('index')
    content = matched.content
    labels = get_soc_n(matched.code, soc_n)
    return content, labels, matched.index

In [6]:
# Read the first 1000 postings from the GCS copy of the dataset and keep only
# the posting text; `train` is a Series of strings from the start.
train = get_indeed_texts('../data/us/everything.csv', use_gcs=True, nrows=1000)['content']
train.head()

0    part time temporary do you have or know someon...
1    40 000 46 000 year lead electrician minimum ye...
2    front desk position chiropractic office monday...
3    110 000 130 000 year job title sec reporting m...
4    internship avakas is unique place where ideas ...
Name: content, dtype: object

In [7]:
from collections import Counter

def build_vocab(train, min_ct = 2):
    ''' build vocabulary for an array/list/series of text

    Each document is tokenised with ``str.split()`` (whitespace). Word counts
    are accumulated over all documents in a single ``Counter`` rather than one
    DataFrame per document, so the cost is linear in the number of tokens.

    Parameters
    ----------
    train : iterable of str
        The documents (list, array or pandas Series).
    min_ct : int
        Minimum total count a word needs to be kept.

    Returns
    -------
    dict
        ``{word: id}`` with ids ``0..n-1`` assigned from most to least
        frequent word; words with equal counts are ordered alphabetically so
        the mapping is deterministic. Empty input yields ``{}``.
    '''
    counts = Counter()
    for doc in train:
        counts.update(doc.split())

    kept = [(word, ct) for word, ct in counts.items() if ct >= min_ct]
    # Descending count, then alphabetical, so ties never depend on input order.
    kept.sort(key=lambda item: (-item[1], item[0]))

    return {word: idx for idx, (word, _) in enumerate(kept)}

In [8]:
# Word -> integer-id lookup built from the loaded postings. With build_vocab's
# default min_ct=2, words seen only once across the sample are left out.
train_vocab = build_vocab(train)

In [9]:
# train_iter, val_iter = data.BucketIterator.splits(
#     (train, validation), batch_size=batch_size, device=gpu)

In [10]:
# Instantiate the project's StarSpace model (imported from src.model).
# n_input is taken from len(train_vocab), so it always matches the vocabulary.
# NOTE(review): the meaning of d_embed / max_norm / aggregate is defined in
# src.model, which is not shown here — presumably embedding width, embedding
# norm cap and the token-pooling function; confirm against StarSpace.__init__.
model = StarSpace(
    d_embed=100,
    vocabulary=train_vocab,
    n_input=len(train_vocab),
    similarity=InnerProductSimilarity(),
    max_norm=20,
    aggregate=torch.sum)

In [11]:
# Optimisation setup: Adam over the StarSpace model's parameters.
lr = .01
# NOTE(review): `criterion` (margin=1.) is not called by any live cell below; the
# loss is computed by hand further down with a 0.1 margin — confirm which
# margin is intended.
criterion = MarginRankingLoss(margin=1., aggregate=torch.mean)
opt = torch.optim.Adam(model.parameters(), lr=lr)

### Begin the training loop

In [None]:
# Carve one mini-batch and a 50-row hold-out slice off the front of the sample;
# both slice bounds are derived from batch_size so they stay in step.
batch_size = 100
batch = train[:batch_size]
validation = train[batch_size:batch_size + 50]

In [None]:
# Switch the model to training mode (assuming StarSpace is an nn.Module) and
# clear gradients left on the optimizer's parameters before the backward pass below.
model.train()
opt.zero_grad()

In [None]:
# Raw posting texts of the batch as a NumPy array.
# NOTE(review): `lhs` is not read by any live cell below — confirm it is still needed.
lhs = batch.values

In [None]:
# Stand-alone embedding table using the same sizes passed to StarSpace above
# (one row per vocabulary word, 100 dimensions, max_norm=20).
# NOTE(review): this module is created after `opt` and is not part of `model`, so
# the optimizer built from model.parameters() never updates these weights —
# confirm whether the model's own embedding was meant to be used instead.
input_embedding = nn.Embedding(num_embeddings=len(train_vocab), embedding_dim = 100, max_norm=20)

In [None]:
## negative similarity
# NOTE(review): k is only consumed by the commented-out sampling loop under
# "Old stuff", where it is the number of negative examples kept per document;
# no live cell reads it.
k = 3

In [None]:
# Stack the per-example tensors into one batch tensor per role.
# NOTE(review): l_batch / r_batch / neg_batch are not built by any live cell
# above; the only code that fills them is the commented-out loop under
# "Old stuff". As far as the visible cells show, this raises NameError on a
# fresh kernel. The cell also rebinds its own inputs, so it presumably cannot
# be re-run without rebuilding the lists first.
l_batch = torch.stack(l_batch)
r_batch = torch.stack(r_batch)
neg_batch = torch.stack(neg_batch)

# Insert a singleton axis at dim 1 so both sides are 3-D, as torch.bmm requires.
l_batch = l_batch.unsqueeze(1)
r_batch = r_batch.unsqueeze(1)

In [None]:
# Batched (B, 1, D) x (B, D, 1) product: one inner product per batch row.
# Both singleton axes are kept, so the result has shape (B, 1, 1).
positive_similarity = torch.bmm(l_batch,r_batch.transpose(2,1)) #this is the same as dot product by row

In [None]:
# Batched (B, 1, D) x (B, D, K) product, then squeeze(1): inner product of each
# left-hand row with each of its K negatives, giving shape (B, K).
# K is whatever neg_batch holds per row — presumably `k`; TODO confirm.
negative_similarity = torch.bmm(l_batch, neg_batch.transpose(2,1)).squeeze(1)

In [None]:
# Hinge (margin-ranking) loss: each negative should score at least 0.1 below
# the positive of the SAME batch row.
# positive_similarity is (B, 1, 1) and negative_similarity is (B, K); adding
# them directly broadcasts to (B, B, K), pairing every positive with every
# other row's negatives. Reshaping the positives to (B, 1) keeps the
# comparison row-wise and yields a (B, K) hinge matrix before the mean.
# NOTE(review): the margin here is 0.1 while `criterion` above was built with
# margin=1. — confirm which one is intended.
loss = torch.mean(torch.clamp(.1 - positive_similarity.reshape(-1, 1) + negative_similarity, min=0))
loss

In [None]:
# Backpropagate the batch loss, then apply one Adam update.
# NOTE(review): `opt` was created from model.parameters() only, so this step
# changes nothing that lives outside `model` — confirm the tensors feeding the
# loss actually come from the model's parameters.
loss.backward(); opt.step()

### Calculate accuracy of predictions in current batch

In [None]:
# Candidate right-hand-side indices: a (100, 100) int64 tensor whose every row
# is [0, 1, ..., 99]. expand() returns a broadcast view rather than a copy.
# The deprecated torch.autograd.Variable wrapper is dropped: tensors carry
# autograd state themselves, and wrapping one just returns a tensor.
# NOTE(review): both 100s presumably stand for batch_size — confirm.
candidate_rhs = torch.arange(0, 100).long().expand(100, -1)

### Old stuff

In [None]:
# def embed_doc(d,vocab,embedding,normalize=False):
#     positions = []
#     for t in d:
#         try:
#             positions.append(vocab[t])
#         except KeyError:
#             pass
#     output = torch.sum(embedding(torch.LongTensor(positions)),dim=0)
#     if normalize:
#         output = output / output.norm()
#     return output

In [None]:
# # similarity
# l_batch = []
# r_batch = []
# neg_batch = []

# for i in range(len(batch)):
#     #Positive similarity
#     s = batch.values[i].split('\t') #sentences
#     if type(s) == str: #only one sentence in s
#         a = s
#         b = s
#     else:
#         a, b = np.random.choice(s, 2, False)
    
#     a = a.split()
#     b = b.split()
    
#     a_emb = embed_doc(a,train_vocab,input_embedding,normalize=True)
#     b_emb = embed_doc(b,train_vocab,input_embedding,normalize=True)
    
#     l_batch.append(a_emb)
#     r_batch.append(b_emb)

#     #Negative similarity
#     negs = []
#     for _i in range(k * 3):
#         index = np.random.choice(len(batch))
#         if not index == i: #if it's not from the same document
#             c = batch.values[index].split('\t')
#             c = np.random.choice(c, 1)[0].split()
#             c_emb = embed_doc(c,train_vocab,input_embedding,normalize=True)
#             negs.append(c_emb)
#             if(len(negs) >= k):
#                 break
    
#     neg_batch.append(torch.stack(negs))