In [1]:
# ! pip install -e 'git://github.com/nandanrao/embed-software.git#egg=embed_software'

In [2]:
# ! pip install --quiet fuzzywuzzy gcsfs

In [3]:
import time
import sys
import logging
import math
import random
import json
import pickle
from collections import Counter

import numpy as np
import pandas as pd

from gcsfs import GCSFileSystem
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import davies_bouldin_score

import torch
import torch.nn as nn
from torch import optim
#import torchtext

from validation.data import *



In [4]:
from src.model import StarSpace

In [5]:
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

print("Using device", device)

Using device cuda


In [6]:
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)
formatter = logging.Formatter(fmt="%(asctime)s %(levelname)s: %(message)s", 
                          datefmt="%Y-%m-%d - %H:%M:%S")
fh = logging.FileHandler("separation_model.log", "w")
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
log.addHandler(fh)

In [7]:
SAMPLE_SIZE = 200000
SOC_LEVEL = 3
OUTPUT_WEIGHTS = 'data/separation/'

In [8]:
log.info('Pulling Indeed data for sample size %s' % SAMPLE_SIZE)

In [9]:
# Get job ads data
indeed = get_indeed_texts('../data/us/everything.csv',use_gcs=True,nrows=SAMPLE_SIZE)
indeed = indeed['content']

In [10]:
train = indeed.copy()
del indeed

In [11]:
# Get DOT classifications data
dot, dot_labs = dot_train_data(SOC_LEVEL)

dot.reset_index(drop=True,inplace=True)
dot_labs.reset_index(drop=True,inplace=True)

In [12]:
log.info('About to train vocab')

In [13]:
Vectorizer = CountVectorizer(min_df = 10,
                             max_df = .95)
Vectorizer.fit(train)

train_vocab = Vectorizer.vocabulary_

In [14]:
len(train_vocab)

8878

In [15]:
log.info('Trained Vocab of size %s' % str(len(train_vocab)))

In [16]:
# #Save the file
with open(OUTPUT_WEIGHTS + 'train_vocab_%s' % SAMPLE_SIZE, 'wb') as f:
    pickle.dump(train_vocab, f)

In [17]:
model = StarSpace(
    d_embed=100,
    vocabulary=train_vocab,
    k_neg = 10,
    max_norm=20)

model.to(device)

lr = .01
opt = torch.optim.Adam(model.parameters(), lr=lr)

In [18]:
train_pos = model.get_positions(train)
dot_positions = model.get_positions(dot)

In [19]:
epochs = 3
print_every = 10
log_every = 10
batch_size = 100

losses = []
separation_losses = []
epoch_losses = [1e10]
log.info('Beginning run')

In [20]:
for i in range(len(train_pos)):
    for j in range(len(train_pos[i])):
        train_pos[i][j] = train_pos[i][j].to(device)

In [21]:
for i in range(len(dot_positions)):
    for j in range(len(dot_positions[i])):
        dot_positions[i][j] = dot_positions[i][j].to(device)

In [22]:
def sep_factor(starspace_loss):
#     if star_loss > 5_000_000:
#         return 1_000_000
#     else:
#         return 100_000 
    return 1_000_000

In [23]:
# epoch = 0
# permutation = torch.randperm(len(train_pos)).numpy()
# i = 0
# indices = permutation[i:i+batch_size]
# batch = train_pos[indices]

# model.train(); opt.zero_grad()

# #l_batch, r_batch, neg_batch = model(batch)
# a = batch[0][1]

# model.embed_doc(a.to(device))

In [24]:
for epoch in range(epochs):
    permutation = torch.randperm(len(train_pos)).numpy()
    nan_break = False
    
    for i in range(0,len(train), batch_size):
        indices = permutation[i:i+batch_size]
        batch = train_pos[indices]

        model.train(); opt.zero_grad()

        l_batch, r_batch, neg_batch = model(batch)
        
        # nan tests...
        l_test = np.isnan(np.mean(l_batch.detach().cpu().numpy()))
        r_test = np.isnan(np.mean(r_batch.detach().cpu().numpy()))
        neg_test = np.isnan(np.mean(neg_batch.detach().cpu().numpy()))
        if l_test or r_test or neg_test:
            nan_break = True
            break
        
        positive_similarity = torch.bmm(l_batch,r_batch.transpose(2,1)) #this is the same as dot product by row
        negative_similarity = torch.bmm(l_batch, neg_batch.transpose(2,1)).squeeze(1)

        star_loss = torch.sum(torch.clamp(.1 - positive_similarity + negative_similarity, min=0))
        
        # Now add in clustering loss for DOT categories
        idx = np.random.choice(len(dot_positions),5000)
        dot_sample = dot_positions[idx]
        dot_y_sample = dot_labs[idx]
        
        dot_emb = torch.LongTensor(np.empty([dot_sample.shape[0],100])).to(device)
        for j,doc in enumerate(dot_sample):
            doc_flat = torch.cat([torch.unsqueeze(z,0) for z in doc],1).squeeze(0).to(device)
            dot_emb[j] = model.embed_doc(doc_flat)

        separation_loss = torch.tensor(davies_bouldin_score(dot_emb.cpu(),dot_y_sample) * 1000000).to(device)
        loss = star_loss + separation_loss

        loss.backward();opt.step()

        losses.append(star_loss.detach().cpu().numpy())
        separation_losses.append(separation_loss.detach().cpu().numpy())

        if i % (print_every*batch_size) == 0:
            print('separation avg loss: %s' % str(np.mean(separation_losses[-10:])))
            print('star avg loss: %s' % str(np.mean(losses[-10:])))
        if i % (log_every*batch_size) == 0:
            log.info('separation avg loss: %s' % str(np.mean(separation_losses[-10:])))
            log.info('star avg loss: %s' % str(np.mean(losses[-10:])))
    
    # End of inner loop
    if nan_break:
        print("you've got nans")
        log.warning("you've got nans")
        break
    
    print('Finished epoch %s at %s.' % (epoch,time.ctime()))
    log.info("Finished epoch %s" % str(epoch))
    
    epoch_loss = np.mean(losses[(len(losses)-100):])
    
    if epoch_loss < min(epoch_losses):
        print('best epoch so far!')
        log.info('best epoch so far!')
        
        weights = model.embeddings.weight
        with open(OUTPUT_WEIGHTS + 'weights_best_epoch', 'wb') as f:
            pickle.dump(weights.data.detach().cpu().numpy(), f)
    
    epoch_losses.append(epoch_loss)
    print(epoch_losses)

separation avg loss: 6362938.289015444
star avg loss: 22237188.0
separation avg loss: 6369487.699363755
star avg loss: 16491466.0
separation avg loss: 6369945.616542241
star avg loss: 13584483.0
separation avg loss: 6409743.59956676
star avg loss: 11299324.0
separation avg loss: 6346340.075664212
star avg loss: 11431326.0
separation avg loss: 6391785.201242564
star avg loss: 10624006.0
separation avg loss: 6388956.632713327
star avg loss: 11254690.0
separation avg loss: 6351273.723596376
star avg loss: 9996808.0
separation avg loss: 6345612.705954781
star avg loss: 10264275.0
separation avg loss: 6293281.877364844
star avg loss: 9545937.0
Finished epoch 0 at Fri Jun 19 10:42:13 2020.
best epoch so far!
[10000000000.0, 11546273.0]
separation avg loss: 6346944.077791495
star avg loss: 9702220.0
separation avg loss: 6295201.589704851
star avg loss: 8875393.0
separation avg loss: 6280363.556186403
star avg loss: 8008437.5
separation avg loss: 6360915.895576321
star avg loss: 8726323.0
sepa

In [25]:
# print(epoch_losses)

In [26]:
weights = model.embeddings.weight
with open(OUTPUT_WEIGHTS + 'weights_%s' % SAMPLE_SIZE, 'wb') as f:
    pickle.dump(weights.data.detach().cpu().numpy(), f)

TypeError: can't convert non-cpu tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.

In [None]:
print('You made it!')
log.info('You made it!')

In [None]:
# #Save the weights to CSV
# weights = model.input_embedding.weight
# weights = weights.data.detach().numpy()
# np.savetxt("weights_%s.csv" % SAMPLE_SIZE, weights, delimiter=",")

### Old stuff

In [None]:
#Validate

In [None]:
# model.eval()
# val_acc = 0

In [None]:
# for j in range(0,len(valid), batch_size):
#     valperm = torch.randperm(len(valid)).numpy()
#     val_indices = valperm[j:j+batch_size]
#     val_batch = valid[val_indices]

#     val_lhs = val_batch.values

#     val_l_batch, val_r_batch, val_neg_batch = model(val_lhs)

#     val_positive_similarity = torch.bmm(val_l_batch,val_r_batch.transpose(2,1))
#     val_negative_similarity = torch.bmm(val_l_batch, val_neg_batch.transpose(2,1)).squeeze(1)

#     val_loss = torch.sum(torch.clamp(.1 - val_positive_similarity + val_negative_similarity, min=0))
#     if j % (print_every*batch_size) == 0:
#         print(val_loss)
    
#     val_accuracy_check = val_positive_similarity.squeeze(1) > val_negative_similarity[:,0].unsqueeze(1)
#     val_acc += np.sum(val_accuracy_check.detach().numpy())

In [None]:
# print(val_acc/len(valid))

In [None]:
# from collections import Counter

# def build_vocab(train, min_ct = 2):
#     ''' build vocabulary for an array/list/series of text '''
#     # To do: smaller groups before aggregating to improve performance
#     def wordcount_df(doc):
#         tok = doc.split()
#         d = pd.DataFrame.from_dict(Counter(tok),orient='index').reset_index().rename(columns={'index':'word'})
#         return d

#     d_list = [wordcount_df(x) for x in train]

#     d = pd.concat(d_list,axis=0)

#     d = d.groupby(['word'])[0].sum().sort_values(ascending=False)
#     d = d[d >= min_ct]
    
#     d = dict(zip(d.index.values, range(len(d))))
    
#     return d

### Begin the training loop

In [None]:
# batch = train[0:100]
# validation = train[100:150]
# batch_size = 100

# model.train()
# opt.zero_grad()

# lhs = batch.values

# l_batch, r_batch, neg_batch = model(lhs)

# positive_similarity = torch.bmm(l_batch,r_batch.transpose(2,1)) #this is the same as dot product by row

# negative_similarity = torch.bmm(l_batch, neg_batch.transpose(2,1)).squeeze(1)

# loss = torch.mean(torch.clamp(.1 - positive_similarity + negative_similarity, min=0))
# loss

# loss.backward(); opt.step()

### Calculate accuracy of predictions in current batch

In [None]:
# accuracy_check = positive_similarity.squeeze(1) > negative_similarity[:,0].unsqueeze(1)
# acc = np.mean(similarity_check.detach().numpy())
# print(acc)

In [None]:
# def embed_doc(d,vocab,embedding,normalize=False):
#     positions = []
#     for t in d:
#         try:
#             positions.append(vocab[t])
#         except KeyError:
#             pass
#     output = torch.sum(embedding(torch.LongTensor(positions)),dim=0)
#     if normalize:
#         output = output / output.norm()
#     return output

In [None]:
# # similarity
# l_batch = []
# r_batch = []
# neg_batch = []

# for i in range(len(batch)):
#     #Positive similarity
#     s = batch.values[i].split('\t') #sentences
#     if type(s) == str: #only one sentence in s
#         a = s
#         b = s
#     else:
#         a, b = np.random.choice(s, 2, False)
    
#     a = a.split()
#     b = b.split()
    
#     a_emb = embed_doc(a,train_vocab,input_embedding,normalize=True)
#     b_emb = embed_doc(b,train_vocab,input_embedding,normalize=True)
    
#     l_batch.append(a_emb)
#     r_batch.append(b_emb)

#     #Negative similarity
#     negs = []
#     for _i in range(k * 3):
#         index = np.random.choice(len(batch))
#         if not index == i: #if it's not from the same document
#             c = batch.values[index].split('\t')
#             c = np.random.choice(c, 1)[0].split()
#             c_emb = embed_doc(c,train_vocab,input_embedding,normalize=True)
#             negs.append(c_emb)
#             if(len(negs) >= k):
#                 break
    
#     neg_batch.append(torch.stack(negs))