In [1]:
# ! pip install -e 'git://github.com/nandanrao/embed-software.git#egg=embed_software'

In [2]:
# ! pip install --quiet fuzzywuzzy gcsfs

In [3]:
import time
import sys
import logging
import math
import random
import json
import pickle
from collections import Counter

import numpy as np
import pandas as pd

from gcsfs import GCSFileSystem
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import davies_bouldin_score

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
#import torchtext

from validation.data import *



In [4]:
from src.model import StarSpace

In [5]:
# Run configuration.
# Number of job-ad rows requested from the Indeed file (passed as nrows);
# also used to tag the pickled vocabulary and final-weights filenames.
SAMPLE_SIZE = 10000
# Level argument handed to dot_train_data() when loading the DOT data.
SOC_LEVEL = 3
# Path prefix for the pickled vocabulary and embedding weights.
OUTPUT_WEIGHTS = 'data/separation/'

In [6]:
# torch.autograd.set_detect_anomaly(True)

In [7]:
# Prefer the GPU whenever PyTorch reports one; otherwise stay on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

print("Using device", device)

Using device cuda


In [8]:
# File-backed logger for this run. The file is opened in "w" mode, so every
# execution of this cell starts separation_model.log from scratch.
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)

# logging.getLogger() hands back the same logger object on every call, so
# re-running this cell would otherwise attach a second FileHandler and write
# each record twice. Detach and close handlers left by a previous execution.
for stale_handler in list(log.handlers):
    log.removeHandler(stale_handler)
    stale_handler.close()

formatter = logging.Formatter(fmt="%(asctime)s %(levelname)s: %(message)s",
                              datefmt="%Y-%m-%d - %H:%M:%S")
fh = logging.FileHandler("separation_model.log", "w")
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
log.addHandler(fh)

In [9]:
class LogisticRegression(torch.nn.Module):
    """Multinomial logistic-regression head: a single linear layer.

    The forward pass returns the raw linear scores; no softmax is applied
    inside the module.
    """

    def __init__(self, input_dim, output_dim):
        """Create the linear layer mapping ``input_dim`` features to ``output_dim`` scores."""
        super().__init__()
        self.linear = torch.nn.Linear(input_dim, output_dim)

    def forward(self, x):
        """Return the linear layer's output for ``x``."""
        return self.linear(x)

In [10]:
def nan_test(l_batch, r_batch, neg_batch):
    """Report whether any of the three batch tensors contains a NaN.

    When a NaN is found, the module-level ``nan_break`` flag is set to True so
    the training loop can stop after the current epoch; the flag is left
    untouched otherwise.

    Elements are checked directly rather than through the tensor mean, so a
    tensor mixing +inf and -inf (or an empty tensor) is not misreported as NaN.

    Args:
        l_batch: tensor to check.
        r_batch: tensor to check.
        neg_batch: tensor to check.

    Returns:
        bool: True if at least one tensor holds a NaN, False otherwise.
    """
    global nan_break

    has_nan = any(
        bool(torch.isnan(batch.detach()).any())
        for batch in (l_batch, r_batch, neg_batch)
    )
    if has_nan:
        nan_break = True
    # Always return an explicit bool (the clean case used to fall through to None).
    return has_nan

In [11]:
# Let the logging module interpolate the argument; the emitted message is unchanged.
log.info('Pulling Indeed data for sample size %s', SAMPLE_SIZE)

In [12]:
# Load the job-ad sample (at most SAMPLE_SIZE rows) and keep only its 'content' column.
indeed = get_indeed_texts(
    '../data/us/everything.csv',
    use_gcs=True,
    nrows=SAMPLE_SIZE,
)['content']

In [13]:
# Keep an independent copy of the ad texts as the training set, then drop the
# original name. After this cell `indeed` no longer exists, so the loading
# cell must be re-run before this one can be executed again.
train = indeed.copy()
del indeed

In [14]:
# Load the DOT documents and their labels for the configured SOC level, giving
# both a fresh 0..n-1 index (the old index is discarded).
dot, dot_labs = (
    frame.reset_index(drop=True)
    for frame in dot_train_data(SOC_LEVEL)
)

In [15]:
# Progress marker written to the run log before the vocabulary is fitted.
log.info('About to train vocab')

In [16]:
# Fit the vocabulary on the training ads. min_df=5 is an absolute document
# count and max_df=.99 a proportion of documents (scikit-learn convention).
Vectorizer = CountVectorizer(min_df=5, max_df=.99).fit(train)

# Mapping produced by the fitted vectorizer (term -> column index).
train_vocab = Vectorizer.vocabulary_

In [17]:
# Number of entries in the fitted vocabulary.
len(train_vocab)

12431

In [18]:
# Record the vocabulary size in the run log (lazy %-interpolation, same message).
log.info('Trained Vocab of size %s', len(train_vocab))

In [19]:
# Pickle the fitted vocabulary mapping; the filename is tagged with SAMPLE_SIZE.
vocab_path = OUTPUT_WEIGHTS + 'train_vocab_%s' % SAMPLE_SIZE
with open(vocab_path, 'wb') as vocab_file:
    pickle.dump(train_vocab, vocab_file)

In [20]:
# StarSpace embedding model over the fitted vocabulary.
# (The optional `input_embedding = embeddings` argument is currently disabled.)
starspace_kwargs = dict(
    d_embed=100,
    vocabulary=train_vocab,
    k_neg=10,
)
model = StarSpace(**starspace_kwargs)

# Kept as the cell's last expression so the module summary is displayed.
model.to(device)

StarSpace(
  (embeddings): Embedding(12431, 100, max_norm=20)
)

In [21]:
# Logistic-regression (separation) parameters
clusters_to_sample = 10  # clusters drawn per training step

# Integer-encode the DOT labels: position of each label among the sorted unique labels.
dot_label_codes = np.unique(dot_labs, return_inverse=True)[1]
dot_y_enc = torch.tensor(dot_label_codes).to(device)

num_clusters = len(set(dot_labs))

In [22]:
# Classifier head over 100-dimensional inputs with one output score per
# cluster, trained with cross-entropy.
LR = LogisticRegression(100, num_clusters).to(device)
criterion = torch.nn.CrossEntropyLoss()

In [23]:
# One Adam optimizer with a separate parameter group per module, each carrying
# its own learning rate (both 1e-2 here).
param_groups = [
    {'params': model.parameters(), 'lr': 1e-2},
    {'params': LR.parameters(), 'lr': 1e-2},
]
opt = torch.optim.Adam(param_groups)


In [24]:
# Compute the model's position tensors for the ads and for the DOT documents,
# then move every tensor onto the training device in place.
train_pos = model.get_positions(train)
dot_positions = model.get_positions(dot)

for positions in (train_pos, dot_positions):
    for doc in positions:
        for j, tensor in enumerate(doc):
            doc[j] = tensor.to(device)

In [25]:
# Run parameters
epochs, batch_size = 3, 100
print_every = log_every = 10  # report every N batches (stdout / log file)

# Weight applied to each loss term before the two are summed.
error_lambdas = {'starspace': .1, 'separation': .9}

# Per-batch histories: combined loss and weighted separation loss.
losses, separation_losses = [], []

# Per-epoch means, seeded with a huge sentinel so the first real epoch always
# compares as an improvement.
epoch_losses, epoch_losses_sep = [1e12], [1e12]

log.info('Beginning run')

In [26]:
# Joint training loop. Each step combines
#   * the StarSpace hinge loss on a shuffled batch of job ads, and
#   * a cross-entropy "separation" loss on the DOT documents belonging to a
#     random subset of clusters,
# weights them with `error_lambdas`, and takes one optimizer step on the sum.
for epoch in range(epochs):
    permutation = torch.randperm(len(train_pos)).numpy()
    # `nan_break` is only set by nan_test(); its call below is commented out, so
    # in the code shown the post-epoch NaN abort never triggers.
    nan_break = False
    # Remember where this epoch starts in the loss histories so the epoch means
    # cover exactly the batches that ran, whatever the real number of rows is.
    epoch_start = len(losses)
    epoch_start_sep = len(separation_losses)

    # Iterate over the same length the permutation was drawn from, so no empty
    # trailing batch is produced if `train` and `train_pos` ever differ in size.
    for i in range(0, len(train_pos), batch_size):
        indices = permutation[i:i+batch_size]
        batch = train_pos[indices]

        ###### Forward Pass- StarSpace #############################################################
        model.train()
        opt.zero_grad()

        l_batch, r_batch, neg_batch = model(batch)

        #Test for nans (disabled)
        # if nan_test(l_batch, r_batch, neg_batch):
        #     break

        positive_similarity = torch.bmm(l_batch, r_batch.transpose(2, 1))
        negative_similarity = torch.mean(torch.bmm(l_batch, neg_batch.transpose(2, 1)).squeeze(1), 1)

        # NOTE(review): positive_similarity keeps bmm's 3-D shape while
        # negative_similarity has been reduced by squeeze(1) and mean(..., 1).
        # If the model returns l_batch/r_batch as (batch, 1, dim) and neg_batch
        # as (batch, k, dim), the sum below broadcasts (batch, 1, 1) against
        # (batch,) into a (batch, 1, batch) grid instead of pairing each example
        # with its own negatives -- confirm shapes against StarSpace.forward.
        star_loss = torch.sum(torch.clamp(.1 - positive_similarity +
                                          negative_similarity, min=0)) * error_lambdas['starspace']

        ###### Forward pass- Separation #############################################################
        # Draw clusters without replacement and flag every DOT document whose
        # label is one of them.
        clusts = torch.tensor(np.random.choice(num_clusters, clusters_to_sample, False)).to(device)
        mask = dot_y_enc.view(1, -1).eq(clusts.view(-1, 1)).sum(0)

        # as_tuple=False is the explicit form of the old default return shape
        # and avoids the deprecated-signature warning.
        dot_sample = dot_positions[torch.nonzero(mask, as_tuple=False).detach().cpu()]
        dot_y_sample = dot_y_enc[np.where(mask.cpu())]

        dot_emb = [model.embed_doc(torch.cat(doc.tolist()[0])) for doc in dot_sample]
        # Tensor.to() returns a new tensor rather than moving in place, so the
        # result has to be kept.
        dot_emb = torch.stack(dot_emb).to(device)

        output = LR(dot_emb)
        separation_loss = criterion(output, dot_y_sample) * error_lambdas['separation']

        ###### Combine Losses/Backward Pass ##########################################################
        loss = star_loss + separation_loss

        loss.backward()
        opt.step()

        ###### Print/Log #############################################################################
        # NOTE: `losses` stores the combined loss (star + separation), so the
        # figure reported as 'star avg loss' below is the total, not the
        # StarSpace term alone.
        losses.append(loss.detach().cpu().numpy())
        separation_losses.append(separation_loss.detach().cpu().numpy())

        if i > 0 and i % (print_every*batch_size) == 0:
            print('star avg loss: %s' % str(np.mean(losses[-10:])))
            print('separation avg loss: %s' % str(np.mean(separation_losses[-10:])))
        if i > 0 and i % (log_every*batch_size) == 0:
            log.info('star avg loss: %s' % str(np.mean(losses[-10:])))
            log.info('separation avg loss: %s' % str(np.mean(separation_losses[-10:])))

    # End of inner loop
    if nan_break:
        print("you've got nans")
        log.warning("you've got nans")
        break

    ###### Print/Log #############################################################################
    print('Finished epoch %s at %s.' % (epoch, time.ctime()))
    log.info("Finished epoch %s" % str(epoch))

    epoch_loss = np.mean(losses[epoch_start:])
    epoch_loss_sep = np.mean(separation_losses[epoch_start_sep:])

    # Check for an empty history first; min() raises on an empty list.
    if not epoch_losses or epoch_loss < min(epoch_losses):
        print('best epoch so far!')
        log.info('best epoch so far!')
        log.info('')

        # Checkpoint the embedding matrix of the best epoch seen so far.
        weights = model.embeddings.weight
        with open(OUTPUT_WEIGHTS + 'weights_best_epoch', 'wb') as f:
            pickle.dump(weights.data.detach().cpu().numpy(), f)

    epoch_losses.append(epoch_loss)
    epoch_losses_sep.append(epoch_loss_sep)
    print(epoch_losses)
    print(epoch_losses_sep)

	nonzero(Tensor input, Tensor out)
Consider using one of the following signatures instead:
	nonzero(Tensor input, bool as_tuple)


star avg loss: 103.778915
separation avg loss: 4.0870705
star avg loss: 98.53531
separation avg loss: 4.0109043
star avg loss: 93.81991
separation avg loss: 3.8870444
star avg loss: 93.966286
separation avg loss: 3.8611388
star avg loss: 88.04742
separation avg loss: 3.7865272
star avg loss: 89.978165
separation avg loss: 3.7889385
star avg loss: 85.53112
separation avg loss: 3.7750823
star avg loss: 86.63531
separation avg loss: 3.5677955
star avg loss: 83.481895
separation avg loss: 3.4620667
Finished epoch 0 at Thu Jun 25 17:23:00 2020.
best epoch so far!
[1000000000000.0, 90.6501]
[1000000000000.0, 3.7850442]
star avg loss: 72.34969
separation avg loss: 3.6584773
star avg loss: 73.39623
separation avg loss: 3.4627755
star avg loss: 68.97413
separation avg loss: 3.4005108
star avg loss: 69.465225
separation avg loss: 3.4632163
star avg loss: 66.5389
separation avg loss: 3.36913
star avg loss: 67.844246
separation avg loss: 3.3363461
star avg loss: 70.855515
separation avg loss: 3.50

In [27]:
# Pickle the embedding matrix as it stands after the last executed epoch; the
# filename is tagged with SAMPLE_SIZE.
weights = model.embeddings.weight
final_weights_path = OUTPUT_WEIGHTS + 'weights_%s' % SAMPLE_SIZE
with open(final_weights_path, 'wb') as weights_file:
    pickle.dump(weights.data.detach().cpu().numpy(), weights_file)

In [28]:
# Announce completion on stdout and in the run log.
for emit in (print, log.info):
    emit('You made it!')

You made it!


In [29]:
# Per-epoch mean of the combined loss; the first entry is the 1e12 sentinel
# the list was seeded with before training.
epoch_losses

[1000000000000.0, 90.6501, 68.48818, 57.82406]

In [30]:
# Per-epoch mean of the weighted separation loss; the first entry is the 1e12
# sentinel the list was seeded with before training.
epoch_losses_sep

[1000000000000.0, 3.7850442, 3.4272268, 3.1811469]

In [31]:
# #Save the weights to CSV
# weights = model.input_embedding.weight
# weights = weights.data.detach().numpy()
# np.savetxt("weights_%s.csv" % SAMPLE_SIZE, weights, delimiter=",")

### Old