In [1]:
# ! pip install -e 'git://github.com/nandanrao/embed-software.git#egg=embed_software'

In [2]:
# ! pip install --quiet fuzzywuzzy gcsfs

In [3]:
import time
import sys
import logging
import math
import random
import json
import pickle
from collections import Counter

import numpy as np
import pandas as pd

from gcsfs import GCSFileSystem

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
#import torchtext

from validation.data import *



In [4]:
# from src.model import StarSpace

In [5]:
# torch.autograd.set_detect_anomaly(True)

In [6]:
from src.adv_model import *

In [7]:
def nan_test(l_batch, r_batch, neg_batch):
    global nan_break
    
    l_test = np.isnan(np.mean(l_batch.detach().cpu().numpy()))
    r_test = np.isnan(np.mean(r_batch.detach().cpu().numpy()))
    neg_test = np.isnan(np.mean(neg_batch.detach().cpu().numpy()))
    if l_test or r_test or neg_test:
        nan_break = True
        return True

In [8]:
SAMPLE_SIZE = 100000
SOC_LEVEL = 3
OUTPUT_WEIGHTS = 'data/starspace/'

In [9]:
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

print("Using device", device)

Using device cuda


In [10]:
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)
formatter = logging.Formatter(fmt="%(asctime)s %(levelname)s: %(message)s", 
                          datefmt="%Y-%m-%d - %H:%M:%S")
fh = logging.FileHandler("starspace.log", "w")
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
log.addHandler(fh)

In [11]:
log.info('Pulling Indeed data for sample size %s' % SAMPLE_SIZE)

In [12]:
# Get job ads data
indeed = get_indeed_texts('../data/us/everything.csv',use_gcs=True,nrows=SAMPLE_SIZE)
indeed = indeed['content']

In [13]:
train = indeed.copy()
del indeed

In [14]:
log.info('About to train vocab')

In [15]:
embedder = Embedder(d_embed=100)
embedder.train_vocab(train)

print(len(embedder.vocab))
log.info('Trained Vocab of size %s' % str(len(embedder.vocab)))

35200


In [16]:
# #Save the file
with open(OUTPUT_WEIGHTS + 'train_vocab_%s' % SAMPLE_SIZE, 'wb') as f:
    pickle.dump(embedder.vocab, f)

In [17]:
train_pos = embedder.get_positions(train)

for i in range(len(train_pos)):
    for j in range(len(train_pos[i])):
        train_pos[i][j] = train_pos[i][j].to(device)

In [18]:
starspace = StarSpaceAdv(
    input_embedder = embedder,
    k_neg = 10,
    lr = .01)

starspace.to(device)

StarSpaceAdv()

In [19]:
#Run parameters
epochs = 3
print_every = 100
log_every = 10
batch_size = 100

#Losses
star_losses = []
epoch_losses = [1e12]
log.info('Beginning run')

In [20]:
#Real loop
for epoch in range(epochs):
    permutation = torch.randperm(len(train_pos)).numpy()
    nan_break = False
    
    for i in range(0,len(train), batch_size):
        indices = permutation[i:i+batch_size]
        batch = train_pos[indices]

        ###### Forward Pass- StarSpace #############################################################
        starspace.train(); starspace.opt.zero_grad()
        
        l_batch, r_batch, neg_batch = starspace(batch)
        
        #Test for nans
        if nan_test(l_batch, r_batch, neg_batch):
            break
        
        positive_similarity = torch.bmm(l_batch,r_batch.transpose(2,1))
        negative_similarity = torch.mean(torch.bmm(l_batch, neg_batch.transpose(2,1)).squeeze(1),1)

        star_loss = torch.sum(torch.clamp(.1 - positive_similarity + negative_similarity, min=0))
        
        ###### Backward Pass- Embeddings ##############################################################
        star_loss.backward(); starspace.opt.step()
                
        ###### Batch Print/Log #########################################################################
        star_losses.append(star_loss.detach().cpu().numpy())
        
        if (i % (print_every*batch_size) == 0) & (i > 0):
            print('star avg loss: %s' % str(np.mean(star_losses[-10:])))
        if (i % (log_every*batch_size) == 0) & (i > 0):
            log.info('star avg loss: %s' % str(np.mean(star_losses[-10:])))
    
    # End of inner loop
    if nan_break:
        print("you've got nans")
        log.warning("you've got nans")
        break
    
    ###### Epoch Print/Log #############################################################################
    print('Finished epoch %s at %s.' % (epoch,time.ctime()))
    log.info("Finished epoch %s" % str(epoch))
    
    epoch_loss = np.mean(star_losses[-int(SAMPLE_SIZE/batch_size):])

    if (epoch_loss < min(epoch_losses)) | (len(epoch_losses) == 0):
        print('best epoch so far!')
        log.info('best epoch so far!')
        
        weights = embedder.weights.weight
        with open(OUTPUT_WEIGHTS + 'weights_best_epoch', 'wb') as f:
            pickle.dump(weights.data.detach().cpu().numpy(), f)
    
    epoch_losses.append(epoch_loss)
    print(epoch_losses)

star avg loss: 808.9972
star avg loss: 729.0155
star avg loss: 562.63586
star avg loss: 583.05945
star avg loss: 538.04755
star avg loss: 453.54044
star avg loss: 497.20743
star avg loss: 474.2387
star avg loss: 417.88867
Finished epoch 0 at Thu Jun 25 13:29:14 2020.
best epoch so far!
[1000000000000.0, 569.3675]
star avg loss: 349.3924
star avg loss: 371.8249
star avg loss: 365.25375
star avg loss: 336.4627
star avg loss: 311.3178
star avg loss: 370.0086
star avg loss: 344.89343
star avg loss: 311.11246
star avg loss: 301.72195
Finished epoch 1 at Thu Jun 25 13:39:17 2020.
best epoch so far!
[1000000000000.0, 569.3675, 342.5795]
star avg loss: 268.3271
star avg loss: 281.04016
star avg loss: 315.76715
star avg loss: 264.69818
star avg loss: 271.99506
star avg loss: 278.49942
star avg loss: 251.47087
star avg loss: 272.0772
star avg loss: 282.75604
Finished epoch 2 at Thu Jun 25 13:49:47 2020.
best epoch so far!
[1000000000000.0, 569.3675, 342.5795, 273.62244]


In [21]:
weights = embedder.weights.weight
with open(OUTPUT_WEIGHTS + 'weights_%s' % SAMPLE_SIZE, 'wb') as f:
    pickle.dump(weights.data.detach().cpu().numpy(), f)

In [22]:
print('You made it!')
log.info('You made it!')

You made it!


In [23]:
# #Save the weights to CSV
# weights = model.input_embedding.weight
# weights = weights.data.detach().numpy()
# np.savetxt("weights_%s.csv" % SAMPLE_SIZE, weights, delimiter=",")

In [24]:
epoch_losses

[1000000000000.0, 569.3675, 342.5795, 273.62244]