In [1]:
# ! pip install -e 'git://github.com/nandanrao/embed-software.git#egg=embed_software'

In [2]:
# ! pip install --quiet fuzzywuzzy gcsfs

In [3]:
import time
import sys
import logging
import math
import random
import json
import pickle
from collections import Counter

import numpy as np
import pandas as pd

from gcsfs import GCSFileSystem

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
#import torchtext

from validation.data import *



In [4]:
# from src.model import StarSpace

In [5]:
# torch.autograd.set_detect_anomaly(True)

In [6]:
from src.adv_model import *

In [7]:
def nan_test(l_batch, r_batch, neg_batch):
    global nan_break
    
    l_test = np.isnan(np.mean(l_batch.detach().cpu().numpy()))
    r_test = np.isnan(np.mean(r_batch.detach().cpu().numpy()))
    neg_test = np.isnan(np.mean(neg_batch.detach().cpu().numpy()))
    if l_test or r_test or neg_test:
        nan_break = True
        return True

In [8]:
SAMPLE_SIZE = 1000
SOC_LEVEL = 3
OUTPUT_WEIGHTS = 'data/separation/'

In [9]:
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

print("Using device", device)

Using device cuda


In [10]:
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)
formatter = logging.Formatter(fmt="%(asctime)s %(levelname)s: %(message)s", 
                          datefmt="%Y-%m-%d - %H:%M:%S")
fh = logging.FileHandler("adversarial_model.log", "w")
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
log.addHandler(fh)

In [11]:
log.info('Pulling Indeed data for sample size %s' % SAMPLE_SIZE)

In [12]:
# Get job ads data
indeed = get_indeed_texts('../data/us/everything.csv',use_gcs=True,nrows=SAMPLE_SIZE)
indeed = indeed['content']

In [13]:
train = indeed.copy()
del indeed

In [14]:
# Get DOT classifications data
dot, dot_labs = dot_train_data(SOC_LEVEL)

dot = dot.reset_index(drop=True)
dot_labs = dot_labs.reset_index(drop=True)

In [15]:
log.info('About to train vocab')

In [16]:
embedder = Embedder(d_embed=100)

In [17]:
embedder.train_vocab(train)

In [18]:
len(embedder.vocab)

4152

In [19]:
log.info('Trained Vocab of size %s' % str(len(embedder.vocab)))

In [20]:
# #Save the file
with open(OUTPUT_WEIGHTS + 'train_vocab_%s' % SAMPLE_SIZE, 'wb') as f:
    pickle.dump(embedder.vocab, f)

In [21]:
model = StarSpaceAdv(
    input_embedder = embedder,
    k_neg = 10)

model.to(device)

StarSpaceAdv()

In [22]:
train_pos = embedder.get_positions(train)
dot_positions = embedder.get_positions(dot)

In [23]:
for i in range(len(train_pos)):
    for j in range(len(train_pos[i])):
        train_pos[i][j] = train_pos[i][j].to(device)

for i in range(len(dot_positions)):
    for j in range(len(dot_positions[i])):
        dot_positions[i][j] = dot_positions[i][j].to(device)

In [24]:
#Run parameters
epochs = 3
print_every = 1
log_every = 1
batch_size = 100

#Losses
losses = []
epoch_losses = [1e12]
log.info('Beginning run')

In [25]:
# permutation = torch.randperm(len(train_pos)).numpy()
# nan_break = False
# i = 0
# indices = permutation[i:i+batch_size]
# batch = train_pos[indices]
# l_batch, r_batch, neg_batch = model(batch)

# discriminator = LogisticRegression(100,2)
# discriminator.to(device)

# opt2 = torch.optim.Adam(discriminator.parameters(), lr=.01)

# losses = []
# for i in range(0,1000, batch_size):
#         indices = permutation[i:i+batch_size]
#         batch = train_pos[indices]
#         l_batch, r_batch, neg_batch = model(batch)

#         idx = torch.tensor(np.random.choice(dot_positions.shape[0], 100, False)).to(device)
#         disc_dot = dot_positions[torch.nonzero(idx).detach().cpu()]
#         dot_emb = [model.embed_doc(torch.cat(doc.tolist()[0])) for doc in disc_dot]
#         dot_emb = torch.stack(dot_emb).to(device)
#         disc_X = torch.cat([l_batch.squeeze(1), dot_emb], 0).to(device)
#         disc_y = torch.cat([torch.zeros(100),torch.ones(100)],0).type(torch.LongTensor).to(device)

#         opt2.zero_grad()
#         disc_out = discriminator(disc_X)
#         loss = criterion(disc_out,disc_y)
#         loss.backward(retain_graph=True)
#         opt2.step()
        
#         losses.append(loss.cpu())

In [26]:
#break cell
asdlfjalsdfj!

SyntaxError: invalid syntax (<ipython-input-26-b9080f1fb250>, line 2)

In [43]:
model.embedder.weights.weight.detach().cpu().numpy()

TypeError: can't convert non-cpu tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.

In [29]:
weights = model.embedder.weights
with open(OUTPUT_WEIGHTS + 'weights_best_epoch', 'wb') as f:
    pickle.dump(weights.data.detach().cpu().numpy(), f)

AttributeError: 'Embedding' object has no attribute 'data'

In [44]:
#Real loop
for epoch in range(epochs):
    permutation = torch.randperm(len(train_pos)).numpy()
    nan_break = False
    
    for i in range(0,len(train), batch_size):
        indices = permutation[i:i+batch_size]
        batch = train_pos[indices]

        ###### Forward Pass- StarSpace #############################################################
        model.train(); model.opt.zero_grad()
        
        l_batch, r_batch, neg_batch = model(batch)
        
        #Test for nans
#         if nan_test(l_batch, r_batch, neg_batch):
#             break
        
        positive_similarity = torch.bmm(l_batch,r_batch.transpose(2,1))
        negative_similarity = torch.bmm(l_batch, neg_batch.transpose(2,1)).squeeze(1)

        star_loss = torch.sum(torch.clamp(.1 - positive_similarity + negative_similarity, min=0))
                
        loss = star_loss

        loss.backward(); model.opt.step()

        ###### Print/Log #############################################################################
        losses.append(loss.detach().cpu().numpy())

        if (i % (print_every*batch_size) == 0) & (i > 0):
            print('star avg loss: %s' % str(np.mean(losses[-10:])))
        if (i % (log_every*batch_size) == 0) & (i > 0):
            log.info('star avg loss: %s' % str(np.mean(losses[-10:])))
    
    # End of inner loop
    if nan_break:
        print("you've got nans")
        log.warning("you've got nans")
        break
    
    ###### Print/Log #############################################################################
    print('Finished epoch %s at %s.' % (epoch,time.ctime()))
    log.info("Finished epoch %s" % str(epoch))
    
    epoch_loss = np.mean(losses[-int(SAMPLE_SIZE/batch_size):])

    if (epoch_loss < min(epoch_losses)) | (len(epoch_losses) == 0):
        print('best epoch so far!')
        log.info('best epoch so far!')
        
        weights = model.embedder.weights.weight
        with open(OUTPUT_WEIGHTS + 'weights_best_epoch', 'wb') as f:
            pickle.dump(weights.data.detach().cpu().numpy(), f)
    
    epoch_losses.append(epoch_loss)
    print(epoch_losses)

star avg loss: 24539544.0
star avg loss: 23264274.0
star avg loss: 20416952.0
star avg loss: 18061060.0
star avg loss: 18256716.0
star avg loss: 17743540.0
star avg loss: 16363587.0
star avg loss: 15986549.0
star avg loss: 15781394.0
Finished epoch 0 at Wed Jun 24 14:49:25 2020.
best epoch so far!
[1000000000000.0, 15781394.0]
star avg loss: 14147595.0
star avg loss: 13882734.0
star avg loss: 13495282.0
star avg loss: 13643106.0
star avg loss: 12619786.0
star avg loss: 12651068.0
star avg loss: 12672415.0
star avg loss: 12956690.0
star avg loss: 12385585.0
Finished epoch 1 at Wed Jun 24 14:49:30 2020.
best epoch so far!
[1000000000000.0, 15781394.0, 12385585.0]
star avg loss: 12671940.0
star avg loss: 12781222.0
star avg loss: 13202035.0
star avg loss: 13463298.0
star avg loss: 13331326.0
star avg loss: 12956054.0
star avg loss: 12599894.0
star avg loss: 11775056.0
star avg loss: 11524094.0
Finished epoch 2 at Wed Jun 24 14:49:35 2020.
best epoch so far!
[1000000000000.0, 15781394.0, 1

In [None]:
weights = model.embeddings.weight
with open(OUTPUT_WEIGHTS + 'weights_%s' % SAMPLE_SIZE, 'wb') as f:
    pickle.dump(weights.data.detach().cpu().numpy(), f)

In [None]:
print('You made it!')
log.info('You made it!')

In [None]:
epoch_losses_sep

In [None]:
# #Save the weights to CSV
# weights = model.input_embedding.weight
# weights = weights.data.detach().numpy()
# np.savetxt("weights_%s.csv" % SAMPLE_SIZE, weights, delimiter=",")