In [1]:
# ! pip install -e 'git://github.com/nandanrao/embed-software.git#egg=embed_software'

In [2]:
# ! pip install --quiet fuzzywuzzy gcsfs

In [3]:
import time
import sys
import logging
import math
import random
import json
import pickle
from collections import Counter

import numpy as np
import pandas as pd

from gcsfs import GCSFileSystem

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
#import torchtext

from validation.data import *



In [4]:
# from src.model import StarSpace

In [5]:
# torch.autograd.set_detect_anomaly(True)

In [6]:
from src.adv_model import *

In [7]:
def nan_test(l_batch, r_batch, neg_batch):
    global nan_break
    
    l_test = np.isnan(np.mean(l_batch.detach().cpu().numpy()))
    r_test = np.isnan(np.mean(r_batch.detach().cpu().numpy()))
    neg_test = np.isnan(np.mean(neg_batch.detach().cpu().numpy()))
    if l_test or r_test or neg_test:
        nan_break = True
        return True

In [8]:
SAMPLE_SIZE = 10000
SOC_LEVEL = 3
OUTPUT_WEIGHTS = 'data/adversarial/'

In [9]:
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

print("Using device", device)

Using device cuda


In [10]:
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)
formatter = logging.Formatter(fmt="%(asctime)s %(levelname)s: %(message)s", 
                          datefmt="%Y-%m-%d - %H:%M:%S")
fh = logging.FileHandler("adversarial_model.log", "w")
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
log.addHandler(fh)

In [11]:
log.info('Pulling Indeed data for sample size %s' % SAMPLE_SIZE)

In [12]:
# Get job ads data
indeed = get_indeed_texts('../data/us/everything.csv',use_gcs=True,nrows=SAMPLE_SIZE)
indeed = indeed['content']

In [13]:
train = indeed.copy()
del indeed

In [14]:
# Get DOT classifications data
dot, dot_labs = dot_train_data(SOC_LEVEL)

dot = dot.reset_index(drop=True)
dot_labs = dot_labs.reset_index(drop=True)

In [15]:
log.info('About to train vocab')

In [16]:
embedder = Embedder(d_embed=100)
embedder.train_vocab(train)

print(len(embedder.vocab))
log.info('Trained Vocab of size %s' % str(len(embedder.vocab)))

12431


In [17]:
# #Save the file
with open(OUTPUT_WEIGHTS + 'train_vocab_%s' % SAMPLE_SIZE, 'wb') as f:
    pickle.dump(embedder.vocab, f)

In [18]:
train_pos = embedder.get_positions(train)
dot_positions = embedder.get_positions(dot)

for i in range(len(train_pos)):
    for j in range(len(train_pos[i])):
        train_pos[i][j] = train_pos[i][j].to(device)

for i in range(len(dot_positions)):
    for j in range(len(dot_positions[i])):
        dot_positions[i][j] = dot_positions[i][j].to(device)

In [19]:
starspace = StarSpaceAdv(
    input_embedder = embedder,
    k_neg = 10)

starspace.to(device)

StarSpaceAdv()

In [20]:
discriminator = Discriminator(
    classifier = LogisticRegression(100,2),
    criterion = nn.CrossEntropyLoss(),
    embedder = embedder,
    learning_rates = [.01, .01])

In [21]:
#Run parameters
epochs = 3
print_every = 10
log_every = 10
batch_size = 100

#Losses
star_losses = []
disc_losses = []
epoch_losses = [1e12]
log.info('Beginning run')

In [22]:
#Real loop
for epoch in range(epochs):
    permutation = torch.randperm(len(train_pos)).numpy()
    nan_break = False
    
    for i in range(0,len(train), batch_size):
        indices = permutation[i:i+batch_size]
        batch = train_pos[indices]

        ###### Forward Pass- StarSpace #############################################################
        starspace.train(); starspace.opt.zero_grad()
        
        l_batch, r_batch, neg_batch = starspace(batch)
        
        #Test for nans
        if nan_test(l_batch, r_batch, neg_batch):
            break
        
        positive_similarity = torch.bmm(l_batch,r_batch.transpose(2,1))
        negative_similarity = torch.bmm(l_batch, neg_batch.transpose(2,1)).squeeze(1)

        star_loss = torch.sum(torch.clamp(.1 - positive_similarity + negative_similarity, min=0))

        star_loss.backward(); starspace.opt.step()
        
        ###### Forward Pass- Discriminator ###########################################################
        discriminator.train(); discriminator.opt.zero_grad()
        disc_loss, outputs, disc_y = discriminator(batch, dot_positions)

        acc = 100 * (outputs.max(1).indices == disc_y).sum().item() / 200

        disc_loss.backward(); discriminator.opt.step()
        
        ###### Batch Print/Log #############################################################################
        star_losses.append(star_loss.detach().cpu().numpy())
        disc_losses.append(acc)
        
        if (i % (print_every*batch_size) == 0) & (i > 0):
            print('star avg loss: %s' % str(np.mean(star_losses[-10:])))
            print('discriminator accuracy: %s' % str(np.mean(disc_losses[-10:])))
        if (i % (log_every*batch_size) == 0) & (i > 0):
            log.info('star avg loss: %s' % str(np.mean(star_losses[-10:])))
            log.info('discriminator accuracy: %s' % str(np.mean(disc_losses[-10:])))
    
    # End of inner loop
    if nan_break:
        print("you've got nans")
        log.warning("you've got nans")
        break
    
    ###### Epoch Print/Log #############################################################################
    print('Finished epoch %s at %s.' % (epoch,time.ctime()))
    log.info("Finished epoch %s" % str(epoch))
    
    epoch_loss = np.mean(star_losses[-int(SAMPLE_SIZE/batch_size):])

    if (epoch_loss < min(epoch_losses)) | (len(epoch_losses) == 0):
        print('best epoch so far!')
        log.info('best epoch so far!')
        
        weights = embedder.weights.weight
        with open(OUTPUT_WEIGHTS + 'weights_best_epoch', 'wb') as f:
            pickle.dump(weights.data.detach().cpu().numpy(), f)
    
    epoch_losses.append(epoch_loss)
    print(epoch_losses)

star avg loss: 23194972.0
discriminator accuracy: 41.9
star avg loss: 20329916.0
discriminator accuracy: 28.8
star avg loss: 26900420.0
discriminator accuracy: 12.55
star avg loss: 62508290.0
discriminator accuracy: 3.95
star avg loss: 110018130.0
discriminator accuracy: 1.75


KeyboardInterrupt: 

In [None]:
weights = model.embeddings.weight
with open(OUTPUT_WEIGHTS + 'weights_%s' % SAMPLE_SIZE, 'wb') as f:
    pickle.dump(weights.data.detach().cpu().numpy(), f)

In [None]:
print('You made it!')
log.info('You made it!')

In [None]:
epoch_losses_sep

In [None]:
# #Save the weights to CSV
# weights = model.input_embedding.weight
# weights = weights.data.detach().numpy()
# np.savetxt("weights_%s.csv" % SAMPLE_SIZE, weights, delimiter=",")

### Testing the weights changing

In [None]:
# epoch = 0
# permutation = torch.randperm(len(train_pos)).numpy()
# nan_break = False
# i = 0

# indices = permutation[i:i+batch_size]
# batch = train_pos[indices]

# def flatten_params(params):
#     disc_p = []
#     for p in params:
#         disc_p.append(p.clone().detach().cpu().numpy())

#     return np.array([item for sublist in disc_p for item in sublist.flatten()])

# ew_1 = embedder.weights.weight.clone().detach().cpu().numpy().flatten()
# disc_1 = flatten_params(discriminator.classifier.parameters())

# ###### Forward Pass- StarSpace #############################################################
# starspace.train(); starspace.opt.zero_grad()

# l_batch, r_batch, neg_batch = starspace(batch)

#Test for nans
# if nan_test(l_batch, r_batch, neg_batch):
#     break

# positive_similarity = torch.bmm(l_batch,r_batch.transpose(2,1))
# negative_similarity = torch.bmm(l_batch, neg_batch.transpose(2,1)).squeeze(1)

# star_loss = torch.sum(torch.clamp(.1 - positive_similarity + negative_similarity, min=0))

# star_loss.backward(); starspace.opt.step()

# ew_2 = embedder.weights.weight.clone().detach().cpu().numpy().flatten()
# disc_2 = flatten_params(discriminator.classifier.parameters())

# print('Embeddings changed: %s' % any(ew_1 != ew_2))
# print('Discriminator params updated: %s' % any(disc_1 != disc_2))

# ###### Forward Pass- Discriminator ###########################################################
# discriminator.train(); discriminator.opt.zero_grad()
# disc_loss, outputs, disc_y = discriminator(batch, dot_positions)

# acc = 100 * (outputs.max(1).indices == disc_y).sum().item() / 200

# disc_loss.backward(); discriminator.opt.step()

# ew_3 = embedder.weights.weight.clone().detach().cpu().numpy().flatten()
# disc_3 = flatten_params(discriminator.classifier.parameters())

# print('Embeddings changed: %s' % any(ew_2 != ew_3))
# print('Discriminator params updated: %s' % any(disc_2 != disc_3))