In [1]:
# ! pip install -e 'git://github.com/nandanrao/embed-software.git#egg=embed_software'

In [2]:
# ! pip install --quiet fuzzywuzzy gcsfs

In [3]:
import time
import sys
import logging
import math
import random
import json
import pickle
from collections import Counter

import numpy as np
import pandas as pd

from gcsfs import GCSFileSystem
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import davies_bouldin_score

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
#import torchtext

from validation.data import *



In [4]:
from src.model import StarSpace

In [5]:
# Number of rows requested from the job-ads file; also embedded in the names of saved files.
SAMPLE_SIZE = 100000
# Level argument handed to dot_train_data() when the DOT classification data is loaded.
SOC_LEVEL = 3
# Path prefix (note the trailing slash) under which the vocabulary and weights are pickled.
OUTPUT_WEIGHTS = 'data/separation/'

In [6]:
# Debugging aid: enables autograd anomaly detection for everything that follows.
# NOTE(review): anomaly detection adds overhead to every forward/backward pass —
# consider turning it off for full-size runs once training is stable.
torch.autograd.set_detect_anomaly(True)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x7f8e00349588>

In [7]:
# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

print("Using device", device)

Using device cuda


In [8]:
# File logger for this run.
#
# logging.getLogger() hands back the same logger object every time this cell is
# executed, so a handler attached by an earlier execution is detached and closed
# before a new one is added. Without this, each re-run of the cell would stack
# another FileHandler: every message would be written once per execution and
# the previous file handles would stay open.
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)

for stale_handler in [h for h in log.handlers if h.get_name() == "separation_model_file"]:
    log.removeHandler(stale_handler)
    stale_handler.close()

formatter = logging.Formatter(fmt="%(asctime)s %(levelname)s: %(message)s",
                              datefmt="%Y-%m-%d - %H:%M:%S")
# Mode "w" truncates separation_model.log each time this cell runs.
fh = logging.FileHandler("separation_model.log", "w")
# The name is what lets a later execution of this cell find and remove this handler.
fh.set_name("separation_model_file")
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
log.addHandler(fh)

In [9]:
class LogisticRegression(nn.Module):
    """Multinomial logistic-regression head implemented as one affine layer.

    ``forward`` returns the raw outputs of the linear layer; no softmax or
    sigmoid is applied here.

    Args:
        input_dim: Size of each input feature vector.
        output_dim: Number of outputs (one per class).
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.linear = nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result unchanged."""
        return self.linear(x)

In [10]:
def nan_test(l_batch, r_batch, neg_batch):
    """Report whether any of the three batches has a NaN mean.

    Each tensor is detached, copied to the CPU and reduced with ``np.mean``;
    a single NaN element is enough to make that mean NaN.

    Args:
        l_batch: Tensor to check.
        r_batch: Tensor to check.
        neg_batch: Tensor to check.

    Returns:
        bool: ``True`` if at least one batch mean is NaN, otherwise ``False``.

    Side effects:
        Sets the module-level flag ``nan_break`` to ``True`` when a NaN is
        found. The flag is left untouched otherwise.
    """
    global nan_break

    for batch in (l_batch, r_batch, neg_batch):
        if np.isnan(np.mean(batch.detach().cpu().numpy())):
            nan_break = True
            return True

    # Return an explicit False instead of falling through with None, so the
    # function always yields a bool.
    return False

In [11]:
# Let the logging module interpolate the sample size into the message.
log.info('Pulling Indeed data for sample size %s', SAMPLE_SIZE)

In [12]:
# Load up to SAMPLE_SIZE job ads and keep only their 'content' entry.
indeed = get_indeed_texts(
    '../data/us/everything.csv',
    use_gcs=True,
    nrows=SAMPLE_SIZE,
)['content']

In [13]:
# Keep a copy of the job-ad texts under the name `train`, then remove the
# `indeed` name so later cells can only refer to the data as `train`.
train = indeed.copy()
del indeed

In [14]:
# Load the DOT classification data and its labels, giving both a fresh
# index (the previous index is discarded rather than kept as a column).
dot, dot_labs = (part.reset_index(drop=True) for part in dot_train_data(SOC_LEVEL))

In [15]:
# Progress marker written to the log before the vocabulary is fitted.
log.info('About to train vocab')

In [16]:
# Fit a bag-of-words vocabulary on the job ads. min_df=10 and max_df=.99 are the
# document-frequency cut-offs applied when the vocabulary is built.
# fit() returns the fitted estimator, so construction and fitting are chained.
Vectorizer = CountVectorizer(min_df=10, max_df=.99).fit(train)

# Term -> index mapping learned by the vectorizer; passed to the model below.
train_vocab = Vectorizer.vocabulary_

In [17]:
# Display the number of terms in the fitted vocabulary.
len(train_vocab)

24968

In [18]:
# Record the vocabulary size; the logger formats the integer via %s.
log.info('Trained Vocab of size %s', len(train_vocab))

In [19]:
# Pickle the fitted vocabulary under OUTPUT_WEIGHTS, with the sample size in the file name.
vocab_path = OUTPUT_WEIGHTS + 'train_vocab_%s' % SAMPLE_SIZE
with open(vocab_path, 'wb') as vocab_file:
    pickle.dump(train_vocab, vocab_file)

In [20]:
# To start from file:
# with open('data/separation/weights_100000', 'rb') as f:
#     embeddings = pickle.load(f)

# print(embeddings.shape)
# embeddings = torch.FloatTensor(embeddings)
# embeddings = nn.Embedding.from_pretrained(embeddings)

# with open('data/separation/train_vocab_100000', 'rb') as f:
#     vocab = pickle.load(f)

In [21]:
# Build the StarSpace model over the fitted vocabulary with 100-dimensional embeddings.
# NOTE(review): k_neg is presumably the number of negative samples per example —
# confirm against src.model.StarSpace.
model = StarSpace(
    d_embed=100,
    vocabulary=train_vocab,
    k_neg = 10)
# Alternative ending for the call above when starting from saved weights; it needs
# `embeddings` from the commented-out "start from file" cell:
#     input_embedding = embeddings)

# Move the model to the selected device; as the last expression of the cell,
# the returned module is also echoed as the cell output.
model.to(device)

StarSpace(
  (embeddings): Embedding(24968, 100, max_norm=20)
)

In [22]:
#LR params
# How many distinct DOT labels are drawn for each training batch; also used as
# the classifier's output dimension.
clusters_to_sample = 10
# Number of optimizer steps taken on the separation loss for each training batch.
lr_steps = 3
# Array of the distinct DOT labels that sampling draws from.
# NOTE(review): the array is built from a set, so its element order is not
# guaranteed to be the same from run to run — relevant if runs are ever seeded.
cluster_nums = np.array(list(set(dot_labs)))

In [23]:
# Linear classifier over 100-dimensional inputs with one output per sampled
# cluster. Module.to() returns the module itself, so creation and device
# placement are chained.
LR = LogisticRegression(100, clusters_to_sample).to(device)
criterion = torch.nn.CrossEntropyLoss()
#lr_optim = torch.optim.SGD(LR.parameters(), lr=0.001)

In [24]:
# A single Adam optimizer updates both the embedding model and the classifier.
lr = .01
joint_params = [*model.parameters(), *LR.parameters()]
opt = torch.optim.Adam(joint_params, lr=lr)
#opt = torch.optim.Adam(model.parameters(), lr=lr)

In [25]:
# Run both corpora through the model's get_positions(); each resulting document
# is treated below as an indexable sequence of tensors.
train_pos = model.get_positions(train)
dot_positions = model.get_positions(dot)

# Replace every tensor of every document with its copy on the training device.
for positions in (train_pos, dot_positions):
    for doc in positions:
        for slot, tensor in enumerate(doc):
            doc[slot] = tensor.to(device)

In [26]:
#Run parameters
# Number of passes over the training data.
epochs = 3
# Running losses are printed every `print_every` batches ...
print_every = 100
# ... and written to the log every `log_every` batches.
log_every = 10
# Number of training documents per batch.
batch_size = 100

#Losses
# Per-batch combined loss (star loss + separation loss).
losses = []
# Per-batch separation (cross-entropy) loss.
separation_losses = []
# Per-epoch losses, seeded with a large sentinel so the first epoch always
# counts as the best one seen so far.
epoch_losses = [1e12]
log.info('Beginning run')

In [27]:
# dot_sample = dot_positions.copy()
# dot_y_sample = dot_labs.copy()
# dot_y_enc = torch.tensor(np.unique(dot_y_sample,return_inverse=True)[1]) #encoded

# dot_y_enc = dot_y_enc.to(device)

In [28]:
# epochs = 2 #to add epochs

In [29]:
#Real loop
# Joint training loop. For every batch of job ads it:
#   1. computes the StarSpace hinge loss (margin 0.1) between positive and
#      negative similarities,
#   2. samples `clusters_to_sample` DOT labels, embeds the matching DOT documents
#      and takes `lr_steps` optimizer steps on the classifier's cross-entropy
#      ("separation") loss,
#   3. back-propagates star_loss + separation_loss and takes one more step.
# After each epoch the embedding matrix is pickled if the epoch's loss is the
# lowest seen so far.
#
# NOTE(review): the DOT embeddings are built while the model's parameters have
# requires_grad=False, so separation_loss does not appear to send any gradient
# back into the embeddings (only into LR) — confirm whether that is intended.
# NOTE(review): `opt` also holds the model's parameters, so the opt.step() calls
# inside the lr_steps loop may move the embeddings through Adam's running
# averages even when their gradients are zero — confirm.
for epoch in range(epochs):
    permutation = torch.randperm(len(train_pos)).numpy()
    # Only ever set to True by nan_test(), whose call is currently commented out.
    nan_break = False

    # Step over the same collection the permutation was drawn for, so the
    # batch offsets can never run past the available indices.
    for i in range(0, len(train_pos), batch_size):
        indices = permutation[i:i+batch_size]
        batch = train_pos[indices]

        model.train()
        opt.zero_grad()

        l_batch, r_batch, neg_batch = model(batch)

        #Test for nans
        # if nan_test(l_batch, r_batch, neg_batch):
        #     break

        positive_similarity = torch.bmm(l_batch, r_batch.transpose(2, 1))
        negative_similarity = torch.bmm(l_batch, neg_batch.transpose(2, 1)).squeeze(1)

        # Hinge loss, summed (not averaged) over the whole batch.
        star_loss = torch.sum(torch.clamp(.1 - positive_similarity + negative_similarity, min=0))

        # Freeze the embedding model and unfreeze the classifier for the LR steps.
        for param in model.parameters():
            param.requires_grad = False

        for param in LR.parameters():
            param.requires_grad = True

        # Now add in clustering loss for DOT categories
        # Draw label values without replacement and select their documents.
        clusts = np.random.choice(cluster_nums, clusters_to_sample, False)
        mask = np.isin(dot_labs, clusts)

        dot_sample = dot_positions[mask].copy()
        dot_y_sample = dot_labs[mask].reset_index(drop=True)

        # Re-encode the sampled labels as consecutive integers starting at 0.
        dot_y_enc = torch.tensor(np.unique(dot_y_sample, return_inverse=True)[1]).to(device) #encoded

        new_dots = [torch.cat(doc) for doc in dot_sample]
        dot_emb = torch.stack([model.embed_doc(doc) for doc in new_dots])
        # Tensor.to() returns the moved tensor instead of moving it in place,
        # so the result has to be kept.
        dot_emb = dot_emb.to(device)

        for lr_step in range(lr_steps):
            opt.zero_grad()

            output = LR(dot_emb)
            separation_loss = criterion(output, dot_y_enc)
            # The graph is retained because the last separation_loss is
            # back-propagated again as part of the combined loss below.
            separation_loss.backward(retain_graph=True)
            opt.step()

        opt.zero_grad()

        # Swap the freeze: embeddings trainable again, classifier frozen.
        for param in model.parameters():
            param.requires_grad = True

        for param in LR.parameters():
            param.requires_grad = False

        #Combine losses
        loss = star_loss + separation_loss

        loss.backward()
        opt.step()

        losses.append(loss.detach().cpu().numpy())
        separation_losses.append(separation_loss.detach().cpu().numpy())

        # `losses` holds the combined loss, so it is reported as the total
        # rather than as the star loss alone.
        if i % (print_every*batch_size) == 0:
            print('separation avg loss: %s' % str(np.mean(separation_losses[-10:])))
            print('total avg loss: %s' % str(np.mean(losses[-10:])))
        if i % (log_every*batch_size) == 0:
            log.info('separation avg loss: %s' % str(np.mean(separation_losses[-10:])))
            log.info('total avg loss: %s' % str(np.mean(losses[-10:])))

    # End of inner loop
    if nan_break:
        print("you've got nans")
        log.warning("you've got nans")
        break

    print('Finished epoch %s at %s.' % (epoch,time.ctime()))
    log.info("Finished epoch %s" % str(epoch))

    # The epoch is scored by the mean of its last (up to) 100 batch losses,
    # not by the mean over the entire epoch.
    epoch_loss = np.mean(losses[-100:])

    if epoch_loss < min(epoch_losses):
        print('best epoch so far!')
        log.info('best epoch so far!')

        weights = model.embeddings.weight
        with open(OUTPUT_WEIGHTS + 'weights_best_epoch', 'wb') as f:
            pickle.dump(weights.data.detach().cpu().numpy(), f)

    epoch_losses.append(epoch_loss)
    print(epoch_losses)

separation avg loss: 4.7770324
star avg loss: 24770546.0
separation avg loss: 3.814164
star avg loss: 9369560.0
separation avg loss: 3.3791816
star avg loss: 8057316.0
separation avg loss: 4.1539197
star avg loss: 6445255.5
separation avg loss: 4.4883084
star avg loss: 6343706.0
separation avg loss: 3.7073474
star avg loss: 5537010.5
separation avg loss: 2.9170423
star avg loss: 5038729.5
separation avg loss: 3.8191924
star avg loss: 4724438.0
separation avg loss: 3.4805877
star avg loss: 4926741.0
separation avg loss: 6.098277
star avg loss: 4231895.0
Finished epoch 0 at Mon Jun 22 16:20:27 2020.
best epoch so far!
[1000000000000.0, 4583498.0]
separation avg loss: 4.3829336
star avg loss: 4075688.8
separation avg loss: 3.5581594
star avg loss: 3838062.5
separation avg loss: 3.516644
star avg loss: 3556132.0
separation avg loss: 4.4959025
star avg loss: 3541489.2
separation avg loss: 4.616294
star avg loss: 3475111.5
separation avg loss: 3.5470066
star avg loss: 3523014.0
separation av

In [33]:
# Pickle the final embedding matrix as a NumPy array, with the sample size in the file name.
weights = model.embeddings.weight
final_weights_path = OUTPUT_WEIGHTS + 'weights_%s' % SAMPLE_SIZE
with open(final_weights_path, 'wb') as weights_file:
    pickle.dump(weights.data.detach().cpu().numpy(), weights_file)

In [34]:
# Completion marker, sent to both the notebook output and the log file.
done_message = 'You made it!'
print(done_message)
log.info(done_message)

You made it!


In [32]:
# #Save the weights to CSV
# weights = model.input_embedding.weight
# weights = weights.data.detach().numpy()
# np.savetxt("weights_%s.csv" % SAMPLE_SIZE, weights, delimiter=",")