In [1]:
# ! pip install -e 'git://github.com/nandanrao/embed-software.git#egg=embed_software'

In [2]:
# ! pip install --quiet fuzzywuzzy gcsfs

In [3]:
import time
import sys
import logging
import math
import random
import json
import pickle
from collections import Counter

import numpy as np
import pandas as pd

from gcsfs import GCSFileSystem
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import davies_bouldin_score

import torch
import torch.nn as nn
from torch import optim
#import torchtext

from validation.data import *



In [4]:
from src.model import StarSpace

In [5]:
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

print("Using device", device)

Using device cuda


In [6]:
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)
formatter = logging.Formatter(fmt="%(asctime)s %(levelname)s: %(message)s", 
                          datefmt="%Y-%m-%d - %H:%M:%S")
fh = logging.FileHandler("separation_model.log", "w")
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
log.addHandler(fh)

In [7]:
SAMPLE_SIZE = 10000
SOC_LEVEL = 3
OUTPUT_WEIGHTS = 'data/separation/'

In [8]:
log.info('Pulling Indeed data for sample size %s' % SAMPLE_SIZE)

In [9]:
# Get job ads data
indeed = get_indeed_texts('../data/us/everything.csv',use_gcs=True,nrows=SAMPLE_SIZE)
indeed = indeed['content']

In [10]:
train = indeed.copy()
del indeed

In [11]:
# Get DOT classifications data
dot, dot_labs = dot_train_data(SOC_LEVEL)

dot.reset_index(drop=True,inplace=True)
dot_labs.reset_index(drop=True,inplace=True)

In [12]:
log.info('About to train vocab')

In [13]:
Vectorizer = CountVectorizer(min_df = 10,
                             max_df = .99)
Vectorizer.fit(train)

train_vocab = Vectorizer.vocabulary_

In [14]:
len(train_vocab)

8881

In [15]:
log.info('Trained Vocab of size %s' % str(len(train_vocab)))

In [16]:
# #Save the file
with open(OUTPUT_WEIGHTS + 'train_vocab_%s' % SAMPLE_SIZE, 'wb') as f:
    pickle.dump(train_vocab, f)

In [17]:
# To start from file:
# with open('data/separation/weights_100000', 'rb') as f:
#     embeddings = pickle.load(f)

# print(embeddings.shape)
# embeddings = torch.FloatTensor(embeddings)
# embeddings = nn.Embedding.from_pretrained(embeddings)

# with open('data/separation/train_vocab_100000', 'rb') as f:
#     vocab = pickle.load(f)

In [42]:
model = StarSpace(
    d_embed=100,
    vocabulary=train_vocab,
    k_neg = 10)
#     input_embedding = embeddings)

model.to(device)

lr = .01
opt = torch.optim.Adam(model.parameters(), lr=lr)

In [43]:
train_pos = model.get_positions(train)
dot_positions = model.get_positions(dot)

In [44]:
for i in range(len(train_pos)):
    for j in range(len(train_pos[i])):
        train_pos[i][j] = train_pos[i][j].to(device)

In [45]:
for i in range(len(dot_positions)):
    for j in range(len(dot_positions[i])):
        dot_positions[i][j] = dot_positions[i][j].to(device)

In [54]:
def davies_bouldin_torch(X, labels):
    n_cluster = len(set(labels))
    cluster_vals = list(set(labels))

    cluster_k = [X[labels==k,:] for k in cluster_vals]

    centroids = [torch.mean(k, axis = 0) for k in cluster_k]
    cent_dict = dict(zip(cluster_vals,centroids))

    variances = [torch.mean(torch.stack([torch.dist(p, centroids[i]) for p in k])) for i, k in enumerate(cluster_k)]
    var_dict = dict(zip(cluster_vals,variances))

    db = torch.empty((n_cluster,n_cluster)).to(device)
    eps = 1e-10 #for stability in denominator

    for i,k1 in enumerate(cluster_vals):
        for j,k2 in enumerate(cluster_vals):
            if (k1 != k2) & (i > j):
                result = (var_dict[k1] + var_dict[k2]) / (torch.dist(cent_dict[k1], cent_dict[k2]) + eps)
                db[i,j] = result
                db[j,i] = result
                
    result = torch.sum(torch.max(db,dim=0).values) / n_cluster
    if result > 10:
        return db
    return result

In [47]:
epochs = 3
print_every = 1
log_every = 10
batch_size = 100

losses = []
separation_losses = []
epoch_losses = [1e12]
log.info('Beginning run')

In [63]:
while(x < 10):
    x = davies_bouldin_torch(dot_emb,dot_y_sample)
    print('.')

.
.
.


RuntimeError: bool value of Tensor with more than one value is ambiguous

In [66]:
torch.sum(torch.max(x,dim=0).values) / len(set(dot_y_sample))

tensor(1.2557e+10, device='cuda:0', grad_fn=<DivBackward0>)

In [71]:
y = torch.max(x,dim=0).values
y

tensor([6.5185e+00, 7.0466e+00, 9.8232e+00, 6.3740e+00, 8.1572e+00, 7.6526e+00,
        6.2703e+00, 9.8232e+00, 5.7988e+00, 5.8477e+00, 6.3327e+00, 5.0389e+00,
        5.6420e+00, 7.3161e+00, 6.1348e+00, 9.4889e-01, 3.8429e+00, 5.8048e+00,
        4.8761e+00, 8.0756e+00, 8.5401e+00, 1.0237e+01, 8.3883e+00, 6.8071e+00,
        5.1875e+00, 5.3816e+00, 1.0237e+01, 9.2545e+00, 7.6173e+00, 7.6976e+00,
        8.1722e+00, 8.6940e+00, 8.5247e+00, 4.0819e+00, 5.2448e+00, 4.9542e+00,
        6.5773e+00, 6.2157e+00, 6.1479e+00, 5.7605e+00, 5.9457e+00, 6.8397e+00,
        8.5699e+00, 7.6187e+00, 5.9659e+00, 1.1747e+01, 1.1747e+01, 6.9087e+00,
        3.5392e+00, 4.6061e+00, 6.9007e+00, 7.1450e+00, 4.8003e+00, 6.3702e+00,
        6.1116e+00, 5.2164e+00, 5.1635e+00, 5.3191e+00, 4.2879e+00, 4.6119e+00,
        4.8052e+00, 5.3459e+00, 5.3161e+00, 4.6833e+00, 6.9087e+00, 4.6230e+00,
        6.6663e+00, 3.9963e+00, 3.6419e+00, 6.5071e+00, 3.1825e+00, 6.2579e+00,
        5.7861e+00, 4.1202e+00, 6.0080e+

In [77]:
x[92]

tensor([4.0124e+00, 4.1063e+00, 4.2540e+00, 3.1399e+00, 4.5659e+00, 3.7391e+00,
        5.1167e+00, 4.2183e+00, 3.9941e+00, 4.0391e+00, 3.7449e+00, 3.8253e+00,
        4.3522e+00, 4.9452e+00, 3.8664e+00, 5.1861e-01, 2.5564e+00, 3.5367e+00,
        3.6317e+00, 4.3446e+00, 4.7764e+00, 4.5923e+00, 3.8808e+00, 3.7931e+00,
        3.9228e+00, 3.4028e+00, 4.3385e+00, 5.1962e+00, 3.8609e+00, 3.4550e+00,
        4.2699e+00, 4.1593e+00, 3.8741e+00, 3.1816e+00, 3.3466e+00, 4.1650e+00,
        3.1915e+00, 4.0601e+00, 4.3805e+00, 3.8387e+00, 4.6269e+00, 4.2284e+00,
        4.3432e+00, 3.9235e+00, 4.8593e+00, 3.7482e+00, 3.9754e+00, 3.9658e+00,
        2.4308e+00, 3.1205e+00, 3.3779e+00, 3.9723e+00, 3.5346e+00, 4.3675e+00,
        3.6935e+00, 4.2207e+00, 2.8819e+00, 3.0272e+00, 2.8649e+00, 3.3688e+00,
        3.6996e+00, 3.8036e+00, 2.8774e+00, 2.9437e+00, 3.5333e+00, 2.5526e+00,
        3.3414e+00, 3.4850e+00, 2.7428e+00, 4.4995e+00, 2.7847e+00, 4.1063e+00,
        3.2932e+00, 3.4955e+00, 3.3841e+

In [51]:
#Real loop
for epoch in range(epochs):
    permutation = torch.randperm(len(train_pos)).numpy()
    nan_break = False
    
    for i in range(0,len(train), batch_size):
        indices = permutation[i:i+batch_size]
        batch = train_pos[indices]

        model.train(); opt.zero_grad()

        l_batch, r_batch, neg_batch = model(batch)
        
        # nan tests...
        l_test = np.isnan(np.mean(l_batch.detach().cpu().numpy()))
        r_test = np.isnan(np.mean(r_batch.detach().cpu().numpy()))
        neg_test = np.isnan(np.mean(neg_batch.detach().cpu().numpy()))
        if l_test or r_test or neg_test:
            nan_break = True
            break
        
        positive_similarity = torch.bmm(l_batch,r_batch.transpose(2,1))
        negative_similarity = torch.bmm(l_batch, neg_batch.transpose(2,1)).squeeze(1)

        star_loss = torch.sum(torch.clamp(.1 - positive_similarity + negative_similarity, min=0))
        
        # Now add in clustering loss for DOT categories
        idx = np.random.choice(len(dot_positions),5000)
        dot_sample = dot_positions[idx]
        dot_y_sample = dot_labs[idx].reset_index(drop=True)
        
        dot_emb = torch.empty([len(dot_sample),100],requires_grad=True).to(device)
        for j,doc in enumerate(dot_sample):
            doc_flat = torch.cat([torch.unsqueeze(z,0) for z in doc],1).squeeze(0).to(device)
            dot_emb[j] = model.embed_doc(doc_flat)

        separation_loss = davies_bouldin_torch(dot_emb,dot_y_sample)
        if separation_loss > 10:
            print('sep loss issue: %s' % str(separation_loss))
            nan_break = True
            break
        
        #Combine losses
        loss = star_loss + separation_loss

        loss.backward();opt.step()

        losses.append(star_loss.detach().cpu().numpy())
        separation_losses.append(separation_loss.detach().cpu().numpy())

        if i % (print_every*batch_size) == 0:
            print('separation avg loss: %s' % str(np.mean(separation_losses[-10:])))
            print('star avg loss: %s' % str(np.mean(losses[-10:])))
        if i % (log_every*batch_size) == 0:
            log.info('separation avg loss: %s' % str(np.mean(separation_losses[-10:])))
            log.info('star avg loss: %s' % str(np.mean(losses[-10:])))
    
    # End of inner loop
    if nan_break:
        print("you've got nans")
        log.warning("you've got nans")
        break
    
    print('Finished epoch %s at %s.' % (epoch,time.ctime()))
    log.info("Finished epoch %s" % str(epoch))
    
    epoch_loss = np.mean(losses[(len(losses)-100):])
    
    if epoch_loss < min(epoch_losses):
        print('best epoch so far!')
        log.info('best epoch so far!')
        
        weights = model.embeddings.weight
        with open(OUTPUT_WEIGHTS + 'weights_best_epoch', 'wb') as f:
            pickle.dump(weights.data.detach().cpu().numpy(), f)
    
    epoch_losses.append(epoch_loss)
    print(epoch_losses)

separation avg loss: 7.000402
star avg loss: 51795970.0
separation avg loss: 6.9682403
star avg loss: 46705896.0
separation avg loss: 6.9587893
star avg loss: 40460550.0
separation avg loss: 6.9912186
star avg loss: 41930256.0
separation avg loss: 6.9734917
star avg loss: 39674980.0
separation avg loss: 6.9584804
star avg loss: 37835556.0
separation avg loss: 6.9364424
star avg loss: 36274220.0
separation avg loss: 6.998781
star avg loss: 35051616.0
separation avg loss: 7.0450826
star avg loss: 33512200.0
separation avg loss: 7.0136743
star avg loss: 32475740.0
separation avg loss: 6.98145
star avg loss: 28773114.0
separation avg loss: 6.9661016
star avg loss: 26405724.0
separation avg loss: 6.9493346
star avg loss: 25643678.0
separation avg loss: 6.905468
star avg loss: 22514270.0
separation avg loss: 6.8607855
star avg loss: 21299674.0
separation avg loss: 6.8437386
star avg loss: 20470024.0
separation avg loss: 6.8279486
star avg loss: 19032980.0
separation avg loss: nan
star avg lo

In [None]:
weights = model.embeddings.weight
with open(OUTPUT_WEIGHTS + 'weights_%s' % SAMPLE_SIZE, 'wb') as f:
    pickle.dump(weights.data.detach().cpu().numpy(), f)

In [None]:
print('You made it!')
log.info('You made it!')

In [None]:
# #Save the weights to CSV
# weights = model.input_embedding.weight
# weights = weights.data.detach().numpy()
# np.savetxt("weights_%s.csv" % SAMPLE_SIZE, weights, delimiter=",")