In [1]:
#  ! pip install --quiet seaborn toolz fuzzywuzzy
#  ! pip install -e 'git://github.com/nandanrao/embed-software.git#egg=embed_software'

In [2]:
import time
import sys
import logging
import math
import random
import json
import pickle
from collections import Counter

import numpy as np
import pandas as pd

from gcsfs import GCSFileSystem
from sklearn.feature_extraction.text import CountVectorizer

import torch
import torch.nn as nn
from torch import optim
#import torchtext

from validation.data import *

In [3]:
from src.model import StarSpace

In [4]:
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)
formatter = logging.Formatter(fmt="%(asctime)s %(levelname)s: %(message)s", 
                          datefmt="%Y-%m-%d - %H:%M:%S")
fh = logging.FileHandler("bcohen1.log", "w")
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
log.addHandler(fh)

In [5]:
SAMPLE_SIZE = 200000
SOC_LEVEL = 3
BUBBLE_UP = 2
TRAIN_VALIDATE = False

In [6]:
log.info('Pulling Indeed data for sample size %s' % SAMPLE_SIZE)

In [7]:
indeed = get_indeed_texts('../data/us/everything.csv',use_gcs=True,nrows=SAMPLE_SIZE)
indeed = indeed['content']

In [8]:
if TRAIN_VALIDATE:
    idx = random.sample(range(len(indeed)),int(.8*len(indeed)))

    train = indeed[idx]
    train.reset_index(drop=True,inplace=True)

    valid = indeed.drop(idx)
    valid.reset_index(drop=True,inplace=True)

else:
    train = indeed.reset_index(drop=True)

In [9]:
log.info('About to train vocab')

In [10]:
Vectorizer = CountVectorizer(min_df = 5,
                             max_df = .95)
Vectorizer.fit(train)

train_vocab = Vectorizer.vocabulary_

# #Save the file
with open('data/train_vocab_%s' % SAMPLE_SIZE, 'wb') as f:
    pickle.dump(train_vocab, f)

In [11]:
# with open("data/train_vocab_%s" % SAMPLE_SIZE,"rb") as f:
#     train_vocab = pickle.load(f)

In [12]:
log.info('Trained Vocab')

In [13]:
model = StarSpace(
    d_embed=100,
    vocabulary=train_vocab,
    k_neg = 10,
    max_norm=20)

lr = .01
opt = torch.optim.Adam(model.parameters(), lr=lr)

In [14]:
for p in model.parameters():
    print(p.shape)

torch.Size([49307, 100])


In [15]:
epochs = 3
print_every = 100
log_every = 1
batch_size = 100

losses = []
epoch_losses = [1e10]
log.info('Beginning run')

In [16]:
# l_test, r_test, neg_test

# neg_batch[neg_batch!=neg_batch] = 0

# neg_batch[59][7]

In [17]:
for epoch in range(epochs):
    permutation = torch.randperm(len(train)).numpy()
    nan_break = False
    
    for i in range(0,len(train), batch_size):
        indices = permutation[i:i+batch_size]
        batch = train[indices]

        model.train(); opt.zero_grad()

        lhs = batch.values

        l_batch, r_batch, neg_batch = model(lhs)
        # nan tests...
        l_test = np.isnan(np.mean(l_batch.detach().numpy()))
        r_test = np.isnan(np.mean(r_batch.detach().numpy()))
        neg_test = np.isnan(np.mean(neg_batch.detach().numpy()))
        
        if l_test or r_test or neg_test:
            nan_break = True
            break
        
        positive_similarity = torch.bmm(l_batch,r_batch.transpose(2,1)) #this is the same as dot product by row

        negative_similarity = torch.bmm(l_batch, neg_batch.transpose(2,1)).squeeze(1)

        loss = torch.sum(torch.clamp(.1 - positive_similarity + negative_similarity, min=0))
        losses.append(loss.detach().numpy())

        loss.backward(); opt.step()

        accuracy_check = positive_similarity.squeeze(1) > negative_similarity[:,0].unsqueeze(1)
        acc = np.mean(accuracy_check.detach().numpy())

        if i % (print_every*batch_size) == 0:
            print('batch avg loss: %s' % str(np.mean(losses[(len(losses)-20):])))
        if i % (log_every*batch_size) == 0:
            log.info('batch avg loss: %s' % str(np.mean(losses[(len(losses)-20):])))
            
    if nan_break:
        print("you've got nans")
        log.warning("you've got nans")
        break
    
    print('Finished epoch %s at %s.' % (epoch,time.ctime()))
    log.info("Finished epoch %s" % str(epoch))
    
    epoch_loss = np.mean(losses[(len(losses)-100):])
    
    if epoch_loss < min(epoch_losses):
        print('best epoch so far!')
        log.info('best epoch so far!')
        
        weights = model.embeddings.weight
        with open('data/weights_best_epoch', 'wb') as f:
            pickle.dump(weights.data.detach().numpy(), f)
    
    epoch_losses.append(epoch_loss)
    print(epoch_losses)

batch avg loss: 17143512.0
batch avg loss: 10111545.0
batch avg loss: 8656699.0
batch avg loss: 8669787.0
batch avg loss: 7253537.0
batch avg loss: 6464210.5
batch avg loss: 6214781.0
batch avg loss: 5894807.5
batch avg loss: 5869583.0
batch avg loss: 5125495.0
batch avg loss: 4824659.0
batch avg loss: 5924941.5
batch avg loss: 4425893.5
batch avg loss: 4464367.0
batch avg loss: 4336255.0
batch avg loss: 4013234.0
batch avg loss: 3959154.8
batch avg loss: 3991960.8
batch avg loss: 3812307.5
batch avg loss: 3831147.2
Finished epoch 0 at Sat Jun 13 17:32:31 2020.
best epoch so far!
[10000000000.0, 3543451.8]
batch avg loss: 3792986.0
batch avg loss: 3546350.8
batch avg loss: 3253846.0
batch avg loss: 3155780.8
batch avg loss: 3058593.2
batch avg loss: 2801615.5
batch avg loss: 3168978.5
batch avg loss: 2783515.5
batch avg loss: 2805510.0
batch avg loss: 2682542.5
batch avg loss: 2716440.5
batch avg loss: 2435217.5
batch avg loss: 2459549.8
batch avg loss: 2394428.5
batch avg loss: 242601

In [18]:
weights = model.embeddings.weight
with open('data/weights_%s' % SAMPLE_SIZE, 'wb') as f:
    pickle.dump(weights.data.detach().numpy(), f)

In [19]:
print('You made it!')
log.info('You made it!')

You made it!


In [None]:
# #Save the weights to CSV
# weights = model.input_embedding.weight
# weights = weights.data.detach().numpy()
# np.savetxt("weights_%s.csv" % SAMPLE_SIZE, weights, delimiter=",")

### Old stuff

In [None]:
#Validate

In [None]:
# model.eval()
# val_acc = 0

In [None]:
# for j in range(0,len(valid), batch_size):
#     valperm = torch.randperm(len(valid)).numpy()
#     val_indices = valperm[j:j+batch_size]
#     val_batch = valid[val_indices]

#     val_lhs = val_batch.values

#     val_l_batch, val_r_batch, val_neg_batch = model(val_lhs)

#     val_positive_similarity = torch.bmm(val_l_batch,val_r_batch.transpose(2,1))
#     val_negative_similarity = torch.bmm(val_l_batch, val_neg_batch.transpose(2,1)).squeeze(1)

#     val_loss = torch.sum(torch.clamp(.1 - val_positive_similarity + val_negative_similarity, min=0))
#     if j % (print_every*batch_size) == 0:
#         print(val_loss)
    
#     val_accuracy_check = val_positive_similarity.squeeze(1) > val_negative_similarity[:,0].unsqueeze(1)
#     val_acc += np.sum(val_accuracy_check.detach().numpy())

In [None]:
# print(val_acc/len(valid))

In [None]:
# from collections import Counter

# def build_vocab(train, min_ct = 2):
#     ''' build vocabulary for an array/list/series of text '''
#     # To do: smaller groups before aggregating to improve performance
#     def wordcount_df(doc):
#         tok = doc.split()
#         d = pd.DataFrame.from_dict(Counter(tok),orient='index').reset_index().rename(columns={'index':'word'})
#         return d

#     d_list = [wordcount_df(x) for x in train]

#     d = pd.concat(d_list,axis=0)

#     d = d.groupby(['word'])[0].sum().sort_values(ascending=False)
#     d = d[d >= min_ct]
    
#     d = dict(zip(d.index.values, range(len(d))))
    
#     return d

### Begin the training loop

In [None]:
# batch = train[0:100]
# validation = train[100:150]
# batch_size = 100

# model.train()
# opt.zero_grad()

# lhs = batch.values

# l_batch, r_batch, neg_batch = model(lhs)

# positive_similarity = torch.bmm(l_batch,r_batch.transpose(2,1)) #this is the same as dot product by row

# negative_similarity = torch.bmm(l_batch, neg_batch.transpose(2,1)).squeeze(1)

# loss = torch.mean(torch.clamp(.1 - positive_similarity + negative_similarity, min=0))
# loss

# loss.backward(); opt.step()

### Calculate accuracy of predictions in current batch

In [None]:
# accuracy_check = positive_similarity.squeeze(1) > negative_similarity[:,0].unsqueeze(1)
# acc = np.mean(similarity_check.detach().numpy())
# print(acc)

In [None]:
# def embed_doc(d,vocab,embedding,normalize=False):
#     positions = []
#     for t in d:
#         try:
#             positions.append(vocab[t])
#         except KeyError:
#             pass
#     output = torch.sum(embedding(torch.LongTensor(positions)),dim=0)
#     if normalize:
#         output = output / output.norm()
#     return output

In [None]:
# # similarity
# l_batch = []
# r_batch = []
# neg_batch = []

# for i in range(len(batch)):
#     #Positive similarity
#     s = batch.values[i].split('\t') #sentences
#     if type(s) == str: #only one sentence in s
#         a = s
#         b = s
#     else:
#         a, b = np.random.choice(s, 2, False)
    
#     a = a.split()
#     b = b.split()
    
#     a_emb = embed_doc(a,train_vocab,input_embedding,normalize=True)
#     b_emb = embed_doc(b,train_vocab,input_embedding,normalize=True)
    
#     l_batch.append(a_emb)
#     r_batch.append(b_emb)

#     #Negative similarity
#     negs = []
#     for _i in range(k * 3):
#         index = np.random.choice(len(batch))
#         if not index == i: #if it's not from the same document
#             c = batch.values[index].split('\t')
#             c = np.random.choice(c, 1)[0].split()
#             c_emb = embed_doc(c,train_vocab,input_embedding,normalize=True)
#             negs.append(c_emb)
#             if(len(negs) >= k):
#                 break
    
#     neg_batch.append(torch.stack(negs))