In [84]:
#  ! pip install --quiet seaborn toolz fuzzywuzzy
#  ! pip install -e 'git+https://github.com/nandanrao/embed-software.git#egg=embed_software'

In [86]:
import time
import math
import random
import json
from collections import Counter

import numpy as np
import pandas as pd

from gcsfs import GCSFileSystem

import torch
import torch.nn as nn
from torch import optim
#import torchtext

In [88]:
from src.model import InnerProductSimilarity, MarginRankingLoss, StarSpace

In [12]:
# Notebook-wide settings; none of them is referenced by the cells visible in this section.
SAMPLE_SIZE = 100000  # presumably a row/sample cap for data loading — TODO confirm where it is consumed
SOC_LEVEL = 2  # presumably the SOC code granularity (cf. the `soc_n` argument of indeed_test_data) — TODO confirm
BUBBLE_UP = 2  # NOTE(review): purpose is not evident from the visible cells — confirm with the consumer

In [10]:
def get_indeed_texts(path, use_gcs = False, **kwargs):
    """Load the Indeed postings CSV into a DataFrame with lower-cased titles.

    Args:
        path: Location of the CSV. It is handed straight to ``pd.read_csv``
            when ``use_gcs`` is false; otherwise its first ``'..'`` is
            replaced with ``'lmd-classify-dot'`` and the file is opened
            through ``GCSFileSystem``.
        use_gcs: Read through Google Cloud Storage instead of ``path`` as-is.
        **kwargs: Forwarded unchanged to ``pd.read_csv`` (e.g. ``nrows``).

    Returns:
        The parsed DataFrame with its ``title`` column lower-cased.
    """
    if not use_gcs:
        frame = pd.read_csv(path, **kwargs)
    else:
        gcs = GCSFileSystem(project='labor-market-data')
        # Only the first '..' is swapped, turning the relative path into a GCS one.
        remote_path = path.replace('..', 'lmd-classify-dot', 1)
        with gcs.open(remote_path) as handle:
            frame = pd.read_csv(handle, **kwargs)

    lowered_titles = frame.title.str.lower()
    frame['title'] = lowered_titles
    return frame

def indeed_test_data(texts, lim, soc_n, use_gcs = False):
    """Build test data from the Indeed CSV.

    Reads at most ``lim`` rows via ``get_indeed_texts``, runs them through the
    matcher returned by ``make_matcher()``, and re-indexes the result by the
    original row index.

    Args:
        texts: CSV path passed to ``get_indeed_texts``.
        lim: Row limit, forwarded as ``nrows``.
        soc_n: Second argument given to ``get_soc_n`` together with the
            matched ``code`` column.
        use_gcs: Forwarded to ``get_indeed_texts``.

    Returns:
        A 3-tuple: the matched ``content`` column, the result of
        ``get_soc_n(matches.code, soc_n)``, and the index of the matches.
    """
    raw_postings = get_indeed_texts(texts, use_gcs, nrows=lim)
    match_fn = make_matcher()
    # reset_index exposes the row number as an 'index' column so it can be
    # restored as the index once the matcher has run.
    matched = match_fn(raw_postings.reset_index()).set_index('index')
    contents = matched.content
    soc_codes = get_soc_n(matched.code, soc_n)
    return contents, soc_codes, matched.index

In [65]:
# Read the first 10000 rows of the postings CSV through GCS and keep only the
# `content` column as the training texts.
train = get_indeed_texts(
    '../data/us/everything.csv',
    use_gcs=True,
    nrows=10000,
)['content']
train.head()

0    part time temporary do you have or know someon...
1    40 000 46 000 year lead electrician minimum ye...
2    front desk position chiropractic office monday...
3    110 000 130 000 year job title sec reporting m...
4    internship avakas is unique place where ideas ...
Name: content, dtype: object

In [81]:
from collections import Counter

def build_vocab(train, min_ct = 2):
    ''' Build a word-frequency vocabulary for an array/list/series of text.

    Each document is split on whitespace and the token counts are summed over
    all documents in a single pass (one running Counter instead of one
    DataFrame per document).

    Args:
        train: Iterable of strings (list, array or Series of documents).
        min_ct: Minimum total count a word needs to be kept.

    Returns:
        A pandas Series named ``0`` whose index (named ``'word'``) holds the
        words and whose values are their total counts, ordered from most to
        least frequent and restricted to counts ``>= min_ct``. Empty input,
        or input made only of empty documents, yields an empty Series
        instead of raising.
    '''
    totals = Counter()
    for doc in train:
        totals.update(doc.split())

    vocab = pd.Series(dict(totals), name=0, dtype='int64').rename_axis('word')
    # Alphabetical order first, then by count: this mirrors the ordering of a
    # groupby-sum followed by a descending sort on the counts.
    vocab = vocab.sort_index().sort_values(ascending=False)

    return vocab[vocab >= min_ct]

In [82]:
# Word counts over the `train` texts; build_vocab's default min_ct=2 drops words seen only once.
train_vocab = build_vocab(train)

In [72]:
class NegativeSampling():
    """Random sampler of output indices in ``[0, n_output)``.

    Attributes set by the constructor:
        n_output: Number of distinct indices that can be drawn uniformly.
        n_negative: Stored for callers; ``sample`` itself does not read it.
        weights: Optional tensor of sampling weights. When given, indices are
            drawn with ``torch.multinomial`` from these weights instead of
            uniformly.
    """

    def __init__(self, n_output, n_negative=5, weights=None):
        super().__init__()
        self.n_output = n_output
        self.n_negative = n_negative
        self.weights = weights

    def sample(self, n_samples):
        """Draw ``n_samples`` indices (with replacement) as a 1-D int64 tensor.

        Args:
            n_samples: How many indices to draw.

        Returns:
            A tensor of shape ``(n_samples,)``. With ``weights`` set, values
            index into ``weights``; otherwise every index in
            ``[0, n_output)`` is equally likely.
        """
        # Compare against None explicitly: truth-testing a tensor with more
        # than one element raises a RuntimeError.
        if self.weights is not None:
            return torch.multinomial(self.weights, n_samples, replacement=True)
        # randint gives each index the same probability; rounding a continuous
        # uniform draw would halve the probability of the two end indices.
        return torch.randint(0, self.n_output, (n_samples,))

In [None]:
# train_iter, val_iter = data.BucketIterator.splits(
#     (train, validation), batch_size=batch_size, device=gpu)

In [90]:
# TODO: implement loading from snapshot
# Instantiate the StarSpace model imported from src.model. Input and output
# sizes are both set to the vocabulary length.
model = StarSpace(
    d_embed=100,  # presumably the embedding dimensionality — TODO confirm against src.model
    n_input=len(train_vocab),
    n_output=len(train_vocab),
    similarity=InnerProductSimilarity(),
    max_norm=20,  # NOTE(review): looks like a cap on embedding norms — confirm in src.model
    aggregate=torch.sum)  # presumably how per-token embeddings are combined — TODO confirm