In [1]:
import time
import pickle

import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

#from scipy.sparse import vstack 

from validation.data import *
from validation.scoring import bubbleup_score, BubbleUpMixin

from src.model import StarSpace

import torch
import torch.nn as nn
from torch import optim

In [2]:
# Notebook-wide settings; every value below is consumed by later cells.
# Passed to indeed_test_data as its sample-size argument (alternative: 500000).
SAMPLE_SIZE = 100000  # 500000
# Passed to both dot_train_data and indeed_test_data.
# Presumably the number of SOC-code digits kept in the labels -- TODO confirm.
SOC_LEVEL = 6
# Passed to set_bubbles() on the classifier and to get_soc_n() when scoring.
BUBBLE_UP = 3
# When False, training rows for which get_soc_n(y_train, 2) equals 51 are dropped.
PROD = False

In [3]:
# Training documents and labels at SOC_LEVEL granularity.
X_train, y_train = dot_train_data(SOC_LEVEL)
# Test documents, labels and ids, read with a SAMPLE_SIZE argument and
# use_gcs=True.
X_test, y_test, ids = indeed_test_data('../data/us/everything.csv', SAMPLE_SIZE, SOC_LEVEL, use_gcs=True)

# Truthiness check instead of `== False` (PEP 8: never compare booleans with ==).
if not PROD:
    # Keep only training rows whose get_soc_n(..., 2) value is not 51.
    # Presumably 51 is the 2-digit SOC major group for production
    # occupations -- TODO confirm against get_soc_n / the SOC taxonomy.
    noprod_idx = get_soc_n(y_train.astype(str), 2) != 51
    X_train, y_train = X_train[noprod_idx], y_train[noprod_idx]

In [4]:
# Replace the existing index with a fresh 0..n-1 range, discarding the old one
# (drop=True). The boolean-mask filtering in the loading cell can leave gaps in
# the index when PROD is False.
X_train = X_train.reset_index(drop=True)

# Preview the first few training documents.
X_train.head()

0    performs as nonspeaking member of scene in sta...
1    directs and coordinates through subordinate su...
2    circumcises jewish male infants in accordance ...
3    researches plans designs and administers build...
4    designs and oversees construction and repair o...
dtype: object

In [5]:
# Disabled alternative: read a pickled weight matrix from GCS instead of disk.
# path = 'lmd-classify-dot/ss-models/bcohen/weights_e5'
# fs = GCSFileSystem(project='labor-market-data')
# with fs.open(path) as f:
#     embeddings = pickle.load(f)

# NOTE(review): pickle.load can execute arbitrary code -- only load files from
# a trusted source.
with open('data/separation/weights_10000', 'rb') as weights_file:
    pretrained_weights = pickle.load(weights_file)

print(pretrained_weights.shape)

# Convert the loaded matrix to a float tensor and wrap it in an nn.Embedding.
# `embeddings` is the name the model-construction cell passes to StarSpace.
weight_tensor = torch.FloatTensor(pretrained_weights)
embeddings = nn.Embedding.from_pretrained(weight_tensor)

(8881, 100)


In [6]:
# path = 'lmd-classify-dot/ss-models/bcohen/train_vocab_e5'
# fs = GCSFileSystem(project='labor-market-data')
# with fs.open(path) as f:
#     vocab = pickle.load(f)

In [7]:
# Load the pickled training vocabulary that is handed to StarSpace below.
# NOTE(review): pickle.load can execute arbitrary code -- trusted files only.
with open('data/separation/train_vocab_10000', 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)

In [8]:
# Build the StarSpace model around the pre-trained embedding table.
# d_embed=100 matches the second dimension printed for the loaded weights.
model = StarSpace(
    d_embed=100,
    vocabulary=vocab,
    k_neg=10,
    input_embedding=embeddings,
)

In [9]:
# Convert both document sets with StarSpace.get_positions; the result is what
# the embedding cell iterates over and passes to torch.cat.
# Presumably these are vocabulary index positions -- TODO confirm in src.model.
# NOTE: X_train / X_test are rebound here, so this cell is not idempotent;
# re-run the data-loading cells before executing it a second time.
X_train = model.get_positions(X_train)
X_test = model.get_positions(X_test)

In [10]:
# Width of one document embedding; must match the d_embed given to StarSpace.
EMBED_DIM = 100


def _embed_docs(docs, dim=EMBED_DIM):
    """Embed every document in `docs` with the notebook-level `model`.

    Parameters
    ----------
    docs : indexable collection exposing ``.shape[0]``
        Output of ``model.get_positions``; each item is a sequence of tensors
        that is concatenated with ``torch.cat`` before embedding.
    dim : int, optional
        Number of columns in the returned matrix (defaults to EMBED_DIM).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(docs.shape[0], dim)`` with one embedding per row.
    """
    doc_embeddings = np.empty([docs.shape[0], dim])
    # Inference only: skip autograd bookkeeping while embedding.
    with torch.no_grad():
        for i, doc in enumerate(docs):
            doc_embeddings[i] = model.embed_doc(torch.cat(doc))
    return doc_embeddings


# Embed the training documents and print the elapsed wall-clock seconds.
start_time = time.time()
X_train_emb = _embed_docs(X_train)
print(time.time() - start_time)

# Same for the test documents.
start_time = time.time()
X_test_emb = _embed_docs(X_test)
print(time.time() - start_time)

1.7283744812011719
2.638395071029663


In [11]:
class BubbleUpLogisticRegression(BubbleUpMixin, LogisticRegression):
    """LogisticRegression combined with BubbleUpMixin; adds no members itself.

    BubbleUpMixin is listed first, so its attributes take precedence over
    LogisticRegression's in the method resolution order. The mixin supplies
    ``set_bubbles`` (used when the classifier is built below).
    Presumably it maps predictions up to a coarser SOC level -- TODO confirm
    in validation.scoring.
    """
    pass

In [12]:
# Configure the classifier: multinomial logistic regression with balanced
# class weights, using all available cores.
Bubbles = BubbleUpLogisticRegression(
    C=2.,
    solver='lbfgs',
    class_weight='balanced',
    multi_class="multinomial",
    n_jobs=-1,
)
# set_bubbles returns the object that is kept as the classifier.
Bubbles = Bubbles.set_bubbles(BUBBLE_UP)

# Train on the document embeddings; the cell displays fit()'s return value.
Bubbles.fit(X_train_emb, y_train)

BubbleUpLogisticRegression(C=2.0, class_weight='balanced', dual=False,
                           fit_intercept=True, intercept_scaling=1,
                           l1_ratio=None, max_iter=100,
                           multi_class='multinomial', n_jobs=-1, penalty='l2',
                           random_state=None, solver='lbfgs', tol=0.0001,
                           verbose=0, warm_start=False)

In [13]:
# Predict a label for every embedded test document with the fitted classifier.
# The next cell scores these against y_test reduced via get_soc_n(..., BUBBLE_UP).
y_pred = Bubbles.predict(X_test_emb)

In [14]:
# Reduce the true test labels with get_soc_n at BUBBLE_UP and cast to str,
# then compute the fraction of predictions that match exactly.
y_test_bubbled = get_soc_n(y_test.astype(str), BUBBLE_UP).astype(str)
accuracy_score(y_test_bubbled, y_pred)

0.1013003261986684

## separation loss

In [15]:
from sklearn.metrics import davies_bouldin_score

# Davies-Bouldin index of the training embeddings, using the training labels
# as cluster assignments. Per the scikit-learn documentation, lower values
# indicate better-separated clusters.
separation_score = davies_bouldin_score(X_train_emb, y_train)
separation_score

7.2641951678592624