In [None]:

! pip install --quiet fuzzywuzzy
! pip install --quiet nltk
! pip install --quiet diskcache
! pip install --quiet python-Levenshtein
! pip install --quiet lightgbm
! pip install --quiet lime
! pip install -e 'git://github.com/nandanrao/embed-software.git#egg=embed_software'

In [None]:
! conda install -c numba --yes numba

In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import re
import attr
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from scipy.sparse import vstack 

from validation.data import indeed_test_data, dot_train_data, get_soc_n
from embed_software.preprocess import *
from embed_software.utils import get_embeddings, embed_docs
from validation.dot_data import LemmaTokenizer, get_dictionary
from classification.embedding import PreEmbeddedVectorizer, Embedding, WordEmbeddingVectorizer

pd.set_option('max_colwidth',50)

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [15, 10]

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
from itertools import islice
from embed_software.preprocess import claims_processor, readme_processor, Preprocessor
import json
from toolz import dissoc

# Bound `process` method of a Preprocessor built from readme_processor; it is
# mapped over job descriptions further down.
# NOTE(review): the meaning of the second argument (4) is not visible in this
# notebook — confirm against embed_software.preprocess.
string_processor = Preprocessor(readme_processor, 4).process

def virginia_data(path):
    """Load job postings from a JSON-lines file.

    Each non-blank line of ``path`` is parsed as one JSON object. The keys
    '@context', '@type', 'jobLocation', 'baseSalary' and '_id' are removed
    from every record; keys that are absent are simply ignored.

    Args:
        path: Path to a file containing one JSON object per line.

    Returns:
        A list of dicts, one per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    dropped = ('@context', '@type', 'jobLocation', 'baseSalary', '_id')
    # JSON text is UTF-8; be explicit so the platform default encoding cannot
    # garble non-ASCII characters.
    with open(path, encoding='utf-8') as f:
        # Blank lines (e.g. padding at the end of the file) are skipped instead
        # of crashing json.loads.
        records = (json.loads(line) for line in f if line.strip())
        return [{key: value for key, value in record.items() if key not in dropped}
                for record in records]

def virginia_test_data(path, N):
    """Build the Virginia evaluation set from a JSON-lines file of postings.

    Args:
        path: File handed to ``virginia_data``.
        N: Second argument forwarded to ``get_soc_n``.

    Returns:
        A 3-tuple ``(descriptions, codes, frame)``: the descriptions after
        ``string_processor`` for rows whose ``onet_soc_code`` is not the empty
        string, the result of ``get_soc_n`` on those rows' codes, and the full
        DataFrame of postings.
    """
    frame = pd.DataFrame(virginia_data(path))
    processed = frame.description.map(string_processor)
    raw_codes = frame.onet_soc_code
    # Only the empty string is treated as "no code" here.
    has_code = raw_codes != ''
    return processed[has_code], get_soc_n(raw_codes[has_code], N), frame


In [4]:
SOC_LEVEL = 6

In [None]:
# Training data from dot_train_data and Virginia evaluation data, both at SOC_LEVEL.
X_train, y_train = dot_train_data(SOC_LEVEL)
X_test, y_test, va_df = virginia_test_data('../data/va_job_posts.json', SOC_LEVEL)
# X_test, y_test = matches.description, get_soc_n(matches.code, 6)
# NOTE(review): virginia_test_data already maps string_processor over the
# descriptions, so this applies it a second time — confirm that is intended.
X_test = X_test.map(string_processor)

In [13]:
va_df['title'] = va_df.title.str.lower()

In [15]:
from validation.data import make_matcher

# Keep postings whose onet_soc_code is not NaN; reset_index() moves the original
# row labels into an 'index' column.
d = va_df[va_df.onet_soc_code.notna()].reset_index()
matcher = make_matcher()
# Re-index the matcher output by that 'index' column.
matches = matcher(d).set_index('index')

In [None]:
# matches[['title', 'assigned_title', 'code', 'onet_soc_code', 'occupationalCategory']].head(20).loc[352]

In [None]:
# Pipeline: WordEmbeddingVectorizer built from the GloVe vector file, followed by
# a class-balanced multinomial logistic regression.
# NOTE: this binds the shared name `model`; the later pipeline cell rebinds it.
model = Pipeline([('glove_100_va', WordEmbeddingVectorizer('../glove-models/glove-va-100.txt')),
                  ('lr', LogisticRegression(C=5., solver='newton-cg', class_weight='balanced', multi_class="multinomial", n_jobs=-1))])

model.fit(X_train, y_train)

In [None]:
# Pipeline: PreEmbeddedVectorizer pointed at '../ss-models/va-ss-100' (with the
# 'va_embed_cache' cache directory), followed by a class-balanced multinomial
# logistic regression.
# NOTE: this rebinds `model`, replacing whatever pipeline was fitted before it.
model = Pipeline([('sentencespace_100_va', PreEmbeddedVectorizer('../ss-models/va-ss-100', cache_dir='va_embed_cache')),
                  ('lr', LogisticRegression(C=5., solver='newton-cg', class_weight='balanced', multi_class="multinomial", n_jobs=-1))])

model.fit(X_train, y_train)

In [10]:
def _get_soc_n(df, n):
    return (df.T
            .reset_index()
            .pipe(lambda df: df.assign(soc = df['index'].map(lambda i: str(i)[0:n])))
            .set_index('soc')
            .drop('index', 1)
            .groupby('soc').sum().T)


def get_pred(model, X, n=3):
    """Predict class probabilities and aggregate them to ``n``-character prefixes.

    Args:
        model: Fitted estimator exposing ``predict_proba`` and ``classes_``.
        X: Samples passed to ``model.predict_proba``.
        n: Prefix length handed to ``_get_soc_n``. Defaults to 3, the value
            that was previously hard-coded.

    Returns:
        The DataFrame produced by ``_get_soc_n``: one row per sample, one
        column per class-label prefix.
    """
    # Columns are labelled with the model's class labels.
    probabilities = pd.DataFrame(model.predict_proba(X), columns=model.classes_)
    return _get_soc_n(probabilities, n)

class UpscaleModel(LogisticRegression):
    """LogisticRegression that can report probabilities at a coarser code level."""

    def predict_soc_n(self, X, n):
        """Return predicted probabilities aggregated to ``n``-character prefixes.

        Args:
            X: Samples passed to ``predict_proba``.
            n: Prefix length handed to ``_get_soc_n``.

        Returns:
            The DataFrame produced by ``_get_soc_n``: one row per sample, one
            column per class-label prefix.
        """
        # Use the fitted estimator's own class labels instead of a notebook
        # global named `labels`, which is only assigned in a much later cell.
        probabilities = pd.DataFrame(self.predict_proba(X), columns=self.classes_)
        # _get_soc_n is a module-level function, not a method of this class.
        return _get_soc_n(probabilities, n)
    
    
def make_title_lookup(path, N):
    """Build a function mapping a code to its ``desc_soc{N}`` description.

    Args:
        path: Currently unused.
            NOTE(review): get_dictionary is called with '' rather than ``path``
            — confirm whether ``path`` should be forwarded.
        N: Level passed to ``get_dictionary`` and used to pick the
            ``desc_soc{N}`` column.

    Returns:
        A function ``lookup(code)`` that returns the description stored under
        ``int(code)``, or ``code`` itself when no description can be found.
    """
    # First row per 'soc' group, reduced to a {soc: description} mapping.
    descriptions = (get_dictionary('', N)
                    .groupby('soc')
                    .first()[f'desc_soc{N}']
                    .to_dict())

    def lookup(code):
        try:
            return descriptions[int(code)]
        except (KeyError, ValueError, TypeError):
            # Unknown codes and codes that cannot be converted to int both fall
            # back to the input unchanged.
            return code
    return lookup

In [21]:
from validation.data import get_title_lookup
from validation.scoring import bubbleup_score


# Table with (at least) `title` and `code` columns, loaded via get_title_lookup('crosswalks').
title_lookup = get_title_lookup('crosswalks')
# NOTE(review): the literal 6 equals SOC_LEVEL as set earlier in the notebook;
# presumably the two should stay in sync — confirm. Re-running this cell applies
# get_soc_n to the already-converted column.
title_lookup['code'] = get_soc_n(title_lookup.code, 6)

In [30]:
! pip install --quiet strsim

In [58]:
from similarity.qgram import QGram

qgram = QGram(1)

In [None]:
# NOTE(review): in the visible notebook `lookup` is only ever a local name inside
# functions, so this cell presumably raises NameError on a fresh kernel — confirm
# or remove.
lookup

In [155]:
from similarity.metric_lcs import MetricLCS

def string_match_title(title_lookup, title, codes, n_guesses=5):
    """Rank candidate codes for a job title by MetricLCS distance between titles.

    Args:
        title_lookup: DataFrame with ``title`` and ``code`` columns.
        title: Job title to match.
        codes: Codes to restrict the candidates to.
        n_guesses: Maximum number of codes to return. Defaults to 5, the value
            that was previously hard-coded.

    Returns:
        Series named ``code`` holding the codes of the ``n_guesses`` candidate
        titles with the smallest distance, closest first. The same code may
        appear more than once.
    """
    lcs = MetricLCS()
    # Both titles are compared as lists of whitespace-separated tokens; the
    # query title is split once rather than once per candidate.
    title_tokens = title.split()

    candidates = title_lookup[title_lookup.code.isin(codes)].reset_index(drop=True)
    candidates = candidates.assign(
        distance=[lcs.distance(candidate.split(), title_tokens)
                  for candidate in candidates.title])
    # head() is explicitly positional, unlike [:5] on an integer-labelled Series.
    return candidates.sort_values('distance').code.head(n_guesses)

string_match_title(title_lookup, 'chief officer', [112011, 111011])

1     111011
2     111011
18    111011
50    111011
17    111011
Name: code, dtype: int64

In [150]:
title_lookup

Unnamed: 0,title,code
0,ceo,111011
1,chief executive officer,111011
2,chief operating officer,111011
3,commissioner of internal revenue,111011
4,coo,111011
5,county commissioner,111011
6,government service executive,111011
7,governor,111011
8,mayor,111011
9,department store general manager,111021


In [None]:
idx = X_test.notna()

bubbleup_score(y_train, X_test[idx], y_test[idx], model)

In [21]:
# GloVe vectors on VA

idx = X_test.notna()

bubbleup_score(y_train, X_test[idx], y_test[idx], model)

0.20506294803666322

In [75]:
idx = X_test.notna()

bubbleup_score(y_train, X_test[idx], y_test[idx], model)

0.43231615726227796

In [74]:
# Re-score against the codes stored in `matches`, restricted to rows whose
# onet_soc_code is not the empty string.
# NOTE(review): relies on `idx` from an earlier cell and on `matches` sharing
# X_test's row labels — confirm the boolean masks align.
yidx = (matches[idx].onet_soc_code != '')
y_test_va = get_soc_n(matches[idx][yidx].onet_soc_code, 6)

bubbleup_score(y_train, X_test[idx][yidx], y_test_va, model)

0.39304142709715467

In [77]:
# VA!
# X_test, y_test, va_df = virginia_test_data('../data/va_job_posts.json', SOC_LEVEL)

# df = pd.DataFrame({'X': X_test, 'y': y_test})
# sample = df[df.X.notna()].sample(50000)

# preds = model.predict_proba(sample.X)

# NOTE(review): `preds` and `sample` are only assigned in the commented-out lines
# above, and get_soc_n_preds is not defined anywhere in the visible notebook —
# this cell depends on leftover kernel state.
labels = np.unique(y_train)
df = pd.DataFrame(preds)
df.columns = labels

# Compares get_soc_n_preds(df, 3) with the first three characters of each true code.
accuracy_score(get_soc_n_preds(df, 3).values, sample.y.astype(str).map(lambda s: s[0:3]))

# preds = get_top_soc_n_preds(df, 3, 1)
# istop = [y in preds[i] for i,y in enumerate(sample.y.astype(str).map(lambda s: s[0:3]))]
# np.array(istop).mean()    

0.2956

In [13]:
# UK - 6/3
df = pd.DataFrame(preds[0])
df.columns = labels
# np.save('ss_models/sentencespace_100_uk/predictions-63', get_soc_n(df, 3).values)
accuracy_score(get_soc_n(df, 3).values, y_test.astype(str).map(lambda s: s[0:3]))

0.41385476727667753

In [None]:
# US - 6/3
df = pd.DataFrame(preds[0])
df.columns = labels
# np.save('ss_models/sentencespace_100_india/predictions-63', get_soc_n(df, 3).values)
accuracy_score(get_soc_n(df, 3).values, y_test.astype(str).map(lambda s: s[0:3]))

0.48249329065506391

In [None]:
# India - predicting at 6, aggregating to 3
# np.save('ss_models/sentencespace_100_india/predictions-63', get_soc_n(df, 3).values)
accuracy_score(get_soc_n(df, 3).values, y_test.astype(str).map(lambda s: s[0:3]))

0.43210204120311579

In [9]:
# India - Sentencespace 100
[get_accuracy(p, y_test) for p in preds]

[0.40573887004150638]

In [16]:
# US - Sentencespace 100
[get_accuracy(p, y_test) for p in preds]

[0.42613479319168668]

In [139]:
# India
[get_accuracy(p, y_test) for p in preds]

[0.23045040528118993, 0.02092420823932481, 0.023130274922704103]

In [94]:
# UK 
[get_accuracy(p, y_test) for p in preds]

[0.23045040528118993, 0.02092420823932481, 0.023130274922704103]

In [None]:
# US
[get_accuracy(p, y_test) for p in preds]

[0.30347637686457379, 0.36610287365032945, 0.025223570530595506]

In [None]:
# OLD - SOC2?
[accuracy_score(p, y_test) for p in preds]

[0.49040702886856735, 0.58729304883151034, 0.57982188751419517]

In [46]:
p = pd.DataFrame(preds).T.assign(y = y_test.values)

differ = p[p[0] != p[1]]

In [None]:
differ[differ[0] == differ['y']].y.value_counts()

In [None]:
# scikit-learn's signature is classification_report(y_true, y_pred); passing the
# predictions first swaps precision and recall in the report.
print(classification_report(y_test, preds[0]))

In [None]:
# scikit-learn's signature is classification_report(y_true, y_pred); passing the
# predictions first swaps precision and recall in the report.
print(classification_report(y_test, preds[1]))

# Confusion Matrices

In [57]:
def print_confusion_matrices(models, preds, y, SOC_LEVEL):
    """Write one confusion-matrix CSV per model into ``confusion-matrices/``.

    Args:
        models: Pipelines; each file is named after the pipeline's step names
            joined with '-'.
        preds: Predicted labels, one entry per model, in the same order.
        y: True labels.
        SOC_LEVEL: Level passed to ``get_dictionary`` and used to pick the
            ``desc_soc{SOC_LEVEL}`` column for the row/column names.

    Returns:
        None. Files are written to ``confusion-matrices/<name>.csv``.
    """
    import os

    dot_dict = get_dictionary('', SOC_LEVEL)
    # First dictionary row per SOC code, ordered by code. Keeping 'soc' as a
    # plain column avoids depending on groupby.apply retaining the grouping
    # column in its result.
    un = dot_dict.dropna(subset=['soc']).drop_duplicates('soc').sort_values('soc')
    category_names = un['desc_soc{}'.format(SOC_LEVEL)]
    model_names = ['-'.join(m.named_steps.keys()) for m in models]

    # to_csv does not create missing directories.
    os.makedirs('confusion-matrices', exist_ok=True)
    for name, p in zip(model_names, preds):
        # `labels` must be passed by keyword in current scikit-learn.
        matrix = confusion_matrix(y, p, labels=un['soc'])
        df = pd.DataFrame(matrix,
                          index=category_names,
                          columns=category_names)
        filename = 'confusion-matrices/{}.csv'.format(name)
        # NOTE(review): index=False omits the row labels from the file; rows are
        # in the same order as the columns — confirm that is intended.
        df.to_csv(filename, index=False)

In [58]:
print_confusion_matrices([model], [preds], y_train, 3)

In [60]:
accuracy_score(preds, y_train)

0.38563508532846036

0.38563508532846036