# Siamese Recurrent Neural Network

The general architecture of the model is based on [this tutorial](https://medium.com/mlreview/implementing-malstm-on-kaggles-quora-question-pairs-competition-8b31b0b16a07).

The data we'll be using is a sample of reported job titles, each labelled with the occupation group it belongs to under the [Standard Occupational Classification](https://www.bls.gov/soc/) (SOC). The data is in a tab-separated (TSV) file and can be downloaded [here](https://www.onetcenter.org/dl_files/database/db_20_1_text/Sample%20of%20Reported%20Titles.txt).

Further reading:

- https://sorenbouma.github.io/blog/oneshot/
- https://deepmind.com/research/publications/one-shot-learning-memory-augmented-neural-networks/
- https://medium.com/mlreview/implementing-malstm-on-kaggles-quora-question-pairs-competition-8b31b0b16a07

In [15]:
from io import StringIO
import requests
import pandas as pd

file_url = 'https://www.onetcenter.org/dl_files/database/db_20_1_text/Sample%20of%20Reported%20Titles.txt'

# Download the tab-separated O*NET sample and fail loudly on an HTTP error
# rather than trying to parse an error page as data.
response = requests.get(file_url)
response.raise_for_status()
csv = StringIO(response.text)

# Keep every row: the pair-building cell needs titles from many different
# SOC codes, so the frame must not be truncated to its first rows here.
df = pd.read_csv(csv, sep='\t')
df.head()

Unnamed: 0,O*NET-SOC Code,Reported Job Title,Shown in My Next Move
0,11-1011.00,Chief Diversity Officer (CDO),N
1,11-1011.00,Chief Executive Officer (CEO),Y
2,11-1011.00,Chief Financial Officer (CFO),Y
3,11-1011.00,Chief Nursing Officer,N
4,11-1011.00,Chief Operating Officer (COO),N


## Build dataset

Let's create positive samples from pairs of job titles that share the same SOC code, and negative samples from pairs of job titles drawn from different SOC codes.

In [2]:
import itertools

# numpy is imported here because this cell runs before the modelling cells
# that import it further down; without it np.random.choice is undefined.
import numpy as np

jobs_left = []
jobs_right = []
target = []

soc_codes = df['O*NET-SOC Code'].unique()
for code in soc_codes:
    # Compute the membership mask once and reuse it for both selections.
    same_code = df['O*NET-SOC Code'] == code
    similar_jobs = df[same_code]['Reported Job Title']
    other_jobs = df[~same_code]['Reported Job Title']

    # Positive samples: every unordered pair of titles sharing this code.
    positive_pairs = list(itertools.combinations(similar_jobs, 2))
    jobs_left.extend(left for left, _ in positive_pairs)
    jobs_right.extend(right for _, right in positive_pairs)
    target.extend([1.] * len(positive_pairs))

    # Negative samples: one per positive pair, matching a random title from
    # this code with a random title from any other code, so the two classes
    # stay balanced.
    for _ in range(len(positive_pairs)):
        jobs_left.append(np.random.choice(similar_jobs))
        jobs_right.append(np.random.choice(other_jobs))
        target.append(0.)

dataset = pd.DataFrame({
        'job_left': jobs_left,
        'job_right': jobs_right,
        'target': target
    }).sample(frac=1)  # Shuffle dataset

dataset.sample(5)

Unnamed: 0,job_left,job_right,target
33164,Industrial Rehabilitation Consultant,Staff Occupational Therapist,1.0
12708,Manufacturing Director,Top Coater,0.0
41398,Lifeguard,Water Safety Instructor (WSI),1.0
70360,Issuing Operator,Stock Preparation Operator (Stock Prep Operator),1.0
21491,Certified Shorthand Reporter (CSR),Deposition Reporter,1.0


In [3]:
from sklearn.model_selection import train_test_split

# Split the shuffled pairs into a training and a validation frame using
# train_test_split's default proportions. No random_state is passed, so the
# split differs from run to run.
df_train, df_val = train_test_split(dataset)

## Modelling

In [5]:
import re
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from zeugma import TextsToSequences, Padder, ItemSelector, EmbeddingTransformer

# Passed to Padder as max_length and used as the model's input length.
maxlen = 7
# Passed to TextsToSequences as num_words and used to cap the embedding matrix.
vocab_size = 10000

def preprocess_job_titles(job_titles):
    """Return a list of clean job titles.

    Each title has its parenthesised parts (e.g. "(CEO)") removed, is
    lower-cased and is stripped of leading/trailing whitespace.

    Args:
        job_titles: iterable of raw job title strings.

    Returns:
        A list with one cleaned string per input title, in the same order.
    """
    def strip_parentheticals(text):
        """ Remove every balanced parenthesised group from text """
        # Remove innermost groups repeatedly. A single greedy r'\(.*\)' would
        # also delete the real words sitting between two separate groups,
        # e.g. "Nurse (RN) Manager (Night)" would lose "Manager".
        previous = None
        while previous != text:
            previous = text
            text = re.sub(r'\([^()]*\)', '', text)
        return text

    def preprocess_job_title(raw_job_title):
        """ Clean a single job title"""
        job_title = strip_parentheticals(raw_job_title)
        return job_title.lower().strip()

    return [preprocess_job_title(jt) for jt in job_titles]
    
# Chain the three preprocessing steps: title cleaning, TextsToSequences
# (capped at vocab_size words) and Padder (max_length=maxlen).
pipeline = make_pipeline(
    FunctionTransformer(func=preprocess_job_titles, validate=False),
    TextsToSequences(num_words=vocab_size),
    Padder(max_length=maxlen),
)

# Fit on the titles from both columns of the training pairs.
pipeline.fit([*df_train['job_left'], *df_train['job_right']])

Pipeline(memory=None,
     steps=[('functiontransformer', FunctionTransformer(accept_sparse=False,
          func=<function preprocess_job_titles at 0x117690ae8>,
          inv_kw_args=None, inverse_func=None, kw_args=None,
          pass_y='deprecated', validate=False)), ('textstosequences', TextsToSequences()), ('padder', Padder(max_length=7))])

In [8]:
# Run each side of the training pairs through the fitted pipeline; the model
# takes the two results as a [left, right] list.
X_left_train, X_right_train = (
    pipeline.transform(df_train[side]) for side in ('job_left', 'job_right')
)
seq_train = [X_left_train, X_right_train]

# Pair labels as built in the dataset cell: 1.0 for same SOC code, 0.0 otherwise.
y_train = df_train['target'].values

In [9]:
import numpy as np
from keras.models import Model
from keras.layers import concatenate, Flatten
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout

glove = EmbeddingTransformer('glove')
# Read the embedding width off the vector returned for a common word.
EMBEDDING_DIM = len(glove.model.get('the'))

# word_index attribute of the fitted TextsToSequences pipeline step.
word_index = pipeline.named_steps['textstosequences'].word_index

def create_embedding_matrix(vocab_size, word_index, embedding_dim=EMBEDDING_DIM):
    """ Build the weight matrix used to initialise the embedding layer.

    Row ``i`` holds the pre-trained vector of the word whose index in
    ``word_index`` is ``i``. Rows for words with no pre-trained vector
    stay all-zeros, and words whose index is ``vocab_size`` or above are
    skipped.
    """
    # One extra row on top of the number of words kept.
    n_rows = min(vocab_size, len(word_index)) + 1
    matrix = np.zeros((n_rows, embedding_dim))

    kept_words = ((w, idx) for w, idx in word_index.items() if idx < vocab_size)
    for w, idx in kept_words:
        vector = glove.model.get(w)
        if vector is None:
            # No pre-trained vector for this word: leave its row as zeros.
            continue
        matrix[idx] = vector
    return matrix

# Weight matrix for the frozen Embedding layer, built from the vocabulary
# of the fitted pipeline.
embedding_matrix = create_embedding_matrix(vocab_size, word_index)

In [12]:
from keras.layers import LSTM, Bidirectional
from keras import Model, Sequential
from keras.layers import Input, Dense, Dropout, Lambda, Subtract, Merge
from keras import backend as K

def exponent_neg_manhattan_distance(arms_difference):
    """ Return exp(-L1 norm) of the difference between the two arms.

    The absolute values of ``arms_difference`` are summed over axis 1
    (keeping that axis) and the negated sum is exponentiated, so identical
    arms give 1 and the value shrinks towards 0 as they move apart.
    """
    l1_distance = K.sum(K.abs(arms_difference), axis=1, keepdims=True)
    return K.exp(-l1_distance)

def siamese_lstm(maxlen=maxlen):
    """ Build and compile the siamese LSTM network.

    Both inputs are passed through the same encoder (frozen pre-trained
    embeddings followed by two stacked bidirectional LSTMs). The model
    output is exp(-L1 distance) between the two encodings, and the model
    is compiled with binary cross-entropy loss and the Adam optimizer.

    Args:
        maxlen: length of each input sequence.

    Returns:
        The compiled Keras model taking ``[left, right]`` inputs.
    """
    left_input = Input((maxlen,), name='left_input')
    right_input = Input((maxlen,), name='right_input')

    # The pre-trained vectors are loaded as the embedding weights and kept
    # fixed during training (trainable=False).
    encoder = Sequential([
        Embedding(len(embedding_matrix),
                  EMBEDDING_DIM,
                  weights=[embedding_matrix],
                  input_length=maxlen,
                  trainable=False,
                  name='embeddings'),
        Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2,
                           return_sequences=True)),
        Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)),
    ])

    # The same encoder instance is applied to both inputs.
    encoded_left = encoder(left_input)
    encoded_right = encoder(right_input)

    arms_difference = Subtract(name='subtract')([encoded_left, encoded_right])
    similarity = Lambda(exponent_neg_manhattan_distance,
                        name='masltsm_distance')(arms_difference)

    model = Model(inputs=[left_input, right_input], outputs=similarity)
    model.compile(loss="binary_crossentropy", optimizer='adam',
                  metrics=['accuracy'])
    return model

# Build the model.
# NOTE(review): this rebinds the name `siamese_lstm` from the factory function
# to the model it returns, so the factory can no longer be called by that name
# afterwards. Re-run the cell that defines the function before re-running
# this one.
siamese_lstm = siamese_lstm()

# Print the layer-by-layer overview of the model.
siamese_lstm.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
left_input (InputLayer)         (None, 7)            0                                            
__________________________________________________________________________________________________
right_input (InputLayer)        (None, 7)            0                                            
__________________________________________________________________________________________________
sequential_3 (Sequential)       (None, 128)          1176396     left_input[0][0]                 
                                                                 right_input[0][0]                
__________________________________________________________________________________________________
subtract (Subtract)             (None, 128)          0           sequential_3[1][0]               
          

In [13]:
# Train for 10 epochs; validation_split=0.2 holds out 20% of the training
# arrays for per-epoch validation (df_val is kept aside for the final check).
siamese_lstm.fit(seq_train, y_train, validation_split=0.2, epochs=10,)

Train on 46272 samples, validate on 11569 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x124f12ef0>

In [15]:
# Transform both sides of the validation pairs with the pipeline that was
# fitted on the training titles.
X_left_val, X_right_val = (
    pipeline.transform(df_val[side]) for side in ('job_left', 'job_right')
)
seq_val = [X_left_val, X_right_val]

y_val = df_val['target'].values

# Model scores for each validation pair, and the same scores rounded to the
# nearest integer.
y_prob = siamese_lstm.predict(seq_val)
y_pred = np.round(y_prob)

In [16]:
from sklearn.metrics import accuracy_score, roc_auc_score
# Area under the ROC curve on the validation pairs, computed from the raw
# model scores rather than the rounded predictions.
roc_auc_score(y_val, y_prob)

0.9411525077440928