# Package imports

In [1]:
import nltk
import numpy as np
import pandas as pd
import json
from joblib import dump, load
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
import re
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import get_tmpfile

unable to import 'smart_open.gcs', disabling that module


# Load in word2vec model

In [24]:
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec 
# Restore the Word2Vec model stored at data/wemb_models/word2vec/word2vec.model.
# NOTE(review): `model` is rebound to the GloVe KeyedVectors in the
# "Load example model" cell further down, and `wordvectors` is not read by any
# visible cell — confirm which embedding the projection step is meant to use.
model = Word2Vec.load("data/wemb_models/word2vec/word2vec.model")
# Keep a direct handle on the loaded model's word vectors (its `.wv` attribute).
wordvectors = model.wv

unable to import 'smart_open.gcs', disabling that module


# Load in data

In [2]:
# Relevance judgements: tab-separated, no header row. Column 0 supplies the
# query ids and column 2 the table ids, one entry per judged pair.
qrels = pd.read_csv(r'data/queries/qrels.txt', sep='\t', header=None)
query_ids, table_ids = qrels[0].to_numpy(), qrels[2].to_numpy()

# Each query line is cut after its first space, dropping the leading token
# (presumably the query id — TODO confirm). A line without any space is kept
# whole, because str.find returns -1 and the slice then starts at 0.
queries = pd.read_csv(r'data/queries/queries.txt', header=None)
query_texts = [line[line.find(' ') + 1:] for line in queries[0]]
# Stored as a single-column 2-D array, so each row is read back as query[0].
queries = pd.DataFrame(query_texts).to_numpy()

# Raw word extraction

In [3]:
from nltk.corpus import stopwords
# spaCy English pipeline; processString runs every input string through it.
nlp = en_core_web_sm.load()
# NLTK English stop words, used by processString as a second filter on top of
# spaCy's own is_stop flag.
# NOTE(review): presumably needs the NLTK "stopwords" corpus to be downloaded
# beforehand — confirm for fresh environments.
sw = stopwords.words("english")

In [4]:
def processString(string):
    """Tokenize `string` and return its lower-cased content words.

    The text is parsed with the module-level spaCy pipeline `nlp`. Tokens that
    spaCy flags as stop words or punctuation are skipped; the remaining tokens
    are lower-cased and dropped if they appear in the module-level stop-word
    list `sw`. Order and duplicates of the surviving tokens are preserved.

    Returns a list of str.
    """
    kept = []
    for token in nlp(string):
        if token.is_stop or token.is_punct:
            continue
        lowered = token.text.lower()
        if lowered not in sw:
            kept.append(lowered)
    return kept

def processHeaders(headers):
    """Tokenize a collection of table header strings into one flat token list.

    A header containing bracketed text with a pipe (`[...|...]`) contributes
    only the part captured after the first pipe inside the brackets
    (presumably the display label of a link — TODO confirm). Any other header
    is used as-is. Each resulting string goes through processString and the
    tokens are concatenated in header order.

    Returns a list of str.
    """
    bracket_pattern = re.compile(r'\[[^\|]*\|(.*)\]')
    header_tokens = []
    for header in headers:
        found = bracket_pattern.search(header)
        text = header if found is None else found.group(1)
        header_tokens.extend(processString(text))
    return header_tokens

# Process each needed table such that we have the needed tokens per table

In [5]:
def _table_file_id(table_id):
    """Return the id of the re_tables-<id>.json file used for `table_id`.

    `table_id` is split on '-'; the second field selects the file. When the
    third field is >= 1000 the file id is decremented by one.
    """
    fields = table_id.split('-')
    file_id = fields[1]
    if int(fields[2]) >= 1000:
        # NOTE(review): str(int(...) - 1) drops any zero padding of the file
        # id (e.g. '0012' -> '11') — confirm this matches the file names.
        file_id = str(int(file_id) - 1)
    return file_id


# Group the qrels row positions by source file so that every JSON file is read
# and parsed once, instead of once per judged table.
positions_by_file = {}
for position, table_id in enumerate(table_ids):
    positions_by_file.setdefault(_table_file_id(table_id), []).append(position)

# Pre-size the result so tokens land at the same index as their table id,
# keeping word_feat aligned with table_ids / query_ids.
word_feat = [None] * len(table_ids)

for table_file_id, positions in positions_by_file.items():
    tables = pd.read_json(r'data/tables/re_tables-' + table_file_id + '.json')
    for position in positions:
        table = tables[table_ids[position]]
        # Page title, section title and caption only; headers are left out,
        # hence the "noheaders" artifact name.
        word_feat[position] = (
            processString(table.pgTitle)
            + processString(table.secondTitle)
            + processString(table.caption)
        )

dump(word_feat, 'table_tokens_noheaders.joblib')

['table_tokens_noheaders.joblib']

# Process each needed query such that we have the needed tokens per query

In [47]:
# NOTE: this rebinds `word_feat`, which the table cell above also uses.
word_feat = []

for query in queries:
    # Each row of `queries` is a one-element array holding the query text.
    tokens = processString(query[0])
    # De-duplicate while keeping first-occurrence order. list(set(...)) would
    # order the tokens by randomized string hashes, making the dumped file
    # differ from one kernel run to the next.
    word_feat.append(list(dict.fromkeys(tokens)))

dump(word_feat, 'query_tokens.joblib')

['query_tokens.joblib']

# Load example model

In [64]:
# Convert the GloVe text file into word2vec text format, written to a
# temporary file, so it can be loaded as KeyedVectors.
tmp_file = get_tmpfile("test.txt")
# The return value of the conversion is not needed; it is bound to `_`.
_ = glove2word2vec("data/wemb_models/glove.6B/glove.6B.50d.txt",tmp_file)
# NOTE: this rebinds `model`, replacing the Word2Vec model loaded in the
# "Load in word2vec model" cell; later cells therefore use the GloVe vectors.
model = KeyedVectors.load_word2vec_format(tmp_file)

In [65]:
# Sanity check of the loaded embeddings: the ten nearest neighbours of "house".
nearest_to_house = model.most_similar(positive=['house'], topn=10)
print(nearest_to_house)

[('office', 0.7581614851951599), ('senate', 0.7204986214637756), ('room', 0.7149738669395447), ('houses', 0.6888046264648438), ('capitol', 0.6851759552955627), ('building', 0.684728741645813), ('home', 0.6720309853553772), ('clinton', 0.6707026958465576), ('congressional', 0.669257640838623), ('mansion', 0.665092408657074)]


# Convert every term for both queries and tables to semantic space

In [76]:
def projectTerms(gensim_model, terms):
    """Project a list of term lists into the embedding space.

    `terms` is an iterable of term lists (one per query or table). Returns a
    list with one entry per input list, each entry being the vectors produced
    by projectSubTerms for that list.
    """
    projected = []
    for term_list in terms:
        projected.append(projectSubTerms(gensim_model, term_list))
    return projected

def projectSubTerms(gensim_model, terms):
    """Look up the vector of every term that `gensim_model` contains.

    Terms for which `term in gensim_model` is false are skipped silently, so
    the result can be shorter than `terms` (or empty). Order and duplicates of
    the remaining terms are preserved.
    """
    vectors = []
    for term in terms:
        if term in gensim_model:
            vectors.append(gensim_model[term])
    return vectors

# Read both persisted token lists first, then embed them with the same model.
# NOTE(review): 'table_tokens.joblib' is not written by any visible cell (the
# table cell dumps 'table_tokens_noheaders.joblib') — confirm the file exists
# and is the intended input.
query_terms = load('query_tokens.joblib')
table_terms = load('table_tokens.joblib')

# One list of vectors per query / per table; terms unknown to the model are
# dropped by projectSubTerms.
semantic_query_terms = projectTerms(model, query_terms)
semantic_table_terms = projectTerms(model, table_terms)

['lending',
 'refinancing',
 'rates',
 'main',
 'date',
 'eurozone',
 'operations',
 'marginal',
 'facility',
 'deposit',
 'interest']

# Similarity metrics

In [None]:
def cos_early_sim(table_terms, query_terms):
    """Early-fusion similarity: cosine between the two term-vector centroids.

    Parameters
    ----------
    table_terms, query_terms : sequence of 1-D vectors of equal length
        Embedding vectors of the table terms and of the query terms.

    Returns
    -------
    float
        Cosine similarity of the mean table vector and the mean query vector.
        0.0 is returned when either side has no vectors or a centroid has
        zero norm, i.e. the missing side is treated like a zero vector.

    The cosine is computed with NumPy directly: sklearn's cosine_similarity
    only accepts 2-D inputs and raises on the 1-D centroids produced here.
    """
    if len(table_terms) == 0 or len(query_terms) == 0:
        return 0.0
    table_centroid = np.mean(np.asarray(table_terms, dtype=float), axis=0)
    query_centroid = np.mean(np.asarray(query_terms, dtype=float), axis=0)
    norm_product = np.linalg.norm(table_centroid) * np.linalg.norm(query_centroid)
    if norm_product == 0.0:
        return 0.0
    return float(np.dot(table_centroid, query_centroid) / norm_product)
                             
def _pairwise_cosine(table_terms, query_terms):
    """Return the (n_table, n_query) matrix of cosine similarities.

    Both arguments are sequences of 1-D vectors of equal length. An empty
    (0, 0) array is returned when either sequence is empty. A zero-norm
    vector yields similarity 0 with everything instead of NaN.
    """
    if len(table_terms) == 0 or len(query_terms) == 0:
        return np.empty((0, 0))
    table_matrix = np.asarray(table_terms, dtype=float)
    query_matrix = np.asarray(query_terms, dtype=float)
    table_norms = np.linalg.norm(table_matrix, axis=1, keepdims=True)
    query_norms = np.linalg.norm(query_matrix, axis=1, keepdims=True)
    # Dividing a zero vector by 1 keeps it zero, so its dot products stay 0.
    table_norms[table_norms == 0.0] = 1.0
    query_norms[query_norms == 0.0] = 1.0
    return np.dot(table_matrix / table_norms, (query_matrix / query_norms).T)


def cos_late_max_sim(table_terms, query_terms):
    """Late-fusion similarity: maximum cosine over all (table term, query term) pairs.

    Both arguments are sequences of 1-D embedding vectors. Returns a float;
    0.0 when either sequence is empty.
    """
    sims = _pairwise_cosine(table_terms, query_terms)
    return float(sims.max()) if sims.size else 0.0


def cos_late_min_sim(table_terms, query_terms):
    """Late-fusion similarity: minimum cosine over all (table term, query term) pairs.

    Both arguments are sequences of 1-D embedding vectors. Returns a float;
    0.0 when either sequence is empty.
    """
    sims = _pairwise_cosine(table_terms, query_terms)
    return float(sims.min()) if sims.size else 0.0


def cos_late_avg_sim(table_terms, query_terms):
    """Late-fusion similarity: mean cosine over all (table term, query term) pairs.

    Both arguments are sequences of 1-D embedding vectors. Returns a float;
    0.0 when either sequence is empty.
    """
    sims = _pairwise_cosine(table_terms, query_terms)
    # NumPy has no `avg`; the arithmetic mean is `mean`.
    return float(sims.mean()) if sims.size else 0.0