# This is an extra notebook for the semantic matching. It exists so that large word-embedding files can be processed, which could not be done locally. This notebook is run on Google Colab.


In [0]:
import gensim.downloader as api

# Pretrained word-embedding model fetched through gensim's downloader API;
# it is used below for `term in model` membership tests and `model[term]` lookups.
model = api.load("glove-wiki-gigaword-300")
# Common prefix of the result files written by the semantic scoring cells.
filename = "results_glove_300"

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
from joblib import load

def projectTerms(gensim_model, terms):
    """Project every token list in ``terms`` into the embedding space.

    Args:
        gensim_model: Mapping-like embedding model supporting ``in`` and ``[]``.
        terms: Iterable of token lists (one list per query or table).

    Returns:
        A list with one entry per token list, each entry being the list of
        vectors produced by ``projectSubTerms`` for that token list.
    """
    projected = []
    for term_list in terms:
        projected.append(projectSubTerms(gensim_model, term_list))
    return projected

def projectSubTerms(gensim_model, terms):
    """Look up the embedding of each token that the model knows.

    Tokens missing from ``gensim_model`` are skipped, so the result can be
    shorter than ``terms``; the relative order of the kept tokens is preserved.

    Args:
        gensim_model: Mapping-like embedding model supporting ``in`` and ``[]``.
        terms: Iterable of tokens.

    Returns:
        List of ``gensim_model[token]`` values for the tokens found in the model.
    """
    vectors = []
    for term in terms:
        if term not in gensim_model:
            continue  # out-of-vocabulary token: no vector available
        vectors.append(gensim_model[term])
    return vectors

# NOTE: joblib files are pickle-based; only load files that come from a trusted source.
# Tokenised queries (one token list per query) and their embedding vectors.
query_terms = load('query_tokens.joblib')
semantic_query_terms = projectTerms(model, query_terms)

# Tokenised tables (one token list per table).
table_terms = load('table_tokens_better.joblib')
# De-duplicate each table's tokens. dict.fromkeys keeps first-occurrence order,
# whereas list(set(...)) yields an order that can change between interpreter
# runs for strings, which makes the order of the projected vectors
# non-reproducible.
table_terms_unique = [list(dict.fromkeys(termslist)) for termslist in table_terms]
semantic_table_terms = projectTerms(model, table_terms_unique)

In [0]:
from gensim.summarization.bm25 import BM25
# pandas is used below; import it here so this cell also works when the
# notebook is executed top to bottom.
import pandas as pd

# BM25 baseline built over the full (non-deduplicated) table token lists.
baseline = BM25(table_terms)
# Mean IDF over the BM25 vocabulary, passed to every get_score call below.
# NOTE(review): get_score's signature differs between gensim releases -- confirm
# that the installed version still accepts average_idf.
idf_values = [float(idf) for idf in baseline.idf.values()]
average_idf = sum(idf_values) / len(idf_values)

# One row per (query, table) pair to score.
raw = pd.read_csv(r'qrels.txt', delimiter='\t', names=["query-id", "zeros", "table-id", "rel"])

res = {
    'query-id': list(),
    'q0': list(),
    'document-id': list(),
    'rank': list(),
    'score': list(),
    'name': list()
}

for index, row in raw.iterrows():
    # Look the query id up by label: positional integer access (row[0]) on a
    # label-indexed Series is deprecated in pandas, and every other scoring
    # cell already uses the label.
    query_id = row["query-id"]
    res['query-id'].append(query_id)
    res['q0'].append('Q0')
    res['document-id'].append(row["table-id"])
    res['rank'].append(0)  # constant placeholder; no ranking is computed here
    # Query ids are treated as 1-based positions into query_terms; the qrels row
    # position doubles as the BM25 document index.
    # NOTE(review): this assumes table_terms is stored in qrels row order -- confirm.
    res['score'].append(baseline.get_score(query_terms[query_id - 1], index, average_idf))
    res['name'].append('STANDARD')

df_res = pd.DataFrame.from_dict(res)

# Space-separated, header-less output: query-id Q0 document-id rank score name.
df_res.to_csv('bm25.txt', sep=' ', index=False, header=False)

In [0]:
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Re-load the table tokens and join each table's tokens into one
# whitespace-separated string, the input format CountVectorizer expects.
table_terms2 = load('table_tokens_better.joblib')
table_sents = [' '.join(terms) for terms in table_terms2]

# NOTE(review): with default settings CountVectorizer lowercases the text and
# ignores single-character tokens, so such tokens will be absent from
# word2tfidf -- confirm this matches how the token lists were produced.
cv = CountVectorizer()

# convert text data into term-frequency matrix
data = cv.fit_transform(table_sents)

tfidf_transformer = TfidfTransformer()

# convert term-frequency matrix into tf-idf (fitting also learns the idf_ weights)
tfidf_matrix = tfidf_transformer.fit_transform(data)

# get_feature_names() was removed from newer scikit-learn releases in favour of
# get_feature_names_out(); support both so the cell runs on old and new versions.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Map each vocabulary word to its IDF weight (despite the name, the values are
# the transformer's idf_ entries, not per-document tf-idf scores).
word2tfidf = dict(zip(feature_names, tfidf_transformer.idf_))

In [0]:
from scipy import spatial
import numpy as np

def cos_early_sim(table_terms, table_tokens, query_terms, query_tokens):
    """Early-fusion similarity: cosine between weighted sums of term vectors.

    Each vector is scaled by the weight of its token in the module-level
    ``word2tfidf`` mapping (weight 1 when the token is not in the mapping);
    the scaled vectors are summed per side and the two sums are compared.

    Vectors and tokens are paired by position with ``zip``, so
    ``table_tokens[i]`` must be the token that produced ``table_terms[i]``
    (likewise for the query); surplus entries on the longer side are ignored.

    Args:
        table_terms: Sequence of embedding vectors for the table.
        table_tokens: Tokens corresponding position-wise to ``table_terms``.
        query_terms: Sequence of embedding vectors for the query.
        query_tokens: Tokens corresponding position-wise to ``query_terms``.

    Returns:
        ``1 - cosine_distance`` of the two weighted sums, or ``0`` when either
        side has no (vector, token) pair to contribute.
    """
    # Without at least one pair per side the sums are not vectors and the cosine
    # is undefined; score 0 like the late-fusion scorers do for empty input.
    if not (len(table_terms) and len(table_tokens)
            and len(query_terms) and len(query_tokens)):
        return 0

    query_weights = [word2tfidf.get(token, 1) for token in query_tokens]
    table_weights = [word2tfidf.get(token, 1) for token in table_tokens]

    query_terms_weighted = [np.multiply(vec, weight) for vec, weight in zip(query_terms, query_weights)]
    table_terms_weighted = [np.multiply(vec, weight) for vec, weight in zip(table_terms, table_weights)]

    # Collapse each side into a single vector.
    query_sum = np.sum(query_terms_weighted, axis=0)
    table_sum = np.sum(table_terms_weighted, axis=0)

    return 1 - spatial.distance.cosine(query_sum, table_sum)


def late_fusion(table_terms, query_terms):
    """Return the cosine similarity of every (table vector, query vector) pair.

    Pairs are enumerated with the table vectors in the outer loop and the query
    vectors in the inner loop; the result is empty when either input is empty.

    Args:
        table_terms: Iterable of table term vectors.
        query_terms: Iterable of query term vectors (iterated once per table vector).

    Returns:
        Flat list of ``1 - cosine_distance`` values, one per pair.
    """
    cosine_distance = spatial.distance.cosine
    return [
        1 - cosine_distance(table_vec, query_vec)
        for table_vec in table_terms
        for query_vec in query_terms
    ]
                             
def cos_late_max_sim(table_terms, query_terms):
    """Late-fusion score: the largest pairwise cosine similarity.

    Returns 0 when ``late_fusion`` yields no pairs.
    """
    pair_sims = late_fusion(table_terms, query_terms)
    if len(pair_sims) == 0:
        return 0
    return max(pair_sims)
                             
def cos_late_sum_sim(table_terms, query_terms):
    """Late-fusion score: the sum of all pairwise cosine similarities.

    Returns 0 when ``late_fusion`` yields no pairs.
    """
    pair_sims = late_fusion(table_terms, query_terms)
    if len(pair_sims) == 0:
        return 0
    return sum(pair_sims)
                             
def cos_late_avg_sim(table_terms, query_terms):
    """Late-fusion score: the mean of all pairwise cosine similarities.

    Returns 0 when ``late_fusion`` yields no pairs.
    """
    pair_sims = late_fusion(table_terms, query_terms)
    if len(pair_sims) == 0:
        return 0
    return np.mean(pair_sims)

# Late average

In [0]:
import pandas as pd


# Score every (query, table) pair listed in qrels.txt with the late-fusion
# average similarity and write the scores to a result file.
raw = pd.read_csv(r'qrels.txt', delimiter='\t', names=["query-id", "zeros", "table-id", "rel"])

query_ids, table_ids, scores = [], [], []
for index, row in raw.iterrows():
    query_id = row["query-id"]
    query_ids.append(query_id)
    table_ids.append(row["table-id"])
    # Query ids are used as 1-based positions; the qrels row position selects the table.
    scores.append(cos_late_avg_sim(semantic_table_terms[index], semantic_query_terms[query_id - 1]))

pair_count = len(scores)
res = {
    'query-id': query_ids,
    'q0': ['Q0'] * pair_count,
    'document-id': table_ids,
    'rank': [0] * pair_count,  # constant placeholder; no ranking is computed here
    'score': scores,
    'name': ['STANDARD'] * pair_count,
}

df_res = pd.DataFrame.from_dict(res)

# Space-separated, header-less output: query-id Q0 document-id rank score name.
df_res.to_csv(filename + '_late_avg.txt', sep=' ', index=False, header=False)

# Late sum


In [0]:
import pandas as pd


# Score every (query, table) pair listed in qrels.txt with the late-fusion
# sum similarity and write the scores to a result file.
raw = pd.read_csv(r'qrels.txt', delimiter='\t', names=["query-id", "zeros", "table-id", "rel"])

query_ids, table_ids, scores = [], [], []
for index, row in raw.iterrows():
    query_id = row["query-id"]
    query_ids.append(query_id)
    table_ids.append(row["table-id"])
    # Query ids are used as 1-based positions; the qrels row position selects the table.
    scores.append(cos_late_sum_sim(semantic_table_terms[index], semantic_query_terms[query_id - 1]))

pair_count = len(scores)
res = {
    'query-id': query_ids,
    'q0': ['Q0'] * pair_count,
    'document-id': table_ids,
    'rank': [0] * pair_count,  # constant placeholder; no ranking is computed here
    'score': scores,
    'name': ['STANDARD'] * pair_count,
}

df_res = pd.DataFrame.from_dict(res)

# Space-separated, header-less output: query-id Q0 document-id rank score name.
df_res.to_csv(filename + '_late_sum.txt', sep=' ', index=False, header=False)

# Late max

In [0]:
import pandas as pd


# Score every (query, table) pair listed in qrels.txt with the late-fusion
# maximum similarity and write the scores to a result file.
raw = pd.read_csv(r'qrels.txt', delimiter='\t', names=["query-id", "zeros", "table-id", "rel"])

query_ids, table_ids, scores = [], [], []
for index, row in raw.iterrows():
    query_id = row["query-id"]
    query_ids.append(query_id)
    table_ids.append(row["table-id"])
    # Query ids are used as 1-based positions; the qrels row position selects the table.
    scores.append(cos_late_max_sim(semantic_table_terms[index], semantic_query_terms[query_id - 1]))

pair_count = len(scores)
res = {
    'query-id': query_ids,
    'q0': ['Q0'] * pair_count,
    'document-id': table_ids,
    'rank': [0] * pair_count,  # constant placeholder; no ranking is computed here
    'score': scores,
    'name': ['STANDARD'] * pair_count,
}

df_res = pd.DataFrame.from_dict(res)

# Space-separated, header-less output: query-id Q0 document-id rank score name.
df_res.to_csv(filename + '_late_max.txt', sep=' ', index=False, header=False)

# Early

In [0]:
import pandas as pd


# Score every (query, table) pair listed in qrels.txt with the early-fusion
# similarity and write the scores to a result file.
raw = pd.read_csv(r'qrels.txt', delimiter='\t', names=["query-id", "zeros", "table-id", "rel"])

res = {
    'query-id': list(),
    'q0': list(),
    'document-id': list(),
    'rank': list(),
    'score': list(),
    'name': list()
}

for index, row in raw.iterrows():
    # Query ids are used as 1-based positions into the query lists.
    query_index = row["query-id"] - 1

    # cos_early_sim pairs vectors with tokens by position. The vectors were
    # built from table_terms_unique / query_terms with out-of-vocabulary tokens
    # dropped, so the token lists must be filtered the same way; passing the
    # raw token lists would attach weights to the wrong vectors.
    table_tokens_in_model = [token for token in table_terms_unique[index] if token in model]
    query_tokens_in_model = [token for token in query_terms[query_index] if token in model]

    res['query-id'].append(row["query-id"])
    res['q0'].append('Q0')
    res['document-id'].append(row["table-id"])
    res['rank'].append(0)  # constant placeholder; no ranking is computed here
    res['score'].append(cos_early_sim(
        semantic_table_terms[index],
        table_tokens_in_model,
        semantic_query_terms[query_index],
        query_tokens_in_model,
    ))
    res['name'].append('STANDARD')

df_res = pd.DataFrame.from_dict(res)

# Space-separated, header-less output: query-id Q0 document-id rank score name.
df_res.to_csv(filename + '_early.txt', sep=' ', index=False, header=False)