# Preprocessing

## Parsing

In [1]:
from pybtex.database import parse_file
from nltk.tokenize import RegexpTokenizer

import gensim

import numpy as np



In [2]:
bib_data = parse_file('data/anthology+abstracts.bib')

In [3]:
# check the last entry
list(bib_data.entries.keys())[-1]

'lieberman-etal-1965-automatic'

In [4]:
# number of entries in the anthology
len(list(bib_data.entries.keys()))

70190

In [5]:
# create raw .txt datasets for each of the past 5 years (2016-2021)
for k in bib_data.entries.keys():
    try:
        year = bib_data.entries[k].fields['year']
        abstract = bib_data.entries[k].fields['abstract']
        
        if year > '2015':
            a = open('data/datasets/abstracts.txt', 'a')
            a.write(abstract + '\n')
            a.close()
    
    # corrupted entries / entries without abstracts are skipped
    except (KeyError, UnicodeEncodeError): 
        pass

## Tokenization

In [6]:
# create list of abstracts
with open('data/datasets/abstracts.txt') as f:
    text = f.read()  
    abstracts = text.split('\n')

In [25]:
# example of abstract entry
len(abstracts)

21943

In [26]:
def tokenize(input_text):
    
    # makes text lowercase
    input_lower = input_text.lower()
    
    # only letters (remove numerical and special characters)
    tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
    
    # tokenize text
    tokens = tokenizer.tokenize(input_lower)
    
    return tokens

In [27]:
tokenized = [tokenize(a) for a in abstracts[1::20]]

In [28]:
# example of tokenized abstract
tokenized[42]

['we',
 'introduce',
 'a',
 'new',
 'dataset',
 'for',
 'question',
 'rewriting',
 'in',
 'conversational',
 'context',
 'qrecc',
 'which',
 'contains',
 'k',
 'conversations',
 'with',
 'k',
 'question',
 'answer',
 'pairs',
 'the',
 'task',
 'in',
 'qrecc',
 'is',
 'to',
 'find',
 'answers',
 'to',
 'conversational',
 'questions',
 'within',
 'a',
 'collection',
 'of',
 'm',
 'web',
 'pages',
 'split',
 'into',
 'm',
 'passages',
 'answers',
 'to',
 'questions',
 'in',
 'the',
 'same',
 'conversation',
 'may',
 'be',
 'distributed',
 'across',
 'several',
 'web',
 'pages',
 'qrecc',
 'provides',
 'annotations',
 'that',
 'allow',
 'us',
 'to',
 'train',
 'and',
 'evaluate',
 'individual',
 'subtasks',
 'of',
 'question',
 'rewriting',
 'passage',
 'retrieval',
 'and',
 'reading',
 'comprehension',
 'required',
 'for',
 'the',
 'end',
 'to',
 'end',
 'conversational',
 'question',
 'answering',
 'qa',
 'task',
 'we',
 'report',
 'the',
 'effectiveness',
 'of',
 'a',
 'strong',
 'basel

In [29]:
# total number of tokenized abstracts
len(tokenized)

1098

In [55]:
# single token
tokenized[4][2]

'describes'

In [30]:
# create single list of tokens
tokens = [word for abstract in tokenized for word in abstract]

In [31]:
# check single token
tokens[2021]

'input'

In [32]:
# total number of tokens
len(tokens)

148567

In [53]:
# save to file 
with open('data/tokens.txt', 'w') as f:
    f.write(str(tokens))

In [54]:
# save to file
with open('data/tokenized.txt', 'w') as f:
    f.write(str(tokenized))

## Word2Vec

In [42]:
max_sentence_len = 100

sent_split = zip(*[iter(tokens)] * max_sentence_len)
sentences = [list(s) for s in sent_split]

In [44]:
# adjust word embedding size according to paper findings

w2v_model = gensim.models.Word2Vec(sentences, vector_size=100, min_count=1, window=5, epochs=1)

# save model
w2v_model.save("w2v.model")

In [45]:
vocab_size, emdedding_size = w2v_model.wv.vectors.shape
print(vocab_size,emdedding_size)

9567 100


In [46]:
example_vector = w2v_model.wv['computer']
print(example_vector)

[-0.05340158  0.02590642  0.01503839  0.02531578  0.00643208 -0.06506889
  0.03091099  0.06639995 -0.0208962  -0.02296234 -0.02905319 -0.0500361
  0.01018123  0.02076851  0.01678655 -0.04894024  0.02409277 -0.04112661
 -0.01375142 -0.0844128   0.02889113  0.01567878  0.0188634  -0.02133239
  0.01525009  0.00229912 -0.02622324 -0.02251646 -0.04431868  0.00872653
  0.0675933  -0.0094258   0.02700188 -0.0586458  -0.037376    0.04670507
  0.00821736 -0.06266262 -0.0414429  -0.06861643 -0.01216607 -0.03952619
 -0.0149381   0.02402886  0.01373445  0.00380556 -0.0202361   0.00073795
  0.00867628  0.00627567  0.0277653  -0.05044303 -0.01215924 -0.02200374
 -0.03629857  0.0133618   0.02861061 -0.02262054 -0.04472381  0.00937136
  0.0161874   0.00357403 -0.00137439  0.02344876 -0.05799637  0.02631601
  0.03597176  0.05481634 -0.08044028  0.03455045  0.00136511  0.05876053
  0.05654617  0.01024706  0.0549088   0.02575218 -0.01714257  0.01101278
 -0.03688185  0.01040825 -0.02903974 -0.01382768 -0.

In [47]:
example_similar = w2v_model.wv.most_similar('language', topn=10) 
print(example_similar)

[('for', 0.9998010396957397), ('of', 0.9997996091842651), ('and', 0.9997888207435608), ('an', 0.9997869729995728), ('the', 0.9997761845588684), ('a', 0.9997751116752625), ('from', 0.9997710585594177), ('with', 0.9997611045837402), ('to', 0.999756395816803), ('using', 0.9997522830963135)]


## Training data preparation

In [49]:
x = np.zeros([len(sentences), max_sentence_len], dtype=np.int32)
y = np.zeros([len(sentences)], dtype=np.int32)

for i, sentence in enumerate(sentences):
    for t, word in enumerate(sentence[:-1]):
        x[i, t] = word_to_id(word)
    y[i] = word_to_id(sentence[-1])

print('x shape:', x.shape)
print('y shape:', y.shape)

x shape: (1485, 100)
y shape: (1485,)


In [51]:
# save to file
np.save('data/x.npy', x)
np.save('data/y.npy', y)