# Preprocessing

## Parsing

In [1]:
from pybtex.database import parse_file
from nltk.tokenize import RegexpTokenizer

import gensim

import numpy as np



In [2]:
bib_data = parse_file('data/anthology+abstracts.bib')

In [3]:
# check the last entry
list(bib_data.entries.keys())[-1]

'lieberman-etal-1965-automatic'

In [4]:
# number of entries in the anthology
len(list(bib_data.entries.keys()))

70190

In [5]:
# create raw .txt datasets for each of the past 5 years (2016-2021)
for k in bib_data.entries.keys():
    try:
        year = bib_data.entries[k].fields['year']
        abstract = bib_data.entries[k].fields['abstract']
        
        if year > '2015':
            a = open('data/datasets/abstracts.txt', 'a')
            a.write(abstract + '\n')
            a.close()
    
    # corrupted entries / entries without abstracts are skipped
    except (KeyError, UnicodeEncodeError): 
        pass

## Tokenization

In [63]:
# create list of abstracts
with open('data/datasets/abstracts.txt') as f:
    text = f.read()  
    abstracts = text.split('\n')

In [64]:
# example of abstract entry
len(abstracts)

21943

In [65]:
def tokenize(input_text):
    
    # makes text lowercase
    input_lower = input_text.lower()
    
    # only letters (remove numerical and special characters)
    tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
    
    # tokenize text
    tokens = tokenizer.tokenize(input_lower)
    
    return tokens

In [66]:
tokenized = [tokenize(a) for a in abstracts[1::20]]

In [67]:
# example of tokenized abstract
tokenized[42]

['we',
 'introduce',
 'a',
 'new',
 'dataset',
 'for',
 'question',
 'rewriting',
 'in',
 'conversational',
 'context',
 'qrecc',
 'which',
 'contains',
 'k',
 'conversations',
 'with',
 'k',
 'question',
 'answer',
 'pairs',
 'the',
 'task',
 'in',
 'qrecc',
 'is',
 'to',
 'find',
 'answers',
 'to',
 'conversational',
 'questions',
 'within',
 'a',
 'collection',
 'of',
 'm',
 'web',
 'pages',
 'split',
 'into',
 'm',
 'passages',
 'answers',
 'to',
 'questions',
 'in',
 'the',
 'same',
 'conversation',
 'may',
 'be',
 'distributed',
 'across',
 'several',
 'web',
 'pages',
 'qrecc',
 'provides',
 'annotations',
 'that',
 'allow',
 'us',
 'to',
 'train',
 'and',
 'evaluate',
 'individual',
 'subtasks',
 'of',
 'question',
 'rewriting',
 'passage',
 'retrieval',
 'and',
 'reading',
 'comprehension',
 'required',
 'for',
 'the',
 'end',
 'to',
 'end',
 'conversational',
 'question',
 'answering',
 'qa',
 'task',
 'we',
 'report',
 'the',
 'effectiveness',
 'of',
 'a',
 'strong',
 'basel

In [68]:
# total number of tokenized abstracts
len(tokenized)

1098

In [69]:
# single token
tokenized[4][2]

'describes'

In [70]:
# create single list of tokens
tokens = [word for abstract in tokenized for word in abstract]

In [71]:
# check single token
tokens[2021]

'input'

In [72]:
# total number of tokens
len(tokens)

148567

In [73]:
# save to file 
with open('data/tokens.txt', 'w') as f:
    f.write(str(tokens))

In [74]:
# save to file
with open('data/tokenized.txt', 'w') as f:
    f.write(str(tokenized))

## Training data preparation

In [197]:
# TODO: adjust seq_len to avg of abstract length
seq_len = 101

In [189]:
def create_seq(tokens, seq_len):
    for i in range(0, len(tokens), seq_len): 
        yield tokens[i:i + seq_len]
        
seqs = list(create_seq(tokens, seq_len))

In [190]:
# create inputs and targets (x and y)
x = []
y = []

for s in seqs:
    x.append(" ".join(s[:-1]))
    y.append(" ".join(s[1:]))
    
len(x), len(y)

(1471, 1471)

In [191]:
def word_to_id(word):
    return w2v_model.wv.key_to_index[word]

def id_to_word(id):
    return w2v_model.wv.index_to_key[id]

word_to_id('nlp'), id_to_word(42)

(111, 'neural')

In [192]:
def get_integer_seq(seq):
    return [word_to_id(w) for w in seq.split()]

# convert text sequences to integer sequences
x = [get_integer_seq(i) for i in x]
y = [get_integer_seq(i) for i in y]

len(x),len(y)

(1471, 1471)

In [193]:
# check length of last sequence, remove if not == seq_len
def check_len(seq):
    if len(seq[-1]) != seq_len-1:
        del seq[-1]
    return seq
    
x = check_len(x_all)
y = check_len(y_all)

len(x), len(y)

(1470, 1470)

In [194]:
# convert lists to numpy arrays
x = np.array(x)
y = np.array(y)

In [195]:
y.shape

(1470, 100)

In [196]:
# save to file
np.save('data/x.npy', x)
np.save('data/y.npy', y)

## Word2Vec

In [132]:
# TODO: adjust word embedding size according to paper findings
w2v_model = gensim.models.Word2Vec(seqs, vector_size=100, min_count=1, window=5, epochs=1)

# save model
w2v_model.save("w2v.model")

In [133]:
vocab_size, emdedding_size = w2v_model.wv.vectors.shape
print(vocab_size,emdedding_size)

9569 100


In [134]:
example_vector = w2v_model.wv['computer']
print(example_vector)

[-0.02636507  0.02693588  0.01410158  0.00957402  0.00407673 -0.08141849
  0.02094897  0.05846748 -0.03385695 -0.00771476 -0.02724805 -0.04349714
  0.00375009  0.02974946  0.00262331 -0.04681301  0.0263958  -0.03452348
 -0.02680098 -0.06978898  0.02214226  0.01627002  0.02066958 -0.02097843
 -0.00957535  0.00463436 -0.01632814 -0.01811325 -0.03827393  0.00270969
  0.04532893 -0.00361596  0.02638591 -0.04051479 -0.01313958  0.02572526
  0.00330814 -0.03875148 -0.01706738 -0.06009709  0.00505941 -0.02165239
 -0.01808534  0.01643691  0.01767226 -0.00580908 -0.0133606   0.00676859
  0.00964711  0.00359484  0.02938396 -0.04573927 -0.00615245 -0.01930001
 -0.02863558  0.00734415  0.01987244 -0.00836672 -0.05088564  0.01861229
 -0.00103728 -0.00074192  0.00289908  0.03447132 -0.06189742  0.04086243
  0.00753767  0.04772544 -0.05605566  0.02583542  0.00143682  0.04337582
  0.04133814 -0.01416804  0.04785234  0.01296554 -0.01561027  0.0119024
 -0.03500389 -0.00702313 -0.02259577 -0.00241676 -0.

In [135]:
example_similar = w2v_model.wv.most_similar('language', topn=10) 
print(example_similar)

[('for', 0.9997780323028564), ('an', 0.999767005443573), ('of', 0.9997662901878357), ('the', 0.9997624158859253), ('and', 0.999758780002594), ('a', 0.999756932258606), ('from', 0.9997407793998718), ('with', 0.9997388124465942), ('to', 0.9997334480285645), ('in', 0.9997265338897705)]
