# Preprocessing

In [1]:
from pybtex.database import parse_file

from nltk.tokenize import RegexpTokenizer

import gensim

import numpy as np



## Parsing

In [2]:
bib_data = parse_file('data/anthology+abstracts.bib')

In [3]:
# check the last entry
list(bib_data.entries.keys())[-1]

'lieberman-etal-1965-automatic'

In [4]:
# number of entries in the anthology
len(list(bib_data.entries.keys()))

70190

In [5]:
# create raw .txt datasets for each of the past 5 years (2016-2021)
for k in bib_data.entries.keys():
    try:
        year = bib_data.entries[k].fields['year']
        abstract = bib_data.entries[k].fields['abstract']
        
        if year > '2015':
            a = open('data/datasets/abstracts.txt', 'a')
            a.write(abstract + '\n')
            a.close()
    
    # entries with non-Unicode characters / entries without abstracts are skipped
    except (KeyError, UnicodeEncodeError): 
        pass

## Tokenization

In [6]:
# create list of abstracts
with open('data/datasets/abstracts.txt') as f:
    text = f.read()  
    abstracts = text.split('\n')

In [7]:
# total number of abstracts
len(abstracts)

87769

In [8]:
def tokenize(input_text):
    
    # makes text lowercase
    input_lower = input_text.lower()
    
    # only letters (remove numerical and special characters)
    tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
    
    # tokenize text
    tokens = tokenizer.tokenize(input_lower)
    
    return tokens

In [9]:
# tokenize 20% of abstracts
tokenized = [tokenize(a) for a in abstracts[::5]] 

In [10]:
# example of tokenized abstract
tokenized[42]

['we',
 'present',
 'a',
 'scaffolded',
 'discovery',
 'learning',
 'approach',
 'to',
 'introducing',
 'concepts',
 'in',
 'a',
 'natural',
 'language',
 'processing',
 'course',
 'aimed',
 'at',
 'computer',
 'science',
 'students',
 'at',
 'liberal',
 'arts',
 'institutions',
 'we',
 'describe',
 'some',
 'of',
 'the',
 'objectives',
 'of',
 'this',
 'approach',
 'as',
 'well',
 'as',
 'presenting',
 'specific',
 'ways',
 'that',
 'four',
 'of',
 'our',
 'discovery',
 'based',
 'assignments',
 'combine',
 'specific',
 'natural',
 'language',
 'processing',
 'concepts',
 'with',
 'broader',
 'analytic',
 'skills',
 'we',
 'argue',
 'this',
 'approach',
 'helps',
 'prepare',
 'students',
 'for',
 'many',
 'possible',
 'future',
 'paths',
 'involving',
 'both',
 'application',
 'and',
 'innovation',
 'of',
 'nlp',
 'technology',
 'by',
 'emphasizing',
 'experimental',
 'data',
 'navigation',
 'experiment',
 'design',
 'and',
 'awareness',
 'of',
 'the',
 'complexities',
 'and',
 'chall

In [11]:
# total number of tokenized abstracts
len(tokenized)

17554

In [12]:
# example of single token
tokenized[4][2]

'the'

In [13]:
# create single list of tokens
tokens = [word for abstract in tokenized for word in abstract]

In [14]:
# check single token
tokens[2021]

'than'

In [15]:
# total number of tokens
len(tokens)

2362213

In [16]:
# save tokens to file 
with open('data/tokens.txt', 'w') as f:
    f.write(str(tokens))

In [17]:
# save tokenized to file
with open('data/tokenized.txt', 'w') as f:
    f.write(str(tokenized))

## Word embeddings

In [18]:
# average words per tokenized abstract
abs_avg_len = sum(map(len, tokenized))/float(len(tokenized))
abs_avg_len

134.56836048763813

In [19]:
# sequence length is the average length of tokenized abstract in corpus
seq_len = int(abs_avg_len)

In [20]:
# create sequences from tokens
def create_seq(tokens, seq_len):
    for i in range(0, len(tokens), seq_len): 
        yield tokens[i:i + seq_len]
        
seqs = list(create_seq(tokens, seq_len))

### Word2Vec

In [21]:
# train and save word2vec model
w2v_model = gensim.models.Word2Vec(seqs, vector_size=128, min_count=1, window=10, epochs=100)
w2v_model.save('w2v.model')

In [22]:
vocab_size, emdedding_size = w2v_model.wv.vectors.shape
vocab_size,emdedding_size

(33831, 128)

In [23]:
example_vector = w2v_model.wv['word']
example_vector

array([  0.5153201 ,   0.04374544,   4.3532104 ,  -3.0908656 ,
         7.378356  ,   1.7359018 ,   2.2519982 ,  -3.5178337 ,
         1.789913  ,  -1.1127359 ,   3.608325  ,   2.295705  ,
         1.4197519 ,  -0.11039281,  -0.36852434,   0.3635096 ,
        -3.6040933 ,  -2.6250308 ,  -4.045992  ,   3.8224006 ,
        -2.2330127 ,  -0.20725523,   0.49607345,  -2.8950317 ,
        -1.9173801 ,   1.1601223 ,   1.636696  ,   3.1894147 ,
         3.6535957 ,   2.733748  ,   1.313707  ,  -0.9115405 ,
         1.1350554 ,  -2.550564  ,  -0.46035883,   0.513885  ,
         2.6159823 ,  -0.7121785 ,   4.6503906 ,   0.3735015 ,
        -4.649176  ,  -2.8768854 ,   0.2758945 ,  -4.438189  ,
        -0.05888746,   0.33716238,  -1.0184946 ,   1.1004277 ,
        -2.7221441 ,   3.8740125 ,   1.5404776 ,  -1.9539399 ,
        -4.6756444 ,   1.3007131 ,  -1.3804142 ,   1.468619  ,
         0.44544432,  -3.7589514 ,   1.4479334 ,  -3.114318  ,
        -1.2487234 ,   0.6174153 ,  -0.44737792,   0.92

In [24]:
example_similar = w2v_model.wv.most_similar('word', topn=10) 
example_similar

[('holographic', 0.5947607159614563),
 ('attr', 0.5925477147102356),
 ('sentence', 0.5900499820709229),
 ('token', 0.5833660960197449),
 ('character', 0.5825883746147156),
 ('words', 0.560817539691925),
 ('glove', 0.5558523535728455),
 ('trouillon', 0.5343304872512817),
 ('wav', 0.511766791343689),
 ('imputation', 0.4909842610359192)]

## Training data preparation

In [25]:
# create inputs and targets (x and y)
x = []
y = []

for s in seqs:
    x.append(" ".join(s[:-1]))
    y.append(" ".join(s[1:]))
    
len(x), len(y)

(17629, 17629)

In [26]:
def word_to_id(word):
    return w2v_model.wv.key_to_index[word]

def id_to_word(id):
    return w2v_model.wv.index_to_key[id]

word_to_id('nlp'), id_to_word(42)

(121, 'propose')

In [27]:
def get_integer_seq(seq):
    return [word_to_id(w) for w in seq.split()]

# convert text sequences to integer sequences
x = [get_integer_seq(i) for i in x]
y = [get_integer_seq(i) for i in y]

len(x),len(y)

(17629, 17629)

In [28]:
# check length of last sequence, remove if not == seq_len
def check_len(seq):
    if len(seq[-1]) != seq_len-1:
        del seq[-1]
    return seq
    
x = check_len(x)
y = check_len(y)

len(x), len(y)

(17628, 17628)

In [29]:
# convert lists to numpy arrays
x = np.array(x)
y = np.array(y)

In [30]:
# save x and y to file
np.save('data/x.npy', x)
np.save('data/y.npy', y)