# Preprocessing

In [1]:
import re
import numpy as np

from pybtex.database import parse_file

import gensim



## Parsing

In [None]:
# parse the anthology BibTeX file; the result exposes the `entries` mapping used below
bib_data = parse_file('data/anthology+abstracts.bib')

In [None]:
# check the last entry: show the key of the final parsed entry
list(bib_data.entries.keys())[-1]

In [None]:
# number of entries in the anthology (size of the entries mapping)
len(bib_data.entries)

In [None]:
# build the raw corpus: a single .txt file with one abstract per line, taken
# from every entry whose year is after 2015
# NOTE: `year` is compared as a string, which orders correctly only for
# four-digit years
# The file is opened once in 'w' mode: the handle is always closed by the
# `with` block, and re-running the cell rebuilds the file instead of
# appending a second copy of every abstract.
with open('data/datasets/abstracts.txt', 'w') as corpus_file:
    for entry in bib_data.entries.values():
        try:
            year = entry.fields['year']
            abstract = entry.fields['abstract']
        except KeyError:
            # entries without a year or an abstract are skipped
            continue

        if year > '2015':
            try:
                corpus_file.write(abstract + '\n')
            except UnicodeEncodeError:
                # abstracts the file's default encoding cannot represent are skipped
                continue

## Tokenization

In [2]:
# read the whole corpus file, then split it on newlines: one list element per line
with open('data/datasets/abstracts.txt') as corpus_file:
    text = corpus_file.read()

abstracts = text.split('\n')

In [3]:
# number of elements produced by splitting the corpus on '\n'
# (if the file ends with a newline, the last element is an empty string)
len(abstracts)

21943

In [4]:
def tokenize(input_text):
    """Split a text into lowercase word and punctuation tokens.

    The text is lowercased and every run of digits is deleted before
    matching, so characters on either side of a removed number become
    adjacent. A token is either a run of word characters (optionally
    continued by apostrophe-joined parts, e.g. "don't") or one single
    character that is neither a word character nor whitespace.

    Returns the tokens as a list of strings, in order of appearance.
    """
    # lowercase first, then drop the numerical characters
    digitless = re.sub(r'\d+', '', input_text.lower())

    # words (apostrophes kept inside them) and punctuation marks come out
    # as separate tokens; whitespace is never returned
    return re.findall(r"\w+(?:'\w+)*|[^\w\s]", digitless)

In [5]:
# tokenize half of the abstracts: every second element, starting at index 1
tokenized = list(map(tokenize, abstracts[1::2]))

In [6]:
# example of a tokenized abstract (words and punctuation marks are separate tokens)
tokenized[42]

['the',
 'introduction',
 'of',
 'transformer',
 '-',
 'based',
 'language',
 'models',
 'has',
 'been',
 'a',
 'revolutionary',
 'step',
 'for',
 'natural',
 'language',
 'processing',
 '(',
 'nlp',
 ')',
 'research',
 '.',
 'these',
 'models',
 ',',
 'such',
 'as',
 'bert',
 ',',
 'gpt',
 'and',
 'electra',
 ',',
 'led',
 'to',
 'state',
 '-',
 'of',
 '-',
 'the',
 '-',
 'art',
 'performance',
 'in',
 'many',
 'nlp',
 'tasks',
 '.',
 'most',
 'of',
 'these',
 'models',
 'were',
 'initially',
 'developed',
 'for',
 'english',
 'and',
 'other',
 'languages',
 'followed',
 'later',
 '.',
 'recently',
 ',',
 'several',
 'arabic',
 '-',
 'specific',
 'models',
 'started',
 'emerging',
 '.',
 'however',
 ',',
 'there',
 'are',
 'limited',
 'direct',
 'comparisons',
 'between',
 'these',
 'models',
 '.',
 'in',
 'this',
 'paper',
 ',',
 'we',
 'evaluate',
 'the',
 'performance',
 'of',
 'of',
 'these',
 'models',
 'on',
 'arabic',
 'sentiment',
 'and',
 'sarcasm',
 'detection',
 '.',
 'our'

In [7]:
# total number of tokenized abstracts (every second element of `abstracts`)
len(tokenized)

10971

In [8]:
# example of a single token: the third token of the fifth tokenized abstract
tokenized[4][2]

'approaches'

In [9]:
# flatten the per-abstract token lists into one list, preserving order
tokens = []
for abstract_tokens in tokenized:
    tokens.extend(abstract_tokens)

In [10]:
# spot-check one token from the flattened list
tokens[2021]

'can'

In [11]:
# total number of tokens across all tokenized abstracts
len(tokens)

1764673

In [12]:
# save the flat token list to file, written as its Python list literal
with open('data/tokens.txt', 'w') as tokens_file:
    tokens_file.write(repr(tokens))

In [13]:
# save the per-abstract token lists to file, written as a Python list-of-lists literal
with open('data/tokenized.txt', 'w') as tokenized_file:
    tokenized_file.write(repr(tokenized))

## Word embeddings

In [14]:
# mean number of tokens per tokenized abstract
total_tokens = sum(len(abstract_tokens) for abstract_tokens in tokenized)
abs_avg_len = total_tokens / float(len(tokenized))
abs_avg_len

160.84887430498588

In [15]:
# sequence length is the average abstract length in tokens, truncated to an integer
seq_len = int(abs_avg_len)

In [16]:
def create_seq(tokens, seq_len):
    """Yield consecutive, non-overlapping slices of `tokens`.

    Each slice holds `seq_len` items, except possibly the last one, which
    holds whatever remains. Nothing is yielded for an empty input.
    """
    chunk_starts = range(0, len(tokens), seq_len)
    yield from (tokens[start:start + seq_len] for start in chunk_starts)
        
# cut the flat token list into consecutive chunks of seq_len tokens; chunks
# follow the token stream rather than abstract boundaries, and the final
# chunk may be shorter than seq_len
seqs = list(create_seq(tokens, seq_len))

### Word2Vec

In [17]:
# train Word2Vec on the fixed-length token sequences and save the model
# min_count=1: per the gensim docs no token is dropped for being rare; the
# id lookups further down index the vocabulary with every token, so they
# rely on that
# NOTE(review): no `seed` is passed, so the vectors (and the neighbours
# shown below) are presumably not reproducible from run to run — confirm
w2v_model = gensim.models.Word2Vec(
    sentences=seqs,
    vector_size=128,
    min_count=1,
    window=10,
    epochs=100,
)
w2v_model.save('w2v.model')

In [18]:
# shape of the embedding matrix: (number of vocabulary entries, vector size)
vocab_size, embedding_size = w2v_model.wv.vectors.shape
# the original name was misspelled; keep it as an alias for any code that still uses it
emdedding_size = embedding_size
vocab_size, embedding_size

(28674, 128)

In [19]:
# embedding vector stored in the model for the token 'word'
example_vector = w2v_model.wv['word']
example_vector

array([ 6.6060996e+00, -1.3007619e+00,  3.3873212e+00,  8.6358613e-01,
        9.0070641e-01, -5.8032436e+00, -2.7593493e+00, -1.3732038e+00,
        7.0177517e+00,  5.8316522e+00, -5.1565833e+00, -5.6226168e+00,
        2.1485298e+00,  5.2491462e-01,  3.2060587e-01, -2.8187492e+00,
       -3.7959337e+00, -2.0214845e-01,  3.9961777e+00,  1.5625319e+00,
        2.0373197e+00,  8.3196133e-01,  7.5861788e-01,  4.2176123e+00,
       -9.0352923e-01, -1.1537633e+00,  4.7059399e-01,  1.8872919e+00,
       -1.6310766e-01, -8.7472683e-01,  1.3974133e+00,  3.6043617e-01,
       -6.5412030e+00,  3.1942153e+00, -1.6843088e+00, -6.9167361e-02,
        2.7991202e+00, -1.6959479e+00,  5.2412882e+00,  5.7129925e-01,
       -1.5085722e+00,  1.4122216e+00, -3.5602298e+00,  4.1368194e+00,
       -4.1261029e+00,  2.6524754e+00,  2.4568646e+00,  1.5814502e+00,
       -1.4717021e+00,  1.5965420e+00,  7.4465489e-01,  2.1413019e-01,
       -4.7691889e+00, -1.4005057e+00, -1.3282962e+00, -4.8320908e-03,
      

In [20]:
# the ten tokens most similar to 'word', each paired with its similarity score
example_similar = w2v_model.wv.most_similar('word', topn=10) 
example_similar

[('holographic', 0.5947859883308411),
 ('lexicosemantic', 0.556775689125061),
 ('words', 0.5488619208335876),
 ('token', 0.5426555275917053),
 ('sentence', 0.5274664759635925),
 ('multisense', 0.5219385027885437),
 ('character', 0.507339596748352),
 ('trouillon', 0.5060521364212036),
 ('subword', 0.49046769738197327),
 ('morpheme', 0.48854026198387146)]

## Training data preparation

In [21]:
# create inputs and targets (x and y): for every sequence the input is all
# tokens but the last and the target is all tokens but the first, i.e. the
# same sequence shifted by one position; both are stored space-joined
x = [" ".join(seq[:-1]) for seq in seqs]
y = [" ".join(seq[1:]) for seq in seqs]

len(x), len(y)

(11030, 11030)

In [22]:
def word_to_id(word):
    """Return the integer index of `word` in the Word2Vec vocabulary.

    Reads the module-level `w2v_model`; a word that is not in its
    `key_to_index` mapping makes the lookup fail.
    """
    vocab_index = w2v_model.wv.key_to_index
    return vocab_index[word]

def id_to_word(id):
    """Return the vocabulary word stored at integer index `id`.

    Reads the module-level `w2v_model`; the inverse of `word_to_id`.
    """
    vocab_words = w2v_model.wv.index_to_key
    return vocab_words[id]

# example lookups in both directions: word -> id and id -> word
word_to_id('nlp'), id_to_word(42)

(137, 'show')

In [23]:
def get_integer_seq(seq):
    """Convert a whitespace-separated string of tokens into a list of vocabulary ids."""
    return list(map(word_to_id, seq.split()))

# convert text sequences to integer sequences
# NOTE: x and y are overwritten here, so this cell cannot be re-run on its
# own; re-run the cell that builds them first
x = list(map(get_integer_seq, x))
y = list(map(get_integer_seq, y))

len(x),len(y)

(11030, 11030)

In [24]:
# check length of last sequence, remove it if not == expected length
def check_len(seq, expected_len=None):
    """Drop the final sequence when its length differs from the expected one.

    Parameters
    ----------
    seq : list of sequences
        Modified in place: at most its last element is deleted.
    expected_len : int, optional
        Required length of the last element. Defaults to `seq_len - 1`,
        read from the module-level `seq_len` (inputs and targets each
        hold one token fewer than a full sequence).

    Returns
    -------
    The same list object that was passed in. An empty list is returned
    unchanged instead of raising IndexError.
    """
    if expected_len is None:
        expected_len = seq_len - 1

    # only the last element is checked; an empty list has nothing to check
    if len(seq) > 0 and len(seq[-1]) != expected_len:
        del seq[-1]
    return seq
    
# drop the trailing sequence from inputs and targets if it has the wrong length
x, y = check_len(x), check_len(y)

len(x), len(y)

(11029, 11029)

In [25]:
# convert the lists of id sequences to numpy arrays
x, y = map(np.array, (x, y))

In [26]:
# save the input and target arrays to file
for out_path, out_array in (('data/x.npy', x), ('data/y.npy', y)):
    np.save(out_path, out_array)