In [1]:
import json
import os
import pickle
import re
import string
from collections import Counter

import numpy as np
from tqdm import tqdm

In [2]:
import spacy
# Blank English pipeline; this notebook only calls `nlp.tokenizer` on it.
nlp = spacy.blank("en")

In [3]:
# --- Input files ------------------------------------------------------------
train_data_in = "../data/semi_supervised_splits/train.json"
val_data_in = "../data/semi_supervised_splits/val.json"
test_data_in = "../data/test.json"

# --- Output files -----------------------------------------------------------
# NOTE(review): the directory name is spelled "semi_superviesd" (sic). It is
# kept byte-for-byte because other code may already read from this location.
train_data_out = "../preprocessed_data/semi_superviesd/preprocessed_train.pkl"
val_data_out = "../preprocessed_data/semi_superviesd/preprocessed_val.pkl"
test_data_out = "../preprocessed_data/semi_superviesd/preprocessed_test.pkl"

# Pre-trained GloVe vectors. Set the GLOVE_PATH environment variable to point
# at a local copy; the previously hard-coded location is the fallback so that
# existing setups keep working unchanged.
glove_in = os.environ.get(
    "GLOVE_PATH", "/Users/gpsbhargav/projects/glove.840B.300d.txt"
)
glove_out = "../preprocessed_data/semi_superviesd/embedding_matrix"

# Every example is trimmed / padded to exactly this many token ids.
max_seq_len = 250

# Special vocabulary symbols (padding and out-of-vocabulary).
pad_symbol = "<pad>"
unk_symbol = "<unk>"

In [4]:
def pickler(pkl_file, obj):
    """Serialize ``obj`` to ``pkl_file`` using the highest pickle protocol.

    Missing parent directories are created first, so a long preprocessing run
    does not fail at the very end just because the output folder is absent.

    Args:
        pkl_file: destination path of the pickle file.
        obj: any picklable object.
    """
    parent_dir = os.path.dirname(pkl_file)
    if parent_dir:  # '' means "current directory": nothing to create
        os.makedirs(parent_dir, exist_ok=True)
    with open(pkl_file, "wb") as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


def unpickler(pkl_file):
    """Load and return the object stored in the pickle file ``pkl_file``.

    Only use this on files produced by this project: unpickling untrusted
    data can execute arbitrary code.
    """
    with open(pkl_file, "rb") as handle:
        return pickle.load(handle)

In [5]:
class Vocabulary:
    """Token <-> integer-id mapping built from token frequencies.

    Typical use: call :meth:`fit` once per tokenized text to accumulate
    counts, call :meth:`freeze_vocab` to assign ids to the frequent tokens,
    then convert token lists with :meth:`transform_sent` /
    :meth:`batch_transform`.

    Attributes:
        vocab: ``Counter`` of every token seen by :meth:`fit`.
        word_to_id / id_to_word: the two directions of the id mapping; they
            are always updated together.
        min_word_count: threshold used by the last :meth:`freeze_vocab` call
            (2 until that method has been called).
        unk / pad: the out-of-vocabulary and padding symbols.
    """

    def __init__(self, unk = '<unk>', pad='<pad>', other_special_symbols=None):
        """Create an empty vocabulary.

        ``pad`` receives id 0 and ``unk`` id 1; ``other_special_symbols``
        (optional iterable) receive the following ids in order. A symbol that
        is already registered keeps its existing id instead of desynchronizing
        the two mappings.
        """
        self.vocab = Counter()
        self.word_to_id = {}
        self.id_to_word = {}
        # Placeholder value only; freeze_vocab() overwrites it.
        self.min_word_count = 2
        self.unk = unk
        self.pad = pad

        self._add_word(pad)
        self._add_word(unk)

        if other_special_symbols is not None:
            for symbol in other_special_symbols:
                self._add_word(symbol)

    def _add_word(self, word):
        """Give ``word`` the next free id unless it already has one."""
        if word in self.word_to_id:
            return
        new_id = len(self.id_to_word)
        self.id_to_word[new_id] = word
        self.word_to_id[word] = new_id

    def fit(self, text):
        """Add the tokens of ``text`` (an iterable of tokens) to the counts."""
        self.vocab.update(text)

    def freeze_vocab(self, min_word_count = 5):
        """Assign ids to every counted token seen at least ``min_word_count`` times.

        Ids are handed out in order of decreasing frequency; tokens that
        already have an id (e.g. the special symbols) are left untouched, so
        calling this method again does not renumber anything.
        """
        self.min_word_count = min_word_count
        # most_common() is sorted by decreasing count, so the first token
        # below the threshold ends the scan.
        for word, count in self.vocab.most_common():
            if count < self.min_word_count:
                break
            self._add_word(word)

    def transform_sent(self, text):
        """Map each token of ``text`` to its id; unknown tokens get the ``unk`` id."""
        unk_id = self.word_to_id[self.unk]
        return [self.word_to_id.get(token, unk_id) for token in text]

    def batch_transform(self, text_list):
        """Apply :meth:`transform_sent` to every token list in ``text_list``."""
        return [self.transform_sent(text) for text in text_list]

In [6]:
def read_json(file_path):
    """Parse the JSON document stored at ``file_path`` and return the result.

    The file is decoded as UTF-8 explicitly instead of relying on the
    platform's default encoding, which is not UTF-8 everywhere.
    """
    with open(file_path, encoding="utf-8") as in_file:
        return json.load(in_file)

In [7]:
def normalize(text):
    """Return a cleaned, lower-cased version of ``text`` (converted with ``str``).

    Steps, in order:
      1. every character of the punctuation class below (quotes, brackets,
         newlines, ``!``, ``:``, ``;`` ...) becomes a space;
      2. runs of spaces collapse to one space;
      3. runs of ``,`` and of ``?`` collapse to a single character;
      4. the result is lower-cased and stripped.

    NOTE(review): ``!`` is part of the class removed in step 1, so the former
    ``re.sub(r"\\!+", "!", ...)`` step could never match and has been dropped.
    If exclamation marks were meant to be kept, remove ``\\!`` from the class
    instead.
    """
    text = re.sub(
            r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]", " ",
            str(text))
    text = re.sub(r"[ ]+", " ", text)
    text = re.sub(r"\,+", ",", text)
    text = re.sub(r"\?+", "?", text)
    return text.lower().strip()

In [8]:
def tokenize(text):
    """Normalize ``text`` and split it into token strings with the spaCy tokenizer.

    Tokens whose text is exactly one space are dropped.
    """
    tokens = []
    for token in nlp.tokenizer(normalize(text)):
        if token.text == " ":
            continue
        tokens.append(token.text)
    return tokens

# def tokenize(text):
#     return [x.text for x in nlp.tokenizer(text) if x.text != " "]

In [9]:
# Smoke test: tokenize a sentence with punctuation, contractions and parentheses.
print(tokenize("Hello world! I can't fly (In case you didn't know)."))

['hello', 'world', 'i', 'ca', "n't", 'fly', 'in', 'case', 'you', 'did', "n't", 'know', '.']


In [10]:
# Load the training split.
training_data = read_json(train_data_in)

In [11]:
# Number of training examples.
len(training_data)

20000

In [12]:
# Fields available on the first training example.
training_data[0].keys()

dict_keys(['id', 'text', 'label'])

In [13]:
# Load the validation split.
val_data = read_json(val_data_in)

In [14]:
# Number of validation examples.
len(val_data)

5000

In [15]:
# Load the test set.
test_data = read_json(test_data_in)

In [16]:
# Number of test examples.
len(test_data)

25000

## Create vocabulary

In [17]:
# Empty vocabulary holding only the configured special symbols (pad -> id 0, unk -> id 1).
vocab = Vocabulary(unk = unk_symbol, pad=pad_symbol)

In [18]:
# Accumulate token counts from the training split only; validation and test
# texts are not used to build the vocabulary.
for item in tqdm(training_data):
    text = item["text"]
    text_tokenized = tokenize(text)
    vocab.fit(text_tokenized)

100%|██████████| 20000/20000 [00:44<00:00, 444.64it/s]


In [19]:
# Number of distinct tokens counted, before any frequency filtering.
len(vocab.vocab)

80769

In [20]:
# Assign ids to tokens seen at least 5 times (the default min_word_count of freeze_vocab).
vocab.freeze_vocab()

In [21]:
# Final vocabulary size, special symbols included.
len(vocab.id_to_word)

27574

In [22]:
# Sample sentence for a tokens -> ids -> tokens round-trip check.
test_text = "Hello world! I can't fly (In case you didn't know)."

In [23]:
# Step 1 of the round trip: raw text -> tokens.
test_text_tokenized = tokenize(test_text)
print(test_text_tokenized)

['hello', 'world', 'i', 'ca', "n't", 'fly', 'in', 'case', 'you', 'did', "n't", 'know', '.']


In [24]:
# Step 2 of the round trip: tokens -> vocabulary ids.
test_text_ids = vocab.transform_sent(test_text_tokenized)
print(test_text_ids)

[4682, 186, 13, 185, 28, 2400, 12, 430, 27, 80, 28, 130, 4]


In [25]:
# Step 3 of the round trip: map every id back to its token.
reconstructed_test_text = [vocab.id_to_word[token_id] for token_id in test_text_ids]
    

In [26]:
# Should match the tokenized sample printed earlier when no token fell back to the unk symbol.
print(reconstructed_test_text)

['hello', 'world', 'i', 'ca', "n't", 'fly', 'in', 'case', 'you', 'did', "n't", 'know', '.']


In [27]:
# Ids of the special symbols, looked up once for the preprocessing below.
pad_id = vocab.word_to_id[pad_symbol]
unk_id = vocab.word_to_id[unk_symbol]

In [28]:
# Expected: 0 for padding and 1 for unknown, as assigned in Vocabulary.__init__.
print(pad_id)
print(unk_id)

0
1


## Preprocess data

In [29]:
def pad_trim(sequence, max_len, pad_id):
    ''' Puts padding before actual data.

    Keeps at most the first ``max_len`` items of ``sequence`` and left-pads
    the result with ``pad_id`` up to ``max_len`` items.

    Returns:
        (seq, mask): ``seq`` is the padded/trimmed list; ``mask`` is a binary
        list of the same length with 1 for kept items and 0 for padding. The
        mask uses 0 for padding whatever ``pad_id`` is, so ``sum(mask)`` is
        always the number of kept items.
    '''
    kept = list(sequence[:max_len])
    # A non-positive difference means nothing to pad (list * negative == []).
    n_pad = max(max_len - len(kept), 0)
    seq = [pad_id] * n_pad + kept
    mask = [0] * n_pad + [1] * len(kept)
    return seq, mask

In [30]:
# Quick check: three items padded on the left to length five.
pad_trim(sequence=[1,2,3], max_len=5, pad_id=0)

([0, 0, 1, 2, 3], [0, 0, 1, 1, 1])

In [31]:
def process_one_example(data, vocab, pad_id, max_seq_len):
    '''
    Convert one raw example into fixed-length model inputs.

    Inputs:
        data: dict with the keys "id", "text" and "label"
        vocab: object whose transform_sent(tokens) maps tokens to ids
        pad_id: id used to pad short texts
        max_seq_len: length every output sequence is trimmed / padded to

    Outputs:
    {
        "id": example id, copied from the input
        "text": list of max_seq_len token ids, padded on the left with pad_id
        "word_mask": mask returned by pad_trim (1 for each kept token, 0 for padding)
        "label": 0 for 'neg', 1 for 'pos', -1 for any other label value
        "num_tokens": number of real tokens kept after trimming
    }
    '''
    token_ids = vocab.transform_sent(tokenize(data["text"]))
    text_ids_fixed_len, mask = pad_trim(sequence=token_ids, max_len=max_seq_len,
                                        pad_id=pad_id)
    # Counted from the ids themselves so the value does not depend on how the
    # mask encodes padding.
    num_tokens = len(token_ids[:max_seq_len])

    raw_label = data["label"]
    if raw_label == 'neg':
        label = 0
    elif raw_label == 'pos':
        label = 1
    else:
        # Anything that is neither 'neg' nor 'pos' is marked with -1.
        label = -1

    # Key order is kept as before: id, text, word_mask, label, num_tokens.
    return {
        "id": data["id"],
        "text": text_ids_fixed_len,
        "word_mask": mask,
        "label": label,
        "num_tokens": num_tokens,
    }

In [32]:
# Hand-written example with the same fields as the loaded data.
text_example = {"id": 123, "text": "Hello world! I still can't fly.", "label":'neg' }

In [33]:
# Run the single-example pipeline on the hand-written example.
processed_test_example = process_one_example(data=text_example, vocab=vocab, 
                    pad_id=pad_id, max_seq_len=max_seq_len)

In [34]:
# Fields produced for each processed example.
processed_test_example.keys()

dict_keys(['id', 'text', 'word_mask', 'label', 'num_tokens'])

In [35]:
# Scalar fields: id is copied through, 'neg' maps to 0, then the real-token count.
print(processed_test_example["id"])
print(processed_test_example["label"])
print(processed_test_example["num_tokens"])

123
0
8


In [36]:
# Tail of the id sequence: padding comes first, the real token ids are at the end.
print(processed_test_example["text"][-15:])

[0, 0, 0, 0, 0, 0, 0, 4682, 186, 13, 135, 185, 28, 2400, 4]


In [37]:
# Tail of the mask, aligned with the id sequence printed above.
print(processed_test_example["word_mask"][-15:])

[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]


In [38]:
def process_all_data(all_data, vocab, pad_id, max_seq_len):
    """Apply process_one_example to every item of ``all_data``.

    Shows a tqdm progress bar and returns the processed examples as a list,
    in input order.
    """
    return [
        process_one_example(data=example, vocab=vocab,
                            pad_id=pad_id, max_seq_len=max_seq_len)
        for example in tqdm(all_data)
    ]

In [39]:
# Convert the training split to fixed-length id sequences.
processed_training_data = process_all_data(training_data, vocab, pad_id, max_seq_len)

100%|██████████| 20000/20000 [00:46<00:00, 434.28it/s]


In [40]:
# Sanity check: no example was dropped or duplicated.
assert len(processed_training_data) == len(training_data)

In [41]:
# Convert the validation split with the vocabulary built from the training split.
processed_val_data = process_all_data(val_data, vocab, pad_id, max_seq_len)

100%|██████████| 5000/5000 [00:10<00:00, 472.04it/s]


In [42]:
# Sanity check: no example was dropped or duplicated.
assert len(processed_val_data) == len(val_data)

In [43]:
# Convert the test set with the vocabulary built from the training split.
processed_test_data = process_all_data(test_data, vocab, pad_id, max_seq_len)

100%|██████████| 25000/25000 [00:52<00:00, 472.19it/s]


In [44]:
# Sanity check: no example was dropped or duplicated.
assert len(processed_test_data) == len(test_data)

In [45]:
# Persist the processed training examples.
pickler(pkl_file=train_data_out, obj=processed_training_data)

In [46]:
# Persist the processed validation examples.
pickler(pkl_file=val_data_out, obj=processed_val_data)

In [47]:
# Persist the processed test examples.
pickler(pkl_file=test_data_out, obj=processed_test_data)

## Prepare GloVe embedding matrix

In [48]:
EMBEDDING_SIZE = 300
VOCAB_SIZE = len(vocab.word_to_id)

# Read every GloVe entry into memory as word -> float32 vector.
# NOTE(review): the single-space split(' ') is kept on purpose; presumably some
# GloVe tokens contain other whitespace characters that a bare split() would
# break on -- confirm before changing it.
embeddings_index = {}
with open(glove_in, encoding='utf8') as f:  # closed even if parsing fails
    for line in tqdm(f):
        values = line.split(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print("Read GloVe file")

# The special symbols never get a GloVe lookup, whatever the file contains:
# the pad row stays all-zero and the unk row is filled in below. They are
# taken from the vocabulary so this cell follows the configured symbols.
special_symbols = {vocab.unk, vocab.pad}

# prepare embedding matrix
print("Preparing embedding matrix")
count_not_found = 0
found_ids = []  # rows that received a real GloVe vector
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_SIZE))
for word, i in vocab.word_to_id.items():
    if word in special_symbols:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        found_ids.append(i)
    else:
        # Words missing from GloVe keep an all-zero row.
        count_not_found += 1

# Initialize the unk row to the mean of the embeddings that were actually
# found. Averaging over the whole matrix would also average in the all-zero
# rows (pad, unk, words missing from GloVe) and shrink the vector toward zero.
if found_ids:
    embedding_matrix[vocab.word_to_id[vocab.unk]] = embedding_matrix[found_ids].mean(axis = 0)

print("Embedding matrix shape: ",embedding_matrix.shape)  
print("Number of words not found in GloVe: ",count_not_found)
print("Number of words in GloVe: ", len(embeddings_index))
np.save(glove_out, embedding_matrix)
print("Done")

2196017it [03:23, 10796.77it/s]


Read GloVe file
Preparing embedding matrix
Embedding matrix shape:  (27574, 300)
Number of words not found in GloVe:  1979
Number of words in GloVe:  2196016
Done
