# Imports

In [1]:
import csv
import re
import numpy as np

# Constants

In [2]:
TRAIN = '../data/interim/train.csv'
DEV   = '../data/interim/dev.csv'
TEST  = '../data/interim/test.csv'

# Functions

In [3]:
def loader(PATH):
    with open(PATH, mode ='r')as file:
        csvFile = csv.reader(file)
        text = []
        for lines in csvFile:
            text.append(lines)
    return text


def splitter(L):
    X = []
    y = []
    for i in L:
        X.append(i[0])
        y.append(i[1])
        
    return X, y

# Load Data

In [4]:
train_data = loader(TRAIN) # Training
dev_data = loader(DEV)     # Validation
X_test = loader(TEST)      # Test

In [5]:
len(train_data), len(dev_data), len(X_test)

(100000, 10000, 10000)

In [6]:
train_data[0]

['Gotta listen to this! So creative!  Love his music - the words, the message! Some of my favorite songs on this CD. I should have bought it years ago!',
 '1']

# Tokenize

In [7]:
def tokenizer(sentence):
    """Function to find all tokens in a given sentence
    """
    tok = re.compile('[\'\"]|[A-Za-z]+|[.?!:\'\"]+')
    
    return tok.findall(sentence)

In [8]:
X_train, y_train = splitter(train_data)
X_dev, y_dev = splitter(dev_data)

In [9]:
# hand-made tokenization
print(tokenizer(X_train[0]))

['Gotta', 'listen', 'to', 'this', '!', 'So', 'creative', '!', 'Love', 'his', 'music', 'the', 'words', 'the', 'message', '!', 'Some', 'of', 'my', 'favorite', 'songs', 'on', 'this', 'CD', '.', 'I', 'should', 'have', 'bought', 'it', 'years', 'ago', '!']


In [10]:
X_train_tokens = []
for sentence in X_train:
    temp = tokenizer(sentence)
    if len(temp) > 0:
        if len(temp) > 500:
            X_train_tokens.append(temp[0:500])
        else: X_train_tokens.append(temp)
    else: X_train_tokens.append('NULL')
print(len(X_train_tokens))

100000


In [11]:
for x in X_train_tokens:
    if len(x) == 0 or len(x) > 500:
        print(x)

In [12]:
for i in X_train_tokens:
    if i == []:
        print('!!!')

In [13]:
X_dev_tokens = []
for sentence in X_dev:
    X_dev_tokens.append(tokenizer(sentence))
print(len(X_dev_tokens))

10000


In [14]:
X_test[0], X_train[0]

(['ok ok'],
 'Gotta listen to this! So creative!  Love his music - the words, the message! Some of my favorite songs on this CD. I should have bought it years ago!')

In [15]:
X_test_tokens = []
for sentence in X_test:
    X_test_tokens.append(tokenizer(str(sentence)))
print(len(X_test_tokens))

10000


## Compare with library tokenizer

In [16]:
#from transformers import AutoTokenizer
#tokzr = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')

In [17]:
# AutoTokenizer tokenization
#print(tokzr.tokenize(X_train[0]))

In [18]:
import torch

if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: NVIDIA GeForce RTX 2060 with Max-Q Design


In [36]:
from transformers import BertTokenizer

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Create a function to tokenize a set of texts
def preprocessing_for_bert(data):
    """Perform required preprocessing steps for pretrained BERT.
    @param    data (np.array): Array of texts to be processed.
    @return   input_ids (torch.Tensor): Tensor of token ids to be fed to a model.
    @return   attention_masks (torch.Tensor): Tensor of indices specifying which
                  tokens should be attended to by the model.
    """
    # Create empty lists to store outputs
    input_ids = []
    attention_masks = []

    # For every sentence...
    for sent in data:
        # `encode_plus` will:
        #    (1) Tokenize the sentence
        #    (2) Add the `[CLS]` and `[SEP]` token to the start and end
        #    (3) Truncate/Pad sentence to max length
        #    (4) Map tokens to their IDs
        #    (5) Create attention mask
        #    (6) Return a dictionary of outputs
        encoded_sent = tokenizer.encode_plus(
            text=sent,  # Preprocess sentence
            add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
            max_length=MAX_LEN,                  # Max length to truncate/pad
            padding='max_length',         # Pad sentence to max length
            #return_tensors='pt',           # Return PyTorch tensor
            return_attention_mask=True,      # Return attention mask
            truncation = True
            )
        
        # Add the outputs to the lists
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    # Convert lists to tensors
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)

    return input_ids, attention_masks

In [20]:
# Encode our concatenated data
encoded_ = [tokenizer.encode(sent, add_special_tokens=True) for sent in X_train_tokens]

In [39]:
l = 0
for sent in encoded_:
    if len(sent) > l:
        l = len(sent)
        
print(l)

502


In [49]:
# Specify `MAX_LEN`
MAX_LEN = l

# Print sentence 0 and its encoded token ids
token_ids = preprocessing_for_bert(X_train[0:2])
print('Original: ', X_train[0])
print('Token IDs: ', token_ids[0])


# Run function `preprocessing_for_bert` on the train set and the validation set
print('Tokenizing data...')
train_inputs, train_masks = preprocessing_for_bert(X_train)
val_inputs, val_masks = preprocessing_for_bert(X_dev)
print('F\'ing Done!!')

Original:  Gotta listen to this! So creative!  Love his music - the words, the message! Some of my favorite songs on this CD. I should have bought it years ago!
Token IDs:  tensor([[  101, 10657,  4952,  ...,     0,     0,     0],
        [  101,  9467,  9467,  ...,     0,     0,     0]])
Tokenizing data...
F'ing Done!!


In [50]:
len(train_inputs), len(train_masks), len(val_inputs), len(val_masks)

(100000, 100000, 10000, 10000)

In [51]:
train_inputs

tensor([[  101, 10657,  4952,  ...,     0,     0,     0],
        [  101,  9467,  9467,  ...,     0,     0,     0],
        [  101,  4965,  1996,  ...,     0,     0,     0],
        ...,
        [  101,  2274,  3340,  ...,     0,     0,     0],
        [  101,  2919,  3617,  ...,     0,     0,     0],
        [  101,  1012,  1012,  ...,     0,     0,     0]])