# Imports

In [1]:
import csv
import re

# Constants

In [2]:
# Locations of the interim CSV splits, relative to this notebook's directory.
TRAIN = '../data/interim/train.csv'
DEV = '../data/interim/dev.csv'
TEST = '../data/interim/test.csv'

# Functions

In [3]:
def loader(PATH, encoding='utf-8'):
    """Read a CSV file and return all of its records.

    Parameters
    ----------
    PATH : str or path-like
        Location of the CSV file to read.
    encoding : str, optional
        Text encoding used to decode the file. Defaults to 'utf-8' so the
        result does not depend on the platform's locale encoding.

    Returns
    -------
    list of list of str
        One inner list per CSV record, in file order, exactly as produced
        by ``csv.reader``. An empty file gives an empty list.

    Raises
    ------
    OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If the file content is not valid in the requested encoding.
    """
    # newline='' hands line endings to the csv module untouched. Without it,
    # universal-newline translation rewrites '\r\n' inside quoted fields,
    # so multi-line text fields come back altered.
    with open(PATH, mode='r', newline='', encoding=encoding) as file:
        return list(csv.reader(file))


def splitter(L):
    """Split rows into two parallel lists taken from columns 0 and 1.

    Each element of ``L`` is indexed at positions 0 and 1; any further
    positions are ignored. ``L`` is traversed exactly once.

    Returns a tuple ``(X, y)`` where ``X`` holds every row's first item and
    ``y`` every row's second item, both in input order. A row with fewer
    than two items raises ``IndexError``.
    """
    pairs = [(row[0], row[1]) for row in L]
    X = [first for first, _ in pairs]
    y = [second for _, second in pairs]
    return X, y

# Load Data

In [4]:
# Read every split from disk; each loaded element is one CSV row (a list of fields).
train_data = loader(TRAIN) # Training rows
dev_data = loader(DEV)     # Validation rows
X_test = loader(TEST)      # Test rows (kept as raw row lists, not passed through splitter)

In [5]:
# Number of rows loaded for each split: (train, dev, test).
len(train_data), len(dev_data), len(X_test)

(100000, 10000, 10000)

In [6]:
# Peek at the first training row to see the raw record layout.
train_data[0]

['Gotta listen to this! So creative!  Love his music - the words, the message! Some of my favorite songs on this CD. I should have bought it years ago!',
 '1']

# Tokenize

In [7]:
def tokenizer(sentence):
    """Return the tokens found in a sentence, in order of appearance.

    Scanning left to right, a token is the first of these that matches:
    a single quote character (single or double), a run of ASCII letters,
    or a run made of the characters . ? ! : and the two quote characters.
    Anything else (digits, whitespace, commas, hyphens, non-ASCII letters)
    matches nothing and is left out of the result.

    ``sentence`` must be a string; the result is a list of strings.
    """
    return re.findall('[\'\"]|[A-Za-z]+|[.?!:\'\"]+', sentence)

In [8]:
# Separate column 0 (X) from column 1 (y) for the train and dev rows.
X_train, y_train = splitter(train_data)
X_dev, y_dev = splitter(dev_data)

In [9]:
# Hand-made tokenization of the first training sentence. Commas, hyphens and
# digits are not matched by the tokenizer's regex, so they do not appear below.
print(tokenizer(X_train[0]))

['Gotta', 'listen', 'to', 'this', '!', 'So', 'creative', '!', 'Love', 'his', 'music', 'the', 'words', 'the', 'message', '!', 'Some', 'of', 'my', 'favorite', 'songs', 'on', 'this', 'CD', '.', 'I', 'should', 'have', 'bought', 'it', 'years', 'ago', '!']


In [10]:
# Tokenize every training sentence, keeping one token list per example.
X_train_tokens = [tokenizer(text) for text in X_train]
print(len(X_train_tokens))

100000


In [11]:
# Tokenize every validation sentence, keeping one token list per example.
X_dev_tokens = [tokenizer(text) for text in X_dev]
print(len(X_dev_tokens))

10000


In [12]:
# Compare element types: X_test[0] is still a whole CSV row (a list of fields),
# whereas X_train[0] is a plain string.
X_test[0], X_train[0]

(['ok ok'],
 'Gotta listen to this! So creative!  Love his music - the words, the message! Some of my favorite songs on this CD. I should have bought it years ago!')

In [13]:
# Each X_test entry is a csv.reader row, i.e. a list of fields such as
# ['ok ok'], not a string. Tokenizing str(row) would tokenize the list's repr,
# so the quote characters wrapping each field (and any escape sequences repr
# adds) would show up as extra tokens. Join the fields to recover the raw
# text instead; an empty row simply yields an empty token list.
X_test_tokens = [tokenizer(' '.join(row)) for row in X_test]
print(len(X_test_tokens))

10000


## Compare with library tokenizer

In [14]:
# Library tokenizer for the 'bert-base-multilingual-cased' checkpoint, used
# only as a reference point for the hand-made tokenizer.
# NOTE(review): from_pretrained presumably fetches and caches the tokenizer
# files on first use — confirm network access / cache availability.
from transformers import AutoTokenizer
tokzr = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')

In [15]:
# AutoTokenizer tokenization of the same first training sentence. In the
# output below it keeps ',' and '-' as tokens and splits 'Gotta' into the
# pieces 'Gott' and '##a'.
print(tokzr.tokenize(X_train[0]))

['Gott', '##a', 'listen', 'to', 'this', '!', 'So', 'creative', '!', 'Love', 'his', 'music', '-', 'the', 'words', ',', 'the', 'message', '!', 'Some', 'of', 'my', 'favorite', 'songs', 'on', 'this', 'CD', '.', 'I', 'should', 'have', 'bought', 'it', 'years', 'ago', '!']
