## Mount Drive & Load data

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the data paths used later in this notebook point into this mount.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def read_data_file(dir, numeric = False):
    """Load a one-record-per-line UTF-8 text file.

    Every line is split on whitespace, each token that starts with
    ``'wzjwz'`` is replaced by the placeholder ``'<name>'``, and the tokens
    are joined back together with single spaces.

    Args:
        dir: Path of the file to read.
        numeric: When true, every cleaned line is converted with ``int``.

    Returns:
        A list with one entry per line: cleaned strings, or ints when
        ``numeric`` is true.
    """
    def _anonymise(token):
        # Tokens carrying the 'wzjwz' prefix are collapsed to one placeholder.
        return '<name>' if token.startswith('wzjwz') else token

    with open(dir, 'r', encoding='utf-8') as handle:
        cleaned = [
            ' '.join(_anonymise(token) for token in raw_line.split())
            for raw_line in handle
        ]

    return [int(entry) for entry in cleaned] if numeric else cleaned

In [None]:
def read_data_file2(dir):
    """Load a CSV file whose last two columns are the sentiment and topic labels.

    Everything before the last two columns is treated as the sentence text.
    The text is split on whitespace, tokens starting with ``'wzjwz'`` are
    replaced by ``'<name>'`` and the tokens are re-joined with single spaces.
    A row whose first text token is ``'sents'`` is treated as the header and
    skipped.

    The file is parsed with the ``csv`` module, so quoted fields (sentences
    that contain commas or quotes) are unquoted correctly instead of keeping
    stray ``"`` characters. Blank rows are ignored.

    Args:
        dir: Path of the CSV file.

    Returns:
        A ``(texts, sentiments, topics)`` tuple of three equally long lists:
        cleaned sentences (``str``), sentiment labels (``int``) and topic
        labels (``int``).

    Raises:
        ValueError: If a non-blank row has fewer than three columns, or a
            label cannot be converted to ``int``.
    """
    import csv  # local import keeps this cell self-contained

    texts, sentiments, topics = [], [], []
    # newline='' is what the csv module expects so that it can handle
    # line breaks inside quoted fields itself.
    with open(dir, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        for row in reader:
            # Skip blank lines (for example a trailing newline at end of file).
            if not any(field.strip() for field in row):
                continue
            if len(row) < 3:
                raise ValueError(
                    'Expected at least 3 columns (text, sentiment, topic) at line '
                    '%d of %s, got %d' % (reader.line_num, dir, len(row))
                )

            # Unquoted commas inside the sentence produce extra columns;
            # gluing them back together keeps the sentence intact.
            text = ','.join(row[:-2]).split()
            if text and text[0] == 'sents':
                continue  # header row

            text = ['<name>' if token.startswith('wzjwz') else token for token in text]
            texts.append(' '.join(text))
            sentiments.append(int(row[-2]))
            topics.append(int(row[-1]))

    return texts, sentiments, topics

In [None]:
# Folder holding the raw train/dev/test splits. The path is relative, so it
# resolves against the current working directory.
data_root = 'drive/MyDrive/DeepLearningProject_Group3/_UIT-VSFC/'
train_path = data_root + 'train/'
dev_path = data_root + 'dev/'
test_path = data_root + 'test/'


def _load_split(split_path):
    """Read sentences, sentiment labels and topic labels stored under ``split_path``.

    Raises:
        ValueError: If the three files do not contain the same number of lines,
            because the records would then no longer be aligned.
    """
    texts = read_data_file(split_path + 'sents.txt')
    sentiments = read_data_file(split_path + 'sentiments.txt', numeric=True)
    topics = read_data_file(split_path + 'topics.txt', numeric=True)
    if not (len(texts) == len(sentiments) == len(topics)):
        raise ValueError(
            'Misaligned split %s: %d sentences, %d sentiments, %d topics'
            % (split_path, len(texts), len(sentiments), len(topics))
        )
    return texts, sentiments, topics


train_texts, train_sentiments, train_topics = _load_split(train_path)
dev_texts, dev_sentiments, dev_topics = _load_split(dev_path)
test_texts, test_sentiments, test_topics = _load_split(test_path)

In [None]:
# All three word-segmented CSV files live in the same folder, so the three
# path variables share one literal (the stray '//' of the old train path is gone).
segmented_dir = 'drive/MyDrive/DeepLearningProject_Group3/segmented_data/'
train_path2 = dev_path2 = test_path2 = segmented_dir

train_texts_segmented, train_sentiments_segmented, train_topics_segmented = read_data_file2(train_path2 + 'train_segmented.csv')
dev_texts_segmented, dev_sentiments_segmented, dev_topics_segmented = read_data_file2(dev_path2 + 'dev_segmented.csv')
test_texts_segmented, test_sentiments_segmented, test_topics_segmented = read_data_file2(test_path2 + 'test_segmented.csv')

In [None]:
# Length of the longest training sentence, counted in whitespace-separated tokens.
# NOTE(review): the encoding step further down uses the torchtext tokenizer and
# truncates to this length; if that tokenizer yields more tokens than
# str.split(), the longest sentences lose their tail. Confirm.
maxlen = max(len(sentence.split()) for sentence in train_texts)
maxlen

159

In [None]:
# Same measurement for the word-segmented training sentences.
maxlen_segmented = max(len(sentence.split()) for sentence in train_texts_segmented)
maxlen_segmented

124

## Preprocess data

In [None]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from keras.preprocessing.sequence import pad_sequences
import torch 

# Every encoded sentence is padded or truncated to the longest training
# sentence of its corpus variant.
sequence_size, sequence_size_segmented = maxlen, maxlen_segmented

### Tokenizer encoding

In [None]:
tokenizer = get_tokenizer('basic_english')


def _build_vocab(texts):
    """Build a vocabulary from ``texts``.

    ``<unk>`` is registered as a special token and used as the default index
    for out-of-vocabulary words; ``<pad>`` is appended after all other tokens.
    """
    built = build_vocab_from_iterator(map(tokenizer, texts), specials=['<unk>'])
    # Look the default index up in the vocabulary being configured. The
    # earlier code configured vocab_segmented with an id read from `vocab`.
    built.set_default_index(built['<unk>'])
    built.append_token('<pad>')
    return built


# Both vocabularies are fitted on the training sentences only.
vocab = _build_vocab(train_texts)
vocab_segmented = _build_vocab(train_texts_segmented)

In [None]:
# Size of the unsegmented vocabulary at this point: tokens seen in train_texts plus <unk> and <pad>.
len(vocab)

2277

In [None]:
# Size of the word-segmented vocabulary at this point: tokens seen in train_texts_segmented plus <unk> and <pad>.
len(vocab_segmented)

3424

In [None]:
def tokenize(text_set, tokenizer, vocab, sequence_size):
    """Encode sentences as fixed-length lists of vocabulary ids.

    Args:
        text_set: Iterable of raw sentences.
        tokenizer: Callable that splits one sentence into tokens.
        vocab: Vocabulary that is callable on a token list (returning ids)
            and indexable by token; it must contain ``"<pad>"``.
        sequence_size: Length of every returned sequence.

    Returns:
        A list with one list of ids per sentence. Sentences longer than
        ``sequence_size`` are cut at the end; shorter ones are filled at the
        end with the ``"<pad>"`` id.
    """
    pad_id = vocab["<pad>"]

    encoded = []
    for sentence in text_set:
        encoded.append(vocab(tokenizer(sentence)))

    padded = pad_sequences(
        encoded,
        maxlen=sequence_size,
        padding="post",
        truncating="post",
        dtype="int",
        value=pad_id,
    )
    return padded.tolist()

In [None]:
# Encode the three raw splits with the unsegmented vocabulary.
train_tokens, dev_tokens, test_tokens = (
    tokenize(split_texts, tokenizer, vocab, sequence_size)
    for split_texts in (train_texts, dev_texts, test_texts)
)

In [None]:
# Encode the three word-segmented splits with the segmented vocabulary.
train_tokens_segmented, dev_tokens_segmented, test_tokens_segmented = (
    tokenize(split_texts, tokenizer, vocab_segmented, sequence_size_segmented)
    for split_texts in (train_texts_segmented, dev_texts_segmented, test_texts_segmented)
)

## Masking for pretraining

In [None]:
# Register the <mask> token used for the pretraining inputs. append_token
# raises if the token is already present, so the membership check keeps this
# cell safe to re-run.
if "<mask>" not in vocab:
    vocab.append_token("<mask>")
vocab["<mask>"]

2277

In [None]:
# Register the <mask> token in the word-segmented vocabulary as well. The
# membership check keeps this cell safe to re-run.
if "<mask>" not in vocab_segmented:
    vocab_segmented.append_token("<mask>")
vocab_segmented["<mask>"]

3424

In [None]:
import numpy as np
import copy

In [None]:
def mask_data(data):
    # create random array of floats in equal dimension to input_ids
    rand = np.random.rand(len(data),len(data[0]))
    # where the random array is less than 0.15, we set true
    mask_arr = rand < 0.15

    masked_data = copy.deepcopy(data)

    for i in range(len(data)):
        for j in range(len(data[0])):
            if mask_arr[i,j] and data[i][j]: #only mask the token if the token is not 0
                masked_data[i][j] = vocab["<mask>"]

    return masked_data

In [None]:
masked_train = mask_data(train_tokens)
masked_dev = mask_data(dev_tokens)

In [None]:
masked_train_segmented = mask_data(train_tokens_segmented)
masked_dev_segmented = mask_data(dev_tokens_segmented)

## Save preprocessed data

In [None]:
import os
import json

In [None]:
# Pretraining payloads: masked inputs paired with the unmasked ids as labels.
pretrain = dict(masked=masked_train, labels=train_tokens)
pretrain_dev = dict(masked=masked_dev, labels=dev_tokens)

# Supervised payloads: token ids with their sentiment and topic labels.
train = dict(tokens=train_tokens, sentiments=train_sentiments, topics=train_topics)
dev = dict(tokens=dev_tokens, sentiments=dev_sentiments, topics=dev_topics)

test = dict(tokens=test_tokens, sentiments=test_sentiments, topics=test_topics)

In [None]:
# Pretraining payloads for the word-segmented corpus.
pretrain_segmented = dict(masked=masked_train_segmented, labels=train_tokens_segmented)
pretrain_dev_segmented = dict(masked=masked_dev_segmented, labels=dev_tokens_segmented)

# Supervised payloads for the word-segmented corpus.
train_segmented = dict(tokens=train_tokens_segmented, sentiments=train_sentiments_segmented, topics=train_topics_segmented)
dev_segmented = dict(tokens=dev_tokens_segmented, sentiments=dev_sentiments_segmented, topics=dev_topics_segmented)

test_segmented = dict(tokens=test_tokens_segmented, sentiments=test_sentiments_segmented, topics=test_topics_segmented)

In [None]:
# Output folder for every artefact written below (stray '//' removed from the path).
preprocessed_path = 'drive/MyDrive/DeepLearningProject_Group3/preprocessed/preprocessed_for_Transformer/'
# open(..., 'w') does not create missing folders, so make sure the target exists.
os.makedirs(preprocessed_path, exist_ok=True)

In [None]:
# Write each payload of the unsegmented corpus to its own JSON file.
for file_name, payload in (
    ('pretrain.json', pretrain),
    ('pretrain_dev.json', pretrain_dev),
    ('train.json', train),
    ('dev.json', dev),
    ('test.json', test),
):
    with open(preprocessed_path + file_name, 'w') as f:
        json.dump(payload, f)

In [None]:
# Write each payload of the word-segmented corpus to its own JSON file.
for file_name, payload in (
    ('word_segmented_pretrain.json', pretrain_segmented),
    ('word_segmented_pretrain_dev.json', pretrain_dev_segmented),
    ('word_segmented_train.json', train_segmented),
    ('word_segmented_dev.json', dev_segmented),
    ('word_segmented_test.json', test_segmented),
):
    with open(preprocessed_path + file_name, 'w') as f:
        json.dump(payload, f)

In [None]:
# Persist both vocabularies next to the encoded data.
for vocab_object, vocab_file in (
    (vocab, 'vocab.pth'),
    (vocab_segmented, 'word_segmented_vocab.pth'),
):
    torch.save(vocab_object, preprocessed_path + vocab_file)