In [None]:
# Mount Google Drive in the Colab runtime; the dataset, checkpoint and output
# paths used later in this notebook all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import pickle

# Preprocessing

In [None]:
train_path = 'drive//MyDrive/DeepLearningProject_Group3/_UIT-VSFC/train'
dev_path = 'drive/MyDrive/DeepLearningProject_Group3/_UIT-VSFC/dev'
test_path = 'drive/MyDrive/DeepLearningProject_Group3/_UIT-VSFC/test'


def _read_lines(directory, filename):
  """Return the lines of `directory/filename` with line endings stripped."""
  # Pin UTF-8 so decoding does not depend on the runtime's default locale
  # encoding.
  with open(os.path.join(directory, filename), encoding='utf-8') as f:
    return f.read().splitlines()


def _load_split(directory):
  """Load one dataset split from `directory`.

  Returns:
    A tuple `(sents, sentiments, topics)`: the raw lines of `sents.txt`, and
    the lines of `sentiments.txt` / `topics.txt` parsed as ints.
  """
  sents = _read_lines(directory, 'sents.txt')
  sentiments = list(map(int, _read_lines(directory, 'sentiments.txt')))
  topics = list(map(int, _read_lines(directory, 'topics.txt')))
  return sents, sentiments, topics


train_raw, train_sentiments, train_topics = _load_split(train_path)
dev_raw, dev_sentiments, dev_topics = _load_split(dev_path)
test_raw, test_sentiments, test_topics = _load_split(test_path)

In [None]:
# Load the splits from the CSV files in `segmented_data`. This rebinds the
# `*_path`, `*_raw`, `*_sentiments` and `*_topics` names, so any values they
# held before this cell are replaced.
if True:
  train_path = 'drive//MyDrive/DeepLearningProject_Group3/segmented_data'
  dev_path = 'drive/MyDrive/DeepLearningProject_Group3/segmented_data'
  test_path = 'drive/MyDrive/DeepLearningProject_Group3/segmented_data'

  # CSV columns, in the order they are unpacked below:
  # sentence text, sentiment label, topic label.
  segmented_columns = ('sents', 'labels', 'topics')

  train = pd.read_csv(os.path.join(train_path, 'train_segmented.csv'))
  train_raw, train_sentiments, train_topics = [
      train[column].tolist() for column in segmented_columns]

  dev = pd.read_csv(os.path.join(dev_path, 'dev_segmented.csv'))
  dev_raw, dev_sentiments, dev_topics = [
      dev[column].tolist() for column in segmented_columns]

  test = pd.read_csv(os.path.join(test_path, 'test_segmented.csv'))
  test_raw, test_sentiments, test_topics = [
      test[column].tolist() for column in segmented_columns]

In [None]:
# Build the token -> index vocabulary from the training sentences.
# Index 0 is reserved for '<pad>' and index 1 for '<unknown>'.
# NOTE(review): `count` is not advanced after '<unknown>' is registered, so the
# first token seen in `train_raw` also receives index 1. This is kept as-is on
# purpose: the checkpoint loaded later in this notebook was presumably trained
# with exactly this mapping, and re-indexing would silently break it — confirm
# against the training notebook before changing.
vocabulary = {'<pad>': 0, '<unknown>': 1}
count = 1
for sentence in train_raw:
  for word in sentence.split():
    # Any token containing the 'wzjwz' marker is collapsed into one '<name>'
    # entry.
    if 'wzjwz' in word:
      word = '<name>'
    if word in vocabulary:
      continue
    vocabulary[word] = count
    count += 1

In [None]:
def tokenizer_encode(raw_data, vocabulary):
  """Convert whitespace-tokenised sentences into lists of vocabulary indices.

  Args:
    raw_data: iterable of sentence strings; each is split on whitespace.
    vocabulary: dict mapping token -> integer index. It must contain
      '<unknown>' whenever an out-of-vocabulary token is encountered.

  Returns:
    A list with one list of indices per sentence, in input order.

  Lookup order for each token:
    1. the token itself, if it is in `vocabulary`;
    2. '<name>' if the token contains the 'wzjwz' marker and the vocabulary
       has a '<name>' entry;
    3. '<unknown>' otherwise.
  """
  encoding = []
  for sentence in raw_data:
    sentence_te = []
    for word in sentence.split():
      if word in vocabulary:
        index = vocabulary[word]
      elif 'wzjwz' in word and '<name>' in vocabulary:
        index = vocabulary['<name>']
      else:
        # Also reached for 'wzjwz' tokens when '<name>' was never registered
        # (no such token in the data the vocabulary was built from); falling
        # back here avoids an uncaught KeyError.
        index = vocabulary['<unknown>']
      sentence_te.append(index)
    encoding.append(sentence_te)
  return encoding

In [None]:
# Encode the dev and test sentences with the vocabulary built from the
# training sentences.
dev_encoded, test_encoded = [
    tokenizer_encode(split_raw, vocabulary) for split_raw in (dev_raw, test_raw)
]

In [None]:
def tokenizer_encoding_pad(data, vocabulary, seq_length):
  """Force every encoded sentence in `data` to exactly `seq_length` tokens.

  Short sentences are right-padded with the '<pad>' index; long sentences are
  truncated from the end. The inner lists are modified IN PLACE and the same
  `data` object is returned.

  Args:
    data: list of lists of token indices.
    vocabulary: dict providing the '<pad>' index (only read when padding is
      actually needed).
    seq_length: target length of every sentence.

  Returns:
    `data` itself, after in-place padding/truncation.
  """
  for tokens in data:
    shortfall = seq_length - len(tokens)
    if shortfall > 0:
      tokens.extend([vocabulary['<pad>']] * shortfall)
    else:
      # Drop trailing tokens one at a time until the sentence fits.
      while len(tokens) > seq_length:
        tokens.pop()
  return data

In [None]:
# Pad/truncate both splits to a fixed length of 14 tokens.
# NOTE(review): 14 presumably matches the sequence length used in training —
# confirm against the training notebook.
dev_encoded, test_encoded = [
    tokenizer_encoding_pad(encoded, vocabulary, seq_length=14)
    for encoded in (dev_encoded, test_encoded)
]

In [None]:
class SentenceDataset(Dataset):
  """Dataset of (sentence, sentiment, topic) triples held as tensors.

  `sents` is stored as a float tensor; when it is 2-D, a trailing axis of
  size 1 is appended so each token becomes a length-1 feature vector.
  `sentiments` and `topics` are stored as long tensors.
  """

  def __init__(self, sents, sentiments, topics):
    super().__init__()
    features = torch.FloatTensor(sents)
    if features.dim() == 2:
      # (n_sentences, seq_len) -> (n_sentences, seq_len, 1)
      features = features.unsqueeze(-1)
    self.sents = features
    self.sentiments = torch.tensor(sentiments, dtype=torch.long)
    self.topics = torch.tensor(topics, dtype=torch.long)

  def __len__(self):
    """Number of samples, taken from the sentiment labels."""
    return len(self.sentiments)

  def __getitem__(self, idx):
    """Return the `(sentence, sentiment, topic)` tensors at `idx`."""
    sample = (self.sents[idx], self.sentiments[idx], self.topics[idx])
    return sample

In [None]:
# Wrap the padded dev/test encodings and their labels as tensor datasets.
rnn_dev_dataset = SentenceDataset(dev_encoded, dev_sentiments, dev_topics)
rnn_test_dataset = SentenceDataset(test_encoded, test_sentiments, test_topics)

In [None]:
# Batch both evaluation datasets in their stored order (no shuffling is
# requested), 128 samples per batch.
rnn_dev_loader, rnn_test_loader = [
    DataLoader(dataset, batch_size=128)
    for dataset in (rnn_dev_dataset, rnn_test_dataset)
]

# Model

In [None]:
class DeepRNN(nn.Module):
  """Three stacked tanh recurrent layers with a log-softmax classifier.

  The sequence is consumed one time step at a time along axis 0 of the input.
  Each layer keeps its own hidden state; layer k receives the hidden state of
  layer k-1 at the same time step. Only the last time step's top-layer hidden
  state is projected to class scores.

  Args:
    input_size: feature size of each time step (must be 1 when `embedding`
      is truthy, i.e. each time step is a single token index).
    hidden_size: width of every hidden layer.
    output_size: number of output classes.
    embedding: if truthy, token indices are mapped through an `nn.Embedding`.
    vocab_size: number of embedding rows (only used when `embedding`).
    embedding_dim: embedding width (only used when `embedding`).
  """

  def __init__(self, input_size, hidden_size, output_size,
               embedding, vocab_size, embedding_dim):
    super(DeepRNN, self).__init__()
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.output_size = output_size
    self.embedding = embedding
    self.embedding_dim = embedding_dim
    if self.embedding:
      assert self.input_size == 1, 'Input must be a list of indices'
      assert isinstance(vocab_size, int), 'vocab_size must be int'
      assert isinstance(embedding_dim, int), 'embedding_dim must be int'
      # Created on self.device like every other layer, so the model can also
      # be constructed on a CPU-only runtime.
      self.embedding_layer = nn.Embedding(vocab_size, embedding_dim,
                                          max_norm=1, device=self.device)
      # After the embedding lookup each time step is an embedding vector.
      self.input_size = embedding_dim
    self.input_to_hidden = nn.Linear(self.input_size,
                                     self.hidden_size,
                                     device=self.device)
    self.h2h_same_layer_1 = nn.Linear(self.hidden_size,
                                      self.hidden_size,
                                      device=self.device)
    self.h2h_between_layers_12 = nn.Linear(self.hidden_size,
                                           self.hidden_size,
                                           device=self.device)
    self.h2h_same_layer_2 = nn.Linear(self.hidden_size,
                                      self.hidden_size,
                                      device=self.device)
    self.h2h_between_layers_23 = nn.Linear(self.hidden_size,
                                           self.hidden_size,
                                           device=self.device)
    self.h2h_same_layer_3 = nn.Linear(self.hidden_size,
                                      self.hidden_size,
                                      device=self.device)
    self.hidden_to_output = nn.Linear(self.hidden_size,
                                      self.output_size,
                                      device=self.device)
    self.tanh = nn.Tanh()
    self.softmax = nn.LogSoftmax(dim=1)

  def forward(self, input_tensor):
    """Return log-probabilities of shape (batch, output_size).

    Args:
      input_tensor: sequence-first tensor. With embedding enabled it holds
        token indices, typically shaped (seq_len, batch, 1); otherwise it
        holds features shaped (seq_len, batch, input_size).

    Raises:
      ValueError: if the sequence axis is empty.
    """
    if self.embedding:
      indices = input_tensor.long()
      # Drop only the trailing singleton index axis. A bare squeeze() would
      # also remove a sequence axis of length 1, making the loop below walk
      # over the batch instead of over time.
      if indices.dim() == 3 and indices.shape[-1] == 1:
        indices = indices.squeeze(-1)
      input_tensor = self.embedding_layer(indices)
    if len(input_tensor) == 0:
      raise ValueError('input_tensor must contain at least one time step')
    # Each layer has a hidden state; the leading 1 broadcasts over the batch.
    hidden_1 = torch.zeros((1, self.hidden_size), device=self.device)
    hidden_2 = torch.zeros((1, self.hidden_size), device=self.device)
    hidden_3 = torch.zeros((1, self.hidden_size), device=self.device)
    for word in input_tensor:
      hidden_1 = self.input_to_hidden(word) + self.h2h_same_layer_1(hidden_1)
      hidden_1 = self.tanh(hidden_1)
      hidden_2 = self.h2h_between_layers_12(hidden_1) + self.h2h_same_layer_2(hidden_2)
      hidden_2 = self.tanh(hidden_2)
      hidden_3 = self.h2h_between_layers_23(hidden_2) + self.h2h_same_layer_3(hidden_3)
      hidden_3 = self.tanh(hidden_3)
    # Only the output of the last RNN cell is taken into account, so the
    # projection is computed once after the loop.
    output = self.hidden_to_output(hidden_3)
    # The log-softmax function combined with negative log likelihood loss
    # gives the same effect as cross entropy loss taken straight from the output
    return self.softmax(output)

  def predict(self, input_tensor):
    """Return the predicted class index for every sample in the batch.

    For a single sample the result is a 0-d tensor; for a batch it is a 1-D
    tensor with one class index per sample.
    """
    with torch.no_grad():
      # argmax per row; without `dim` the scores of the whole batch would be
      # flattened into a single index.
      prediction = torch.argmax(self.forward(input_tensor), dim=1)
    if prediction.numel() == 1:
      prediction = prediction.squeeze(0)
    return prediction

In [None]:
rnn_path = '/content/drive/MyDrive/DeepLearningProject_Group3/checkpoint/RNN/DeepRNN_word_segmented.pth'
# SECURITY: torch.load unpickles the file, which can execute arbitrary code.
# Only load checkpoints from a trusted source.
# NOTE(review): recent PyTorch releases default to weights_only=True, which
# rejects fully pickled modules; if loading fails there, pass
# weights_only=False — confirm against the runtime's torch version.
rnn_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime.
model = torch.load(rnn_path, map_location=rnn_device)
# The model's forward pass allocates its hidden states on `model.device`, so
# keep that attribute in sync with where the weights were actually loaded.
model.device = rnn_device
# Inference only: switch the module to evaluation mode.
model.eval()

In [None]:
def predict(model, data_loader):
  """Run `model` over every batch of `data_loader` and stack the outputs.

  Args:
    model: a torch module called as `model(data)`, where `data` is the batch
      with its first two axes swapped (batch-first -> sequence-first).
    data_loader: iterable yielding `(data, sentiments, topics)` batches; only
      `data` is used.

  Returns:
    A numpy array with the model outputs of all batches concatenated along
    axis 0, or None when `data_loader` yields no batches.
  """
  # Run on whatever device the model's weights live on instead of assuming
  # CUDA; a parameter-free model falls back to the CPU.
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else torch.device('cpu')

  batch_outputs = []
  with torch.no_grad():
    for data, _, _ in data_loader:
      data = data.transpose(0, 1).to(device)
      batch_outputs.append(model(data).cpu().numpy())
  if not batch_outputs:
    return None
  # Concatenate once at the end rather than re-copying the growing array on
  # every batch.
  return np.concatenate(batch_outputs)

In [None]:
# The model emits log-probabilities; exponentiate to get class probabilities
# for the dev split, then show them and the number of rows.
prediction_dev = np.exp(predict(model, rnn_dev_loader))
print(prediction_dev)
print(len(prediction_dev))

[[9.7997105e-01 1.0502942e-02 9.5259724e-03]
 [9.6513635e-01 1.5109286e-02 1.9754410e-02]
 [4.3614348e-04 1.0246724e-03 9.9853927e-01]
 ...
 [9.7035313e-01 7.8405086e-03 2.1806464e-02]
 [9.3891108e-01 1.7092953e-02 4.3995999e-02]
 [1.8006451e-01 2.2772828e-01 5.9220713e-01]]
1583


In [None]:
# The model emits log-probabilities; exponentiate to get class probabilities
# for the test split, then show them and the number of rows.
prediction_test = np.exp(predict(model, rnn_test_loader))
print(prediction_test)
print(len(prediction_test))

[[0.03012835 0.0726084  0.8972632 ]
 [0.00106289 0.00374366 0.9951935 ]
 [0.00321643 0.00705959 0.9897239 ]
 ...
 [0.00139087 0.00499388 0.99361527]
 [0.18806262 0.24549697 0.5664404 ]
 [0.03720106 0.02326983 0.9395291 ]]
3166


In [None]:
def save_output(output, file_path):
  """Pickle `output` to `file_path`, creating the parent directory if needed.

  Args:
    output: any picklable object.
    file_path: destination path; an existing file is overwritten.
  """
  parent_dir = os.path.dirname(file_path)
  # A bare filename has no parent component; only create real directories.
  if parent_dir:
    os.makedirs(parent_dir, exist_ok=True)
  with open(file_path, "wb") as file:
    pickle.dump(output, file)

In [None]:
# Persist the class probabilities of both splits as pickle files.
# Note: `test_path` and `dev_path` are rebound here from data directories to
# output file paths.
if True:
  test_path = "drive/MyDrive/DeepLearningProject_Group3/output_for_ensemble/rnn_output_test_word_segmented.pkl"
  dev_path = "drive/MyDrive/DeepLearningProject_Group3/output_for_ensemble/rnn_output_dev_word_segmented.pkl"
  save_output(prediction_test, test_path)
  save_output(prediction_dev, dev_path)