In [1]:
# Mount Google Drive at /content/gdrive (Colab only). Every dataset, embedding
# and model path used later in this notebook lives under /content/gdrive/MyDrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import os
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is presumably set as a debugging aid so
# CUDA errors surface at the call that caused them; it is set here, before torch
# is imported in the next cell. Confirm it is still wanted, since it slows GPU runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch import optim
import random
from copy import deepcopy
import time
import pprint
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
# Report whether a CUDA device is visible; RNNClassifier picks 'cuda' when this
# is True and falls back to 'cpu' otherwise.
print(torch.cuda.is_available())

False


In [5]:
# TOPIC selects the outlier-detection dataset folder/file and is embedded in the
# saved model's filename.
TOPIC = 'polisci'
# DIMENSIONS selects which GloVe file is loaded (glove.6B.<DIMENSIONS>d.txt) and
# is also embedded in the saved model's filename.
DIMENSIONS = 200

In [6]:
# Read the outlier-detection dataset for TOPIC into a dataframe, using the first
# column of the file as the index.
# NOTE(review): the file has a .tsv extension but is parsed with sep=' ' --
# confirm the separator matches how the file was written.
df = pd.read_csv(f'/content/gdrive/MyDrive/data/eval_data/outlier_detection/{TOPIC}/{TOPIC}.tsv', sep=' ', index_col=0)

In [7]:
# Preview the first ten rows: four sentence columns (s0-s3) plus an `outlier`
# column (presumably the 0-based position of the sentence that does not belong).
df.head(n=10)

Unnamed: 0,s0,s1,s2,s3,outlier
0,the overall aim of this project was threefold\n,by organizing workshops at the science policy ...,raise awareness for climate change in turkish ...,a survey research design was used\n,3
1,older australians lack guidance from and recou...,consequently they lack identity as rights hold...,thank you for downloading social forces in the...,accordingly as individuals they are vulnerable...,2
2,the government of india has presented an expan...,the key to its success say dozens of people wh...,the awc has treatment facilities in the surges...,taking this vision forward the rashtrapati bha...,1
3,it is made the quantitative assessment of the ...,its visibility grew significantly arousing spo...,from a sample of mails sent to the judges of t...,the constitutional council conquered during th...,0
4,report from the norwegian scientific committee...,microplastics occurrence levels and implicatio...,opinion of the steering committee of the norwe...,a political approach the organization for secu...,3
5,in july 2016 the law on temporary limitations ...,the temporary law implied a sharp turn in swed...,that such a large segment of ireland s populat...,the right to family reunification which previo...,2
6,elite analysis has presented a major challenge...,it was a period of enlightenment marked by int...,it is natural that one of the many subjects th...,women s movement in india emerged as a part of...,0
7,since the 1980s globalization has led to a new...,studying american politics in america s century\n,today globalization is accepted as an umbrella...,it has exhibited a monolithic order on the one...,1
8,trends in female migration in latin america ar...,consideration is given to the causes of this m...,data are from a survey of migrant origin carri...,in this age of electronic information we live ...,3
9,a movement has ignited amongst government agen...,these projects face challenges related to scal...,but new sociotechnical approaches that leverag...,we then show how the failure of this initial r...,3


In [8]:
# Fractions of the full dataset assigned to each split.
# NOTE(review): these sum to 0.88, so the last ~12% of the shuffled rows is not
# used by any split -- presumably deliberate; confirm.
train_percent = 0.8
validation_percent = 0.04
test_percent = 0.04

# Fixed seed so that Restart & Run All always produces the same split. Without
# it, the held-out rows change on every kernel run and a previously saved model
# can no longer be evaluated on data it has not seen.
SPLIT_SEED = 3

# Shuffle once, then cut the shuffled frame into contiguous, non-overlapping
# positional slices. Re-running only this cell reshuffles the already-shuffled
# frame, so re-run from the data-loading cell to get the canonical split.
df = df.sample(frac=1, random_state=SPLIT_SEED)
a = int(len(df)*train_percent)            # end of the training slice
b = int(a + len(df)*validation_percent)   # end of the validation slice
c = int(b + len(df)*test_percent)         # end of the test slice

train_data = df.iloc[0:a]
validation_data = df.iloc[a:b]
test_data = df.iloc[b:c]

print(train_data.head())
print(validation_data.head())
print(test_data.head())

                                                      s0  \
36444  this is our introduction to the forthcoming bo...   
42291  the mission of wageningen ur university resear...   
35545  in this research a set of questionnaires consi...   
24531  strategic communication disciplines routinely ...   
52282  the development of agricultural engineering di...   

                                                      s1  \
36444  in the introduction we explain the aim and met...   
42291  within wageningen ur nine specialised research...   
35545  it highlights the place of the hungarian refor...   
24531  i argue here that a better understanding of cl...   
52282  it concerns the complexity of migration moveme...   

                                                      s2  \
36444  the researcher finds out the awareness level o...   
42291  because of the importance of international stu...   
35545  calvinism attracted strong support in hungary ...   
24531  as u s involvement in the wars 

In [9]:
class SentenceExample:
    """
    Data wrapper for a single outlier-detection example: four sentences and the
    label saying which one is the outlier.

    Attributes:
        s1, s2, s3, s4: the four sentences, stored exactly as passed in
            (the notebook passes lists of word tokens)
        outlier: the label stored exactly as passed in (the notebook passes the
            value of the dataset's `outlier` column)
    """

    def __init__(self, s1, s2, s3, s4, outlier):
        self.s1, self.s2, self.s3, self.s4 = s1, s2, s3, s4
        self.outlier = outlier

    def __repr__(self):
        # Only the label is shown; the sentences are omitted to keep it short.
        return 'outlier = {}'.format(self.outlier)

    def __str__(self):
        return repr(self)

In [10]:
class Indexer(object):
    """
    Two-way lookup table between hashable objects and consecutive integers
    starting at 0. Handy for turning words, labels or features into positions
    of a vector or rows of a matrix.

    Attributes:
        objs_to_ints: dict mapping each stored object to its integer index
        ints_to_objs: dict mapping each integer index back to its object
    """
    def __init__(self):
        self.objs_to_ints = {}
        self.ints_to_objs = {}

    def __repr__(self):
        # String form of the list of stored objects, in index order.
        return str([str(self.get_object(position)) for position in range(len(self))])

    def __str__(self):
        return repr(self)

    def __len__(self):
        return len(self.objs_to_ints)

    def get_object(self, index):
        """
        :param index: integer index to look up
        :return: the object stored at that index, or None if the index is unused
        """
        return self.ints_to_objs.get(index)

    def contains(self, object):
        """
        :param object: object to look up
        :return: True if the object has an index, False otherwise
        """
        return self.index_of(object) != -1

    def index_of(self, object):
        """
        :param object: object to look up
        :return: the object's index, or -1 if it has never been added
        """
        return self.objs_to_ints.get(object, -1)

    def add_and_get_index(self, object, add=True):
        """
        Looks up the object's index, first registering the object under the next
        free index when it is unknown and `add` is truthy.
        :param object: object to look up or add
        :param add: True by default; when falsy nothing is added and the call is
            equivalent to index_of (so it may return -1)
        :return: the index of the object
        """
        if add and object not in self.objs_to_ints:
            fresh_idx = len(self.objs_to_ints)
            self.objs_to_ints[object] = fresh_idx
            self.ints_to_objs[fresh_idx] = object
        return self.index_of(object)

In [11]:
class WordEmbeddings:
    """
    Pairs a word indexer with a sequence of embedding vectors, where position i
    of `vectors` holds the vector of the word the indexer maps to i. Words the
    indexer does not know are answered with the vector stored for "UNK".
    """
    def __init__(self, word_indexer, vectors):
        self.word_indexer = word_indexer
        self.vectors = vectors

    def get_embedding_length(self):
        """Returns the dimensionality of the embeddings (length of the first vector)."""
        first_vector = self.vectors[0]
        return len(first_vector)

    def get_embedding(self, word):
        """
        Returns the embedding for a given word
        :param word: The word to look up
        :return: The word's vector, or the "UNK" vector when the word is not in the indexer
        """
        lookup = self.word_indexer.index_of
        position = lookup(word)
        if position == -1:
            # Unknown word: fall back to whatever index "UNK" is registered under.
            position = lookup("UNK")
        return self.vectors[position]

In [12]:
# When True, read_word_embeddings skips the first line of the embeddings file
# (presumably a header line in the custom embedding files -- confirm).
CUSTOM_EMBEDDINGS = False

In [13]:
def read_word_embeddings(embeddings_file: str) -> WordEmbeddings:
    """
    Loads text-formatted embeddings (one "<word> <float> <float> ..." entry per line) into a WordEmbeddings
    object. Index 0 is reserved for a PAD token and index 1 for an UNK token; both get the 0 vector. Reads in all
    embeddings with no filtering -- you should only use this for relativized word embedding files.

    When the module-level CUSTOM_EMBEDDINGS flag is True the first line of the file is skipped.

    A word that is already indexed (a repeated entry, or a literal "PAD"/"UNK" token in the file) is skipped, so
    that row i of the returned matrix always belongs to the word with index i.

    :param embeddings_file: path to the file containing embeddings (read as UTF-8)
    :return: WordEmbeddings object reflecting the words and their embeddings
    :raises IndexError: if the file contains no embedding lines
    """
    word_indexer = Indexer()
    vectors = []
    # Make position 0 the PAD token, so that padded positions can be recognised by index
    word_indexer.add_and_get_index("PAD")
    # Make position 1 the UNK token
    word_indexer.add_and_get_index("UNK")
    # The context manager closes the file even if a malformed line raises while parsing
    with open(embeddings_file, encoding="utf-8") as f:
        for i, line in enumerate(f):
            if CUSTOM_EMBEDDINGS and i == 0:
                continue

            if line.strip() != "":
                space_idx = line.find(' ')
                word = line[:space_idx]
                numbers = line[space_idx+1:]
                vector = np.array([float(number_str) for number_str in numbers.split()])
                # Append the PAD and UNK vectors to start. Have to do this weirdly because we need to read the
                # first line of the file to see what the embedding dim is
                if len(vectors) == 0:
                    vectors.append(np.zeros(vector.shape[0]))
                    vectors.append(np.zeros(vector.shape[0]))
                # Only store a vector when the word receives a new index; appending a vector for an already
                # indexed word would shift the vectors of every word read after it.
                if not word_indexer.contains(word):
                    word_indexer.add_and_get_index(word)
                    vectors.append(vector)
            if i % 50000 == 0:
                print(f'done reading {i} embeddings')
    print("Read in " + repr(len(word_indexer)) + " vectors of size " + repr(vectors[0].shape[0]))
    # Turn vectors into a 2-D numpy array
    return WordEmbeddings(word_indexer, np.array(vectors))

In [None]:
# Load the baseline GloVe vectors of the configured dimensionality, then print
# the vocabulary size (which includes the PAD and UNK entries).
embeddings = read_word_embeddings(f"/content/gdrive/MyDrive/embeddings/glove/baseline/glove.6B.{DIMENSIONS}d.txt")
# Alternative source: topic-specific word2vec embeddings.
#embeddings = read_word_embeddings(f"/content/gdrive/MyDrive/embeddings/word2vec/baseline/word2vec_embeddings_{TOPIC}.txt")
print(len(embeddings.word_indexer))

done reading 0 embeddings
done reading 50000 embeddings
done reading 100000 embeddings


In [None]:
# Sanity check: print the index of one word; -1 would mean the word is missing
# from the loaded vocabulary.
print(embeddings.word_indexer.index_of('super'))

In [None]:
# Tokenise the four sentence columns of every training row and wrap them,
# together with the row's outlier label, in a SentenceExample.
train_exs = []
for _, row in train_data.iterrows():
  tokenized = []
  for column in ('s0', 's1', 's2', 's3'):
    tokens = word_tokenize(row[column])
    tokens[-1] = tokens[-1].strip()  # trim stray whitespace on the final token
    tokenized.append(tokens)
  train_exs.append(SentenceExample(*tokenized, row['outlier']))

In [None]:
# Number of training examples built from train_data.
print(len(train_exs))

In [None]:
# Tokenise the four sentence columns of every validation row and wrap them,
# together with the row's outlier label, in a SentenceExample.
validation_exs = []
for _, row in validation_data.iterrows():
  tokenized = []
  for column in ('s0', 's1', 's2', 's3'):
    tokens = word_tokenize(row[column])
    tokens[-1] = tokens[-1].strip()  # trim stray whitespace on the final token
    tokenized.append(tokens)
  validation_exs.append(SentenceExample(*tokenized, row['outlier']))

In [None]:
# Tokenise the four sentence columns of every test row and wrap them,
# together with the row's outlier label, in a SentenceExample.
test_exs = []
for _, row in test_data.iterrows():
  tokenized = []
  for column in ('s0', 's1', 's2', 's3'):
    tokens = word_tokenize(row[column])
    tokens[-1] = tokens[-1].strip()  # trim stray whitespace on the final token
    tokenized.append(tokens)
  test_exs.append(SentenceExample(*tokenized, row['outlier']))

In [None]:
class RNN(nn.Module):
  """
  Four-sentence classifier: each sentence is embedded with a shared, frozen
  embedding table, encoded by its own 2-layer unidirectional GRU, and the four
  final hidden outputs are concatenated and passed through a two-layer MLP.

  Submodule names (e1, gru_s1..gru_s4, drop, l1, r1, l2) are part of the
  state_dict layout and must not be renamed, or saved weights stop loading.

  Args:
    embedding_vectors: 2-D array-like of shape (vocab_size, embedding_dim);
      row 0 is treated as the padding row (padding_idx=0)
    hidden_dim: hidden size of every GRU and of the first linear layer
    output_dim: number of output logits
  """
  def __init__(self,
               embedding_vectors,
               hidden_dim,
               output_dim):
    super().__init__()

    self.embedding_vectors = embedding_vectors
    self.hidden_dim = hidden_dim
    self.output_dim = output_dim

    # freeze=True keeps the pretrained vectors fixed during training
    # (equivalent to the former explicit requires_grad_(False) call).
    weights = torch.as_tensor(np.asarray(self.embedding_vectors), dtype=torch.float32)
    self.e1 = nn.Embedding.from_pretrained(weights, freeze=True, padding_idx=0)

    # One independent encoder per sentence slot, created in a fixed order so
    # that seeded initialisation stays reproducible.
    embedding_dim = len(self.embedding_vectors[0])
    self.gru_s1 = self._make_gru(embedding_dim)
    self.gru_s2 = self._make_gru(embedding_dim)
    self.gru_s3 = self._make_gru(embedding_dim)
    self.gru_s4 = self._make_gru(embedding_dim)

    self.drop = nn.Dropout(p=0.3)

    self.l1 = nn.Linear(4*self.hidden_dim, self.hidden_dim)
    self.r1 = nn.ReLU()
    self.l2 = nn.Linear(self.hidden_dim, self.output_dim)

    nn.init.xavier_uniform_(self.l1.weight)
    nn.init.xavier_uniform_(self.l2.weight)

  def _make_gru(self, input_size):
    """Builds one 2-layer, batch-first, unidirectional GRU sentence encoder."""
    return nn.GRU(input_size=input_size,
                  hidden_size=self.hidden_dim,
                  num_layers=2,
                  batch_first=True,
                  bidirectional=False)

  def _encode(self, gru, token_ids):
    """
    Encodes one batch of padded sentences with the given GRU.

    :param gru: the GRU submodule to run
    :param token_ids: integer tensor of shape (batch, seq_len); index 0 marks padding
    :return: tensor of shape (batch, hidden_dim): the top-layer GRU output at
      each sentence's last non-padding step, after dropout
    """
    # Sentence length = number of non-padding tokens. pack_padded_sequence
    # needs the lengths on the CPU, and every length must be at least 1.
    lengths = (token_ids != 0).sum(dim=1).cpu()

    embedded = self.e1(token_ids)
    packed_input = pack_padded_sequence(embedded, lengths, batch_first=True, enforce_sorted=False)
    packed_output, _ = gru(packed_input)
    output, _ = pad_packed_sequence(packed_output, batch_first=True)

    # Pick, for every row, the output at its own last real time step.
    last_step = output[range(len(output)), lengths - 1, :self.hidden_dim]
    return self.drop(last_step)

  def forward(self, s1, s2, s3, s4):
    """
    :param s1, s2, s3, s4: integer tensors of shape (batch, seq_len_k) holding
      word indices for the four sentences; each may have its own seq_len
    :return: logits of shape (batch, output_dim)
    """
    text_fea_s1 = self._encode(self.gru_s1, s1)
    text_fea_s2 = self._encode(self.gru_s2, s2)
    text_fea_s3 = self._encode(self.gru_s3, s3)
    text_fea_s4 = self._encode(self.gru_s4, s4)

    input_fea = torch.cat((text_fea_s1, text_fea_s2, text_fea_s3, text_fea_s4), dim=1)

    x = self.l1(input_fea)
    x = self.r1(x)
    x = self.l2(x)

    return x

In [None]:
class RNNClassifier():
    """
    Wraps an RNN network together with everything needed to train it and to run
    it on new four-sentence examples (word embeddings / indexer, device,
    hyper-parameters).

    Attributes:
        word_embeddings: object exposing `.word_indexer.index_of(word)` and `.vectors`
        batch_size, num_epochs, lr, output_size: training hyper-parameters
        device: torch device the network lives on (CUDA if available, else CPU)
        network: the wrapped RNN module
    """

    # Indices reserved at the front of the vocabulary: 0 marks padding (the
    # network masks index 0 when computing sentence lengths) and 1 stands in
    # for out-of-vocabulary words.
    PAD_IDX = 0
    UNK_IDX = 1

    def __init__(self,
                 word_embeddings,
                 batch_size=256,
                 hidden_size=128,
                 output_size=5,
                 lr=0.001,
                 num_epochs=10,
                 seed=3):
        """
        :param word_embeddings: loaded embeddings; `.vectors` initialises the network's embedding table
        :param batch_size: number of examples per training minibatch
        :param hidden_size: hidden size passed to the network
        :param output_size: number of output classes
        :param lr: Adam learning rate
        :param num_epochs: number of passes over the training data
        :param seed: accepted for interface compatibility; seeding is currently disabled (see below)
        """

        # indexer between words and indexes => self.word_embeddings.word_indexer
        # 2D array of weights => self.word_embeddings.vectors

        self.word_embeddings = word_embeddings
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.lr = lr
        self.output_size = output_size

        # Seeding is intentionally left disabled, so `seed` has no effect and
        # runs are not reproducible. Uncomment to make them deterministic.
        # random.seed(seed)
        # torch.manual_seed(seed)

        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

        self.network = RNN(embedding_vectors=self.word_embeddings.vectors,
                           hidden_dim=hidden_size,
                           output_dim=output_size).to(self.device)

        self.print_network()

    def convert_to_idx_tensor(self, sentences):
        """
        Converts tokenised sentences into one integer tensor of word indices.

        Words missing from the indexer map to UNK_IDX. Sentences of different
        lengths are right-padded with PAD_IDX up to the longest one, so callers
        do not have to pad the token lists themselves.

        :param sentences: list of sentences, each a list of word strings
        :return: LongTensor of shape (len(sentences), longest sentence length)
        """
        index_of = self.word_embeddings.word_indexer.index_of
        encoded = []
        for sentence in sentences:
            word_ids = [index_of(word) for word in sentence]
            encoded.append([idx if idx != -1 else self.UNK_IDX for idx in word_ids])

        width = max((len(word_ids) for word_ids in encoded), default=0)
        padded = [word_ids + [self.PAD_IDX] * (width - len(word_ids)) for word_ids in encoded]
        # An explicit dtype keeps the result a LongTensor even for empty input.
        return torch.tensor(padded, dtype=torch.long)

    def pad_sentences(self, batch):
        """
        Pads, IN PLACE, the s1..s4 token lists of every example in the batch
        with the string 'PAD' so that, per sentence slot, all examples have the
        same length.

        Kept for backward compatibility; train() no longer calls it because
        convert_to_idx_tensor pads at the index level without mutating the
        examples.

        :param batch: list of examples exposing list attributes s1, s2, s3, s4
        :return: the same `batch` list (its examples have been modified)
        """
        for slot in ('s1', 's2', 's3', 's4'):
            max_len = 0
            for example in batch:
                max_len = max(max_len, len(getattr(example, slot)))
            for example in batch:
                sentence = getattr(example, slot)
                sentence += ['PAD'] * (max_len - len(sentence))

        return batch

    def train(self, train_exs, dev_exs):
        """
        Trains the network with Adam and cross-entropy loss, evaluates on the
        development set after every epoch, and finally restores the weights of
        the epoch with the best development accuracy.

        Neither `train_exs` nor its examples are modified.

        :param train_exs: list of training examples (attributes s1..s4, outlier)
        :param dev_exs: list of development examples used for model selection
        """
        loss_function = nn.CrossEntropyLoss()
        optimizer = optim.Adam(self.network.parameters(), lr=self.lr, weight_decay=0.0001)

        max_dev_acc = 0
        best_weights = None

        for epoch in range(self.num_epochs):

            self.network.train()

            # Shuffle a copy so the caller's list keeps its order.
            epoch_exs = list(train_exs)
            random.shuffle(epoch_exs)

            total_loss = 0.0
            for i, start in enumerate(range(0, len(epoch_exs), self.batch_size)):
                exs = epoch_exs[start:start + self.batch_size]

                # convert word sentences to padded index tensors and labels to a tensor
                train_s1 = self.convert_to_idx_tensor([ex.s1 for ex in exs]).to(self.device)
                train_s2 = self.convert_to_idx_tensor([ex.s2 for ex in exs]).to(self.device)
                train_s3 = self.convert_to_idx_tensor([ex.s3 for ex in exs]).to(self.device)
                train_s4 = self.convert_to_idx_tensor([ex.s4 for ex in exs]).to(self.device)
                y = torch.tensor([ex.outlier for ex in exs], dtype=torch.long).to(self.device)

                output = self.network(train_s1, train_s2, train_s3, train_s4)

                loss = loss_function(output, y)
                # .item() detaches the value; summing the loss tensors
                # themselves would keep autograd history alive for the epoch.
                total_loss += loss.item()

                # update model weights
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                if i % 50 == 0:
                    print(f'batch {i} complete')

            # calculate validation set accuracy (predict() runs in eval mode)
            golds = [ex.outlier for ex in dev_exs]
            predictions = self.predict_all([ex.s1 for ex in dev_exs],
                                           [ex.s2 for ex in dev_exs],
                                           [ex.s3 for ex in dev_exs],
                                           [ex.s4 for ex in dev_exs])
            num_correct = sum(1 for gold, pred in zip(golds, predictions) if int(gold) == int(pred))

            # Guard against an empty development set.
            dev_acc = num_correct / len(golds) if golds else 0.0

            # Always keep the first epoch's weights so there is something to restore.
            if dev_acc > max_dev_acc or epoch == 0:
                max_dev_acc = dev_acc
                best_weights = deepcopy(self.network.state_dict())
                print('saving model weights...')

            print(f'Epoch Number = {epoch}, total loss = ', total_loss)
            print(f'Development Accuracy = {num_correct}/{len(golds)} = {dev_acc}')

        # best_weights stays None when num_epochs is 0; keep current weights then.
        if best_weights is not None:
            self.network.load_state_dict(best_weights)

    def predict(self, q1, q2, q3, q4) -> int:
        """
        Predicts the outlier position for one example.

        The network is switched to eval mode (dropout off) and gradients are
        disabled for the forward pass; the previous train/eval mode is restored
        afterwards.

        :param q1, q2, q3, q4: the four sentences, each a list of word strings
        :return: the predicted class as a 0-dim integer tensor (convert with int(...))
        """
        inputs = [self.convert_to_idx_tensor([q]).to(self.device) for q in (q1, q2, q3, q4)]

        was_training = self.network.training
        self.network.eval()
        try:
            with torch.no_grad():
                output = self.network(*inputs)
        finally:
            self.network.train(was_training)

        result = output.argmax(dim=1)
        return result[0]

    def predict_all(self, s1, s2, s3, s4):
        """
        Predicts every example one at a time by calling predict().

        :param s1, s2, s3, s4: parallel lists; position k of each list holds the
            corresponding sentence of example k
        :return: list with one prediction per example, in input order
        """
        return [self.predict(q1, q2, q3, q4) for q1, q2, q3, q4 in zip(s1, s2, s3, s4)]

    def print_network(self):
        """Prints the architecture of the wrapped network."""
        print(self.network)
In [None]:
def train_deep_averaging_network(train_exs,
                                 dev_exs,
                                 word_embeddings: WordEmbeddings,
                                 batch_size=128,
                                 hidden_size=256,
                                 lr=0.001,
                                 num_epochs=5,
                                 output_size=4) -> RNNClassifier:
    """
    Builds an RNNClassifier, trains it, prints the wall-clock training time and returns it.
    (Despite the historical name, the model trained here is the GRU-based RNNClassifier.)

    :param train_exs: training examples
    :param dev_exs: development set, evaluated after every epoch to select the best weights
    :param word_embeddings: set of loaded word embeddings
    :param batch_size: number of examples per minibatch
    :param hidden_size: hidden size of the network
    :param lr: learning rate
    :param num_epochs: number of training epochs
    :param output_size: number of output classes (4: one per sentence position)
    :return: A trained RNNClassifier model
    """

    classifier = RNNClassifier(word_embeddings=word_embeddings,
                               batch_size=batch_size,
                               hidden_size=hidden_size,
                               output_size=output_size,
                               lr=lr,
                               num_epochs=num_epochs)

    # Time only the training loop, not the model construction.
    start_time = time.time()
    classifier.train(train_exs=train_exs, dev_exs=dev_exs)
    total_time = int(time.time() - start_time)

    print(f'Total time taken => {total_time}s')

    return classifier

In [None]:
# Train on the training examples, using the validation examples for per-epoch
# model selection; the returned classifier holds the best-scoring weights.
model = train_deep_averaging_network(train_exs, validation_exs, embeddings)

In [None]:
# Persist the trained network to Drive. This saves the whole nn.Module object
# (not just its state_dict).
# NOTE(review): the filename is tagged 'word2vec', but the active embeddings
# cell loads the GloVe file -- confirm the tag matches the embeddings used.
torch.save(model.network, f'/content/gdrive/MyDrive/models/outlier_detection/outlier_detection.word2vec.{TOPIC}.{DIMENSIONS}d.pt')