In [1]:
# Mount Google Drive at /content/gdrive so files stored there can be read by path.
# Requires the Google Colab runtime (google.colab is only importable there).
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch import optim
import random
from copy import deepcopy
import time
import pprint

In [3]:
# Report whether PyTorch can see a CUDA device.
print(torch.cuda.is_available())

True


In [4]:
# Read the chemistry masked-token dataset into a DataFrame, using the first column as the index.
# NOTE(review): the file has a .tsv extension but is parsed with a single-space separator -- confirm the delimiter.
df = pd.read_csv('/content/gdrive/MyDrive/data/eval_data/masked_tokens/chemistry/chemistry.tsv', sep=' ', index_col=0)

In [5]:
# Preview the first 10 rows of the dataset.
df.head(n=10)

Unnamed: 0,masked_token,masked_line,masked_idx
0,different,studies to evaluate donor substrates that offe...,7
1,dissolution,we have found that the presence of napl and th...,26
2,napl,the composition of pce [MSK] might be signific...,4
3,microbial,it is important to gain a general understandin...,14
4,actual,with a good numerical model the responses of s...,34
5,reactions,one difficulty in modeling the biological [MSK...,6
6,dnapls,because our goal is to stimulate a high pce tr...,13
7,limiting,one critical [MSK] factor for enhanced dnapl d...,2
8,dehalogenation,experimental studies to determine the nature a...,13
9,evolution,directed [MSK] experiments with dhla are curre...,1


In [6]:
# Fraction of the full dataset assigned to each split. The fractions sum to
# less than 1, so the rows after index `c` are not used by any split.
train_percent = 0.2
validation_percent = 0.05
test_percent = 0.05

# Shuffle with a fixed seed so that re-running the notebook from a fresh kernel
# reproduces exactly the same train/validation/test rows.
split_seed = 0
df = df.sample(frac=1, random_state=split_seed)

# Positional boundaries of the three consecutive slices of the shuffled frame.
a = int(len(df)*train_percent)
b = int(a + len(df)*validation_percent)
c = int(b + len(df)*test_percent)

# .iloc makes it explicit that the slices are positional, not label-based.
train_data = df.iloc[0:a]
validation_data = df.iloc[a:b]
test_data = df.iloc[b:c]

print(train_data.head())
print(validation_data.head())
print(test_data.head())

       masked_token                                        masked_line  \
224933      protein  the atpase activity of reca p67w [MSK] was mea...   
209649    molecular  all atom [MSK] dynamics md simulations are cap...   
155749     anatomic  we evaluated the biodistribution of 4 in vivo ...   
181333          tbs  on each day of the culture a portion of the ce...   
221798   nitrifying  we recognize that using immunofluorescence to ...   

        masked_idx  
224933           6  
209649           2  
155749          14  
181333          23  
221798           7  
       masked_token                                        masked_line  \
8173         bonded  the components of the cocrystals form 1d hydro...   
148581   industries  mild steel is widely used as the constructiona...   
50048         cells  ganglion [MSK] schwann cells and clusters of a...   
87440      specific  protein samples were analysed by western blot ...   
175835         work  the differences de el de th dpv [MSK] 

In [7]:
class SentenceExample:
    """
    Container for one masked-token example.

    Attributes:
        words: the tokens of the sentence, as passed in by the caller
        masked_token: the token that was masked out of the sentence
        masked_idx: the position of the masked token, as passed in by the caller
    """

    def __init__(self, words, masked_token, masked_idx):
        self.words, self.masked_token, self.masked_idx = words, masked_token, masked_idx

    def __repr__(self):
        # Shows the word list followed by the masked token; masked_idx is not included.
        return f"{self.words!r}; masked token ={self.masked_token!r}"

    def __str__(self):
        # The printable form is the same as the repr.
        return self.__repr__()

In [8]:
class Indexer(object):
    """
    Two-way mapping between objects and consecutive integer ids starting at 0.
    Ids are handed out in insertion order, which makes the indexer convenient for
    turning labels, words, features, etc. into positions of a vector.

    Attributes:
        objs_to_ints: dict from object to its integer id
        ints_to_objs: dict from integer id back to the object
    """
    def __init__(self):
        self.objs_to_ints = {}
        self.ints_to_objs = {}

    def __repr__(self):
        # String form of the list of stored objects, ordered by id.
        return str([str(self.get_object(position)) for position in range(len(self))])

    def __str__(self):
        return self.__repr__()

    def __len__(self):
        # Number of distinct objects stored.
        return len(self.objs_to_ints)

    def get_object(self, index):
        """
        :param index: integer id to look up
        :return: the object stored under that id, or None if the id is unknown
        """
        return self.ints_to_objs.get(index)

    def contains(self, object):
        """
        :param object: object to look up
        :return: True if the object has an id in this Indexer, False otherwise
        """
        return self.index_of(object) != -1

    def index_of(self, object):
        """
        :param object: object to look up
        :return: the object's id, or -1 if the object is not present
        """
        return self.objs_to_ints.get(object, -1)

    def add_and_get_index(self, object, add=True):
        """
        Looks up the id of an object, registering the object first if it is new.
        :param object: object to look up or add
        :param add: True by default; when False nothing is added and the call behaves exactly like index_of
        :return: the id of the object (-1 only when add is False and the object is absent)
        """
        if not add:
            return self.index_of(object)
        try:
            return self.objs_to_ints[object]
        except KeyError:
            # First time this object is seen: give it the next free id.
            fresh_idx = len(self.objs_to_ints)
            self.objs_to_ints[object] = fresh_idx
            self.ints_to_objs[fresh_idx] = object
            return fresh_idx

In [9]:
class WordEmbeddings:
    """
    Pairs a word indexer with a sequence of embedding vectors, where the vector at position i belongs to the
    word whose index is i. Words missing from the indexer are served the vector stored for "UNK".
    """
    def __init__(self, word_indexer, vectors):
        self.word_indexer = word_indexer
        self.vectors = vectors

    def get_embedding_length(self):
        # The dimensionality is read off the first stored vector.
        return len(self.vectors[0])

    def get_embedding(self, word):
        """
        Looks up the vector of a word.
        :param word: The word to look up
        :return: The word's vector, or the vector stored for "UNK" when the word is not in the indexer
        """
        position = self.word_indexer.index_of(word)
        if position == -1:
            # Unknown word: fall back to the UNK entry.
            position = self.word_indexer.index_of("UNK")
        return self.vectors[position]

In [10]:
def read_word_embeddings(embeddings_file: str) -> WordEmbeddings:
    """
    Loads the given embeddings (one "word v1 v2 ... vn" entry per line) into a WordEmbeddings object. Index 0 is
    reserved for a PAD token and index 1 for an UNK token; both get the 0 vector. Reads in all embeddings with no
    filtering, so the whole file is held in memory.

    If a word occurs more than once in the file (or is literally "PAD" or "UNK"), only its first index is kept and
    the later vector is skipped, so that row i of the returned matrix always belongs to the word with index i.

    :param embeddings_file: path to the file containing embeddings
    :return: WordEmbeddings object reflecting the words and their embeddings
    :raises ValueError: if the file contains no embedding lines
    """
    word_indexer = Indexer()
    vectors = []
    # Make position 0 a PAD token, so that index 0 can be used to pad batches
    word_indexer.add_and_get_index("PAD")
    # Make position 1 the UNK token
    word_indexer.add_and_get_index("UNK")
    # The context manager closes the file even if a line fails to parse; the encoding is pinned so the result
    # does not depend on the platform default.
    with open(embeddings_file, encoding="utf-8") as f:
        for i, line in enumerate(f):
            if line.strip() != "":
                space_idx = line.find(' ')
                word = line[:space_idx]
                numbers = line[space_idx+1:]
                vector = np.array([float(number_str) for number_str in numbers.split()])
                # Append the PAD and UNK vectors to start. This has to wait until the first line of the file
                # has been read, because that is what tells us the embedding dim
                if len(vectors) == 0:
                    vectors.append(np.zeros(vector.shape[0]))
                    vectors.append(np.zeros(vector.shape[0]))
                # Only append a vector when the word receives a new index; otherwise every later word's
                # vector would be shifted by one row relative to its index.
                if not word_indexer.contains(word):
                    word_indexer.add_and_get_index(word)
                    vectors.append(vector)
            if i % 10000 == 0:
                print(f'done reading {i} embeddings')
    if len(vectors) == 0:
        raise ValueError("No embeddings found in " + repr(embeddings_file))
    print("Read in " + repr(len(word_indexer)) + " vectors of size " + repr(vectors[0].shape[0]))
    # Turn vectors into a 2-D numpy array
    return WordEmbeddings(word_indexer, np.array(vectors))

In [11]:
# Load the 300-dimensional GloVe 6B vectors from Drive, then print the size of the resulting word indexer.
embeddings = read_word_embeddings("/content/gdrive/MyDrive/embeddings/glove/baseline/glove.6B.300d.txt")
print(len(embeddings.word_indexer))

done reading 0 embeddings
done reading 10000 embeddings
done reading 20000 embeddings
done reading 30000 embeddings
done reading 40000 embeddings
done reading 50000 embeddings
done reading 60000 embeddings
done reading 70000 embeddings
done reading 80000 embeddings
done reading 90000 embeddings
done reading 100000 embeddings
done reading 110000 embeddings
done reading 120000 embeddings
done reading 130000 embeddings
done reading 140000 embeddings
done reading 150000 embeddings
done reading 160000 embeddings
done reading 170000 embeddings
done reading 180000 embeddings
done reading 190000 embeddings
done reading 200000 embeddings
done reading 210000 embeddings
done reading 220000 embeddings
done reading 230000 embeddings
done reading 240000 embeddings
done reading 250000 embeddings
done reading 260000 embeddings
done reading 270000 embeddings
done reading 280000 embeddings
done reading 290000 embeddings
done reading 300000 embeddings
done reading 310000 embeddings
done reading 320000 em

In [12]:
# Sanity check: print the index assigned to one word (index_of returns -1 for a word that is not present).
print(embeddings.word_indexer.index_of('particles'))

9100


In [13]:
# Turn every training row into a SentenceExample. The masked line is split on single
# spaces and only the final token has surrounding whitespace stripped.
train_exs = []
for _, row in train_data.iterrows():
    *leading_tokens, final_token = row['masked_line'].split(' ')
    tokens = leading_tokens + [final_token.strip()]
    train_exs.append(SentenceExample(tokens, row['masked_token'], row['masked_idx']))

In [14]:
# Turn every validation row into a SentenceExample. The masked line is split on single
# spaces and only the final token has surrounding whitespace stripped.
validation_exs = []
for _, row in validation_data.iterrows():
    *leading_tokens, final_token = row['masked_line'].split(' ')
    tokens = leading_tokens + [final_token.strip()]
    validation_exs.append(SentenceExample(tokens, row['masked_token'], row['masked_idx']))

In [15]:
# Turn every test row into a SentenceExample. The masked line is split on single
# spaces and only the final token has surrounding whitespace stripped.
test_exs = []
for _, row in test_data.iterrows():
    *leading_tokens, final_token = row['masked_line'].split(' ')
    tokens = leading_tokens + [final_token.strip()]
    test_exs.append(SentenceExample(tokens, row['masked_token'], row['masked_idx']))

In [16]:
class MaskedWordPredictor(nn.Module):
    """
    Bidirectional LSTM that scores candidate tokens for one masked position per sentence.

    Word indices are embedded with frozen pretrained vectors, passed through a 2-layer bidirectional LSTM, and the
    LSTM output at each sentence's masked position is projected to `output_dim` logits.
    """

    def __init__(self,
                 embedding_vectors,
                 hidden_dim,
                 output_dim):
        """
        :param embedding_vectors: 2-D array-like of pretrained embeddings, one row per word index; row 0 is used
            as the padding row
        :param hidden_dim: hidden size of each LSTM direction
        :param output_dim: number of logits produced per example
        """
        super().__init__()

        self.embedding_vectors = embedding_vectors
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim

        # Explicit dtype conversion instead of the legacy torch.FloatTensor constructor.
        weights = torch.as_tensor(self.embedding_vectors, dtype=torch.float32)
        self.e1 = nn.Embedding.from_pretrained(weights, padding_idx=0)
        self.e1.requires_grad_(False)

        self.lstm = nn.LSTM(input_size=weights.shape[1],
                            hidden_size=self.hidden_dim,
                            num_layers=2,
                            batch_first=True,
                            bidirectional=True)

        # NOTE: this dropout layer is constructed but never applied in forward(); it is kept so the module's
        # attributes stay the same.
        self.drop = nn.Dropout(p=0.3)
        # The LSTM is bidirectional, so each position carries 2 * hidden_dim features.
        self.fc1 = nn.Linear(2*self.hidden_dim, output_dim)

    def forward(self, x, masked_idxs):
        """
        :param x: 2-D integer tensor of word indices (one row per sentence), padded with index 0
        :param masked_idxs: iterable yielding, for each row of x, an index tensor with the masked position
        :return: tensor with one row of `output_dim` logits per sentence
        """
        # Index 0 is padding, so the number of non-zero entries in a row is that sentence's true length.
        # pack_padded_sequence needs the lengths on the CPU. Summing the boolean mask already yields int64, so
        # no device-specific tensor-type cast is required.
        sentence_lengths = (x != 0).sum(dim=1).cpu()

        x = self.e1(x)

        packed_input = pack_padded_sequence(x, sentence_lengths, batch_first=True, enforce_sorted=False)
        packed_output, _ = self.lstm(packed_input)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)

        # Pick the LSTM output at the masked position of every sentence and stack the picks into one batch.
        masked_embeddings = torch.cat([torch.index_select(a, 0, i) for a, i in zip(output, masked_idxs)])

        x = self.fc1(masked_embeddings)

        return x

In [17]:
class RNNClassifier():
    """
    Wraps a MaskedWordPredictor network together with everything needed to train it and to run it on new
    sentences (word embeddings / word indexer, device, hyperparameters).
    """

    # Word indices reserved for padding and for unknown words.
    PAD_IDX = 0
    UNK_IDX = 1

    def __init__(self,
                 word_embeddings,
                 output_size,
                 batch_size=128,
                 hidden_size=128,
                 lr=0.001,
                 num_epochs=10,
                 seed=3):
        """
        :param word_embeddings: object exposing `word_indexer` (word -> index) and `vectors` (2-D weights)
        :param output_size: number of output classes of the network
        :param batch_size: number of examples per training minibatch
        :param hidden_size: hidden size of each LSTM direction
        :param lr: learning rate for Adam
        :param num_epochs: number of passes over the training data
        :param seed: accepted for interface compatibility; NOTE it is currently not applied to any random
            number generator, so runs are not seeded by this class
        """
        self.word_embeddings = word_embeddings
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.lr = lr
        self.output_size = output_size

        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

        self.network = MaskedWordPredictor(embedding_vectors=self.word_embeddings.vectors,
                                           hidden_dim=hidden_size,
                                           output_dim=output_size).to(self.device)

        self.print_network()

    def convert_to_idx_tensor(self, sentences):
        """
        Converts tokenised sentences to a 2-D tensor of word indices.

        Words missing from the indexer are mapped to the UNK index. Sentences shorter than the longest one are
        right-padded with the PAD index, so the input sentences do not need to be padded (or modified) first.

        :param sentences: list of sentences, each a list of word strings
        :return: integer tensor with one row per sentence
        """
        indexer = self.word_embeddings.word_indexer
        max_len = max((len(sentence) for sentence in sentences), default=0)
        idx_encodings = []
        for sentence in sentences:
            idx_encoding = []
            for word in sentence:
                word_idx = indexer.index_of(word)
                idx_encoding.append(word_idx if word_idx != -1 else self.UNK_IDX)
            idx_encoding.extend([self.PAD_IDX] * (max_len - len(idx_encoding)))
            idx_encodings.append(idx_encoding)
        return torch.tensor(idx_encodings, dtype=torch.long)

    def pad_sentences(self, batch):
        """
        Pads the `words` list of every example in the batch, IN PLACE, with 'PAD' tokens up to the length of the
        longest sentence in the batch.

        Kept for backward compatibility; train() does not call it because convert_to_idx_tensor pads without
        modifying the examples.

        :param batch: list of examples exposing a mutable `words` list
        :return: the same batch object
        """
        max_len = 0
        for sentiment_ex in batch:
            max_len = max(max_len, len(sentiment_ex.words))

        for sentiment_ex in batch:
            sentence = sentiment_ex.words
            sentence += ['PAD'] * (max_len - len(sentence))

        return batch

    def _target_index(self, token):
        # Class index of a gold token; unknown tokens (-1) and PAD (0) are both mapped to the UNK class.
        return max(self.word_embeddings.word_indexer.index_of(token), self.UNK_IDX)

    def train(self, train_exs, dev_exs):
        """
        Trains the network with Adam and cross-entropy loss, evaluates on the dev set after every epoch, and
        finally restores the weights of the epoch with the best dev accuracy.

        Neither `train_exs` nor the examples inside it are modified.

        :param train_exs: list of training examples (with `words`, `masked_token`, `masked_idx`)
        :param dev_exs: list of development examples used for model selection (may be empty)
        """
        loss_function = nn.CrossEntropyLoss()
        optimizer = optim.Adam(self.network.parameters(), lr=self.lr, weight_decay=0.0001)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.95)

        max_dev_acc = 0
        best_weights = None

        # Shuffle a private copy so that the caller's list keeps its order.
        epoch_order = list(train_exs)

        for epoch in range(self.num_epochs):

            self.network.train()
            random.shuffle(epoch_order)

            total_loss = 0.0
            for i, start in enumerate(range(0, len(epoch_order), self.batch_size)):
                exs = epoch_order[start:start + self.batch_size]

                # Padding to the longest sentence of the batch happens inside convert_to_idx_tensor.
                x = self.convert_to_idx_tensor([ex.words for ex in exs]).to(self.device)
                y = torch.tensor([self._target_index(ex.masked_token) for ex in exs]).to(self.device)
                train_masked_idxs = torch.tensor([ex.masked_idx for ex in exs]).to(self.device)

                output = self.network(x, train_masked_idxs)
                loss = loss_function(output, y)

                # update model weights
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # .item() detaches the value; summing the loss tensors themselves would keep every batch's
                # autograd graph alive until the end of the epoch.
                total_loss += loss.item()

                print(f'batch {i} complete')

            # calculate validation set accuracy
            golds = [self._target_index(dev_ex.masked_token) for dev_ex in dev_exs]
            predictions = self.predict_all([dev_ex.words for dev_ex in dev_exs],
                                            torch.tensor([dev_ex.masked_idx for dev_ex in dev_exs]).to(self.device))
            num_correct = sum(1 for gold, prediction in zip(golds, predictions) if gold == int(prediction))

            # An empty dev set counts as accuracy 0 instead of dividing by zero.
            dev_acc = num_correct/len(golds) if golds else 0.0

            if dev_acc > max_dev_acc or epoch == 0:
                max_dev_acc = dev_acc
                best_weights = deepcopy(self.network.state_dict())
                print('saving model weights...')

            scheduler.step()

            print(f'Epoch Number = {epoch}, total loss = ', total_loss)
            print(f'Development Accuracy = {num_correct}/{len(golds)} = {dev_acc}')

        # best_weights stays None when num_epochs is 0; there is nothing to restore in that case.
        if best_weights is not None:
            self.network.load_state_dict(best_weights)

    def predict(self, ex_words, masked_idx) -> torch.Tensor:
        """
        Predicts the class index of the masked token of a single sentence.

        The forward pass runs in eval mode and without gradient tracking; the network's previous
        training/eval mode is restored afterwards.

        :param ex_words: list of word strings of the sentence
        :param masked_idx: masked position(s) in the form the network expects for a batch of one sentence
            (predict_all passes a one-element list holding an index tensor)
        :return: 0-dim integer tensor holding the predicted class index
        """
        idx_tensor = self.convert_to_idx_tensor([ex_words])
        idx_tensor = idx_tensor.to(self.device)
        was_training = self.network.training
        self.network.eval()
        try:
            with torch.no_grad():
                output = self.network(idx_tensor, masked_idx)
        finally:
            self.network.train(was_training)
        result = output.argmax(dim=1)
        return result[0]

    def predict_all(self, all_ex_words, dev_masked_idxs):
        """
        Runs predict() on every sentence, one at a time.

        :param all_ex_words: list of sentences, each a list of word strings
        :param dev_masked_idxs: iterable with the masked position of each sentence (e.g. a 1-D index tensor)
        :return: list with one prediction per sentence
        """
        return [self.predict(ex_words, [masked_idx]) for ex_words, masked_idx in zip(all_ex_words, dev_masked_idxs)]

    def print_network(self):
        # Prints the module structure of the wrapped network.
        print(self.network)

In [18]:
def train_deep_averaging_network(train_exs,
                                 dev_exs,
                                 word_embeddings: WordEmbeddings,
                                 batch_size=128,
                                 hidden_size=128,
                                 lr=0.001,
                                 num_epochs=10) -> RNNClassifier:
    """
    Builds an RNNClassifier over the given embeddings, trains it, and reports the training time.

    The number of output classes equals the number of embedding vectors, i.e. the model predicts an index into
    the embedding vocabulary.

    :param train_exs: training examples
    :param dev_exs: development set, used to evaluate the model after every epoch
    :param word_embeddings: set of loaded word embeddings
    :param batch_size: number of examples per training minibatch
    :param hidden_size: hidden size passed to the classifier
    :param lr: learning rate passed to the classifier
    :param num_epochs: number of training epochs
    :return: the trained RNNClassifier
    """
    # One output class per row of the embedding matrix.
    output_size = len(word_embeddings.vectors)

    classifier = RNNClassifier(word_embeddings=word_embeddings,
                               output_size=output_size,
                               batch_size=batch_size,
                               hidden_size=hidden_size,
                               lr=lr,
                               num_epochs=num_epochs)

    start_time = time.time()
    classifier.train(train_exs=train_exs, dev_exs=dev_exs)
    total_time = int(time.time() - start_time)

    print(f'Total time taken => {total_time}s')

    return classifier

In [None]:
# Train on the training examples; the validation examples are passed as the development set.
model = train_deep_averaging_network(train_exs, validation_exs, embeddings)