In [40]:
# Mount Google Drive inside the Colab VM so the project files are reachable.
from google.colab import drive
drive.mount('/gdrive')
# Make the project folder the working directory; later cells open files via relative paths.
%cd /gdrive/My Drive/NLP_P4

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive/My Drive/NLP_P4


In [41]:
# List the files in the current working directory (shell escape).
!ls

dev.csv				      spacy_word2vec32.model
gpu_dev.csv			      test.csv
gpu_test.csv			      train.csv
gpu_train.csv			      train.ipynb
nltk_word2vec32.model		      train.py
P4.pdf				      Untitled.ipynb
Part-A_rnnlm-baseline-e10-a67.csv     word2vec128.model
Pipfile				      word2vec16.model
Pipfile.lock			      word2vec24.model
README.md			      word2vec32.model
spacytokenize_PartAWorking_gpu.ipynb  word2vec64.model


In [42]:
import os
import math
import time
import nltk
import random
import spacy
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import init
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from collections import Counter
from tqdm import tqdm
# from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# --- Reproducibility -------------------------------------------------------
# Seed every RNG this notebook draws from (random.shuffle in the training
# loop, PyTorch weight initialisation) so that a fresh "Restart & Run All"
# can reproduce the reported accuracies.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# --- External resources ----------------------------------------------------
nltk.download('punkt')
# spaCy pipeline; used below both for tokenisation and for POS tagging.
get_pos = spacy.load("en_core_web_sm")
# analyser = SentimentIntensityAnalyzer()

# --- Data ------------------------------------------------------------------
train_data = pd.read_csv('./gpu_train.csv', encoding='latin-1')
dev_data = pd.read_csv('./gpu_dev.csv', encoding='latin-1')
test_data = pd.read_csv('./gpu_test.csv', encoding='latin-1')
# Order matters: downstream code addresses the splits as 0=train, 1=dev, 2=test.
data = [train_data, dev_data, test_data]

# Prefix used when building checkpoint file names.
PATH = './'

# --- Hyper-parameters ------------------------------------------------------
EMBED_DIM = 32    # word2vec vector size
HIDDEN_DIM = 16   # GRU hidden state size
LAYERS = 1        # number of stacked GRU layers
EPOCHS = 1000     # upper bound on training epochs
LR = 3e-4         # Adam learning rate
# Use the first GPU when one is present, otherwise fall back to the CPU
# instead of failing at model construction time.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
# def sentiment_analyzer_scores(sentence):
#     return analyser.polarity_scores(sentence)

In [0]:
# Token texts of every sentence seen by tag_pos; filled as a side effect and
# later used as the word2vec training corpus.
list_of_sentences = []
def tag_pos(data):
    """Run the spaCy pipeline over columns 1..6 of every row of every frame.

    Args:
        data: iterable of DataFrames whose columns 1 through 6 hold text.

    Returns:
        (all_pos, pos_counts) where ``all_pos[frame][row][sentence]`` is the
        list of coarse POS tags for that sentence and ``pos_counts`` is a
        Counter of how often each tag occurred overall.

    Side effects:
        Appends the token texts of each processed sentence to the
        module-level ``list_of_sentences``.
    """
    pos_counts = Counter()
    all_pos = []

    for frame in data:
        frame_tags = []
        for _, record in frame.iterrows():
            record_tags = []
            for column in range(1, 7):
                doc = get_pos(record[column])
                list_of_sentences.append([token.text for token in doc])
                tags = [token.pos_ for token in doc]
                record_tags.append(tags)
                pos_counts.update(tags)
            frame_tags.append(record_tags)
        all_pos.append(frame_tags)

    return all_pos, pos_counts

In [0]:
# pos_data is the 2-tuple returned by tag_pos:
#   pos_data[0] -> nested POS tags indexed [split][row][sentence][token]
#   pos_data[1] -> Counter of tag frequencies (its keys define the one-hot layout)
pos_data = tag_pos(data)

In [46]:
# Sanity check: tokens of the first processed sentence, then the POS tags
# of all six sentences of the first row of the first split.
print(list_of_sentences[0])
print(pos_data[0][0][0])

['Rick', 'grew', 'up', 'in', 'a', 'troubled', 'household', '.']
[['PROPN', 'VERB', 'PART', 'ADP', 'DET', 'ADJ', 'NOUN', 'PUNCT'], ['PRON', 'ADV', 'VERB', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'PUNCT', 'CCONJ', 'VERB', 'ADP', 'NOUN', 'PUNCT'], ['PRON', 'VERB', 'ADV', 'ADJ', 'ADP', 'PROPN', 'VERB', 'VERB', 'ADP', 'DET', 'NOUN', 'PUNCT'], ['DET', 'NOUN', 'VERB', 'PRON', 'PART', 'VERB', 'DET', 'ADJ', 'NOUN', 'PUNCT'], ['PRON', 'VERB', 'ADJ', 'ADV', 'PUNCT'], ['PRON', 'VERB', 'DET', 'NOUN', 'PUNCT']]


In [47]:
# Fit word2vec on the spaCy-tokenised sentences and persist it to disk.
# min_count=1 keeps every token in the vocabulary.
# NOTE(review): `size=` is the gensim 3.x keyword; newer gensim releases
# call it `vector_size` -- confirm the installed version before upgrading.
name = ''.join(['spacy_word2vec', str(EMBED_DIM), '.model'])
model = Word2Vec(list_of_sentences, size=EMBED_DIM, min_count=1)
model.save(name)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [48]:
# Load the word2vec model whose file name is derived from EMBED_DIM.
w2v = ''.join(['spacy_word2vec', str(EMBED_DIM), '.model'])
WORD2VEC = Word2Vec.load(w2v)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
def new_word_tokenize(sentence):
    """Tokenise ``sentence`` with the spaCy pipeline and return the token texts."""
    tokens = []
    for token in get_pos(sentence):
        tokens.append(token.text)
    return tokens

In [0]:
def get_one_hot(group, row, sentence, word):
    """One-hot encode the POS tag of a single token.

    Args:
        group: index of the data split inside ``pos_data[0]``.
        row: index of the row within that split.
        sentence: index of the sentence within the row.
        word: index of the token within the sentence.

    Returns:
        A list of 0/1 ints with one slot per key of ``pos_data[1]`` (in the
        Counter's iteration order); the slot matching the token's tag is 1.
    """
    pos = pos_data[0][group][row][sentence][word]
    # Iterate the Counter's keys directly instead of rebuilding
    # list(pos_data[1].keys()) for every slot, which made each call quadratic
    # in the number of tags. Order and result are unchanged.
    return [1 if pos == tag else 0 for tag in pos_data[1]]

In [0]:
def create_vector(i, j, row, sentence, word, group):
    """Build the feature vector of one token: word2vec embedding + POS one-hot.

    Args:
        i: 1-based column number of the sentence within the row.
        j: index of the token within the sentence.
        row: ``(index, Series)`` tuple as produced by ``DataFrame.iterrows``;
            only ``row[0]`` is read here.
        sentence: text of the sentence (not used by this function).
        word: token text, looked up in ``WORD2VEC.wv``.
        group: split index forwarded to ``get_one_hot``.

    Returns:
        1-D numpy array: the embedding values followed by the one-hot slots.
    """
    embedding = list(WORD2VEC.wv[word])
    pos_one_hot = get_one_hot(group, row[0], i-1, j)
    return np.array(embedding + pos_one_hot)

In [0]:
def _embed_split(df, group, with_label):
    """Embed every row of one split; append column 7 when ``with_label`` is true."""
    embedded = []
    for row in df.iterrows():
        fields = row[1]
        # Column 0 is carried through unchanged as the first element.
        sentences = [fields[0]]
        for i in range(1, 7):
            sentences.append([create_vector(i, j, row, fields[i], word, group)
                              for j, word in enumerate(new_word_tokenize(fields[i]))])
        if with_label:
            sentences.append(fields[7])
        embedded.append(sentences)
    return embedded


def embed(train_data, dev_data, test_data):
    """Turn the three DataFrames into per-row lists of token feature vectors.

    Each output row is ``[col0, s1, ..., s6]`` where ``sK`` is the list of
    ``create_vector`` results for the tokens of column K. Train and dev rows
    additionally end with the raw value of column 7; test rows do not.

    Args:
        train_data, dev_data, test_data: DataFrames for split 0, 1 and 2.

    Returns:
        (training_data, development_data, testing_data) as lists of rows.
    """
    # The three splits were previously handled by three copy-pasted loops that
    # differed only in the split index and in whether column 7 was appended.
    training_data = _embed_split(train_data, 0, with_label=True)
    development_data = _embed_split(dev_data, 1, with_label=True)
    testing_data = _embed_split(test_data, 2, with_label=False)
    return training_data, development_data, testing_data

In [0]:
def _embed_sentence(row, column, group):
    """Return the create_vector features for every token of ``row[1][column]``."""
    text = row[1][column]
    return [create_vector(column, j, row, text, word, group)
            for j, word in enumerate(new_word_tokenize(text))]


def _embed_eval_split(df, group, with_label):
    """Embed columns 1..6 of each row; append ``column 7 - 1`` when labelled."""
    samples = []
    for row in df.iterrows():
        sample = [row[1][0]]
        sample.extend(_embed_sentence(row, i, group) for i in range(1, 7))
        if with_label:
            # Shift the 1/2 label to 0/1.
            sample.append(row[1][7] - 1)
        samples.append(sample)
    return samples


def new_embed(train_data, dev_data, test_data):
    """Build model inputs for training (pairwise) and for dev/test (both endings).

    Training: every row yields two samples that share the embeddings of
    columns 1..4. One sample ends with the ending selected by column 7
    (labelled 1), the other with the remaining ending (labelled 0). Layout:
    ``[col0, s1, s2, s3, s4, ending, label]``.

    Dev: ``[col0, s1, ..., s6, label - 1]``.
    Test: ``[col0, s1, ..., s6]``.

    Args:
        train_data, dev_data, test_data: DataFrames for split 0, 1 and 2.

    Returns:
        (training_data, development_data, testing_data) as lists of samples.

    Raises:
        ValueError: if a training row's column 7 is neither 1 nor 2. The old
            code silently produced a sample with no ending in that case, which
            only failed later inside the model with an unrelated error.
    """
    training_data = []
    for row in train_data.iterrows():
        label = row[1][7]
        if label not in (1, 2):
            raise ValueError(
                'Unexpected label {!r} in training row {!r}; expected 1 or 2'.format(label, row[0]))

        story_id = row[1][0]
        context = [_embed_sentence(row, i, 0) for i in range(1, 5)]
        # Columns 5 and 6 hold the two candidate endings; go through
        # create_vector (via _embed_sentence) rather than re-implementing it.
        ending_1 = _embed_sentence(row, 5, 0)
        ending_2 = _embed_sentence(row, 6, 0)
        if label == 1:
            right, wrong = ending_1, ending_2
        else:
            right, wrong = ending_2, ending_1

        training_data.append([story_id] + context + [right, 1])
        training_data.append([story_id] + context + [wrong, 0])

    development_data = _embed_eval_split(dev_data, 1, with_label=True)
    testing_data = _embed_eval_split(test_data, 2, with_label=False)

    return training_data, development_data, testing_data

In [0]:
# Embed all three splits. Training rows are expanded into (story, ending)
# pairs labelled 1/0; dev and test rows keep both candidate endings.
training_data, development_data, testing_data = new_embed(data[0], data[1], data[2])

In [0]:
class NSP(nn.Module):
    """Two-stage GRU scorer for a (4-sentence context, candidate ending) pair.

    The first GRU reads the concatenated tokens of the four context sentences;
    its final hidden state initialises a second GRU that reads the ending.
    The last output of the second GRU is projected to two log-probabilities
    (index 0 / index 1, matching the 0/1 labels used during training).

    The module also owns its loss (NLLLoss) and optimizer (Adam).

    Args:
        pos_dim: number of one-hot POS slots appended to each word embedding.
            Defaults to 17, the value that was previously hard-coded.
            NOTE(review): this should equal len(pos_data[1]) -- confirm.
    """

    def __init__(self, pos_dim=17):
        super(NSP, self).__init__()
        input_dim = EMBED_DIM + pos_dim
        self.beginning = nn.GRU(input_dim, HIDDEN_DIM, LAYERS, batch_first=True, bidirectional=False)
        self.ending = nn.GRU(input_dim, HIDDEN_DIM, LAYERS, batch_first=True, bidirectional=False)
        self.linear = nn.Linear(HIDDEN_DIM, 2)
        # forward() feeds a 1-D vector of size 2, hence dim=0.
        self.softmax = nn.LogSoftmax(dim=0)
        self.criterion = nn.NLLLoss()
        # Move parameters to the target device BEFORE building the optimizer,
        # as the PyTorch docs recommend. `.to` also accepts a CPU device,
        # unlike `.cuda(device=...)`.
        self.to(DEVICE)
        self.optimizer = optim.Adam(self.parameters(), lr=LR)

    def setup(self, data):
        """Convert one sample into the two GRU inputs.

        Args:
            data: sequence of five token-vector lists: four context sentences
                followed by one ending.

        Returns:
            (input_1, input_2): float tensors on DEVICE with a leading batch
            dimension of 1; input_1 is the concatenated context tokens,
            input_2 the ending tokens.
        """
        # `+` concatenates the per-sentence token lists into one long sequence.
        input_1 = torch.tensor(np.expand_dims(data[0] + data[1] + data[2] + data[3], axis=0), device=DEVICE, dtype=torch.float)
        input_2 = torch.tensor(np.expand_dims(data[4], axis=0), device=DEVICE, dtype=torch.float)
        return input_1, input_2

    def compute_Loss(self, predicted_vector, gold_label):
        """Return the NLL loss of ``predicted_vector`` against ``gold_label``."""
        return self.criterion(predicted_vector, gold_label)

    def forward(self, data):
        """Score one sample; returns a 1-D tensor of two log-probabilities."""
        input_1, input_2 = self.setup(data)
        h_0 = torch.zeros((LAYERS, 1, HIDDEN_DIM), device=DEVICE)
        # Only the final hidden state of the context encoder is kept ...
        __, h_n = self.beginning(input_1, h_0)
        # ... and used as the initial state of the ending encoder.
        output, __ = self.ending(input_2, h_n)
        # Output at the last time step of the single batch element.
        x = output[0][-1]
        x = self.linear(x)
        x = self.softmax(x)
        return x

In [69]:
print('Initializing Model')
model = NSP()
# Best development accuracy seen so far; a checkpoint is written only when it improves.
prev_dev_acc = 0.0
for epoch in range(EPOCHS):
    checkpoint = PATH + '-e' + str((epoch + 1))

    # ---- Training: one optimizer step per (story, ending) sample ----------
    model.train()
    correct = 0
    total = 0
    print('Training started for epoch {}'.format(epoch + 1))
    random.shuffle(training_data)
    for sample in tqdm(training_data):
        model.optimizer.zero_grad()
        # sample layout: [id, s1, s2, s3, s4, ending, label]
        input_vector = sample[1:6]
        gold_label = sample[6]
        predicted_vector = model(input_vector)
        predicted_label = torch.argmax(predicted_vector).item()
        correct += int(predicted_label == gold_label)
        total += 1
        loss = model.compute_Loss(predicted_vector.view(1, -1), torch.tensor([gold_label], device=DEVICE))
        loss.backward()
        model.optimizer.step()
    print('Training accuracy for epoch {}: {}'.format(epoch + 1, correct / total))

    # ---- Evaluation: score both endings and pick the more confident call --
    correct = 0
    total = 0
    model.eval()
    # No gradients are needed here; skipping autograd saves memory and time.
    # The dev set is no longer shuffled: accuracy does not depend on order.
    with torch.no_grad():
        for sample in tqdm(development_data):
            # sample layout: [id, s1, s2, s3, s4, ending1, ending2, label]
            input_1 = sample[1:6]
            input_2 = sample[1:5] + [sample[6]]
            gold_label = sample[7]
            prediction_1 = model(input_1)
            prediction_2 = model(input_2)
            # Plain floats, so max()/index() compare numbers rather than
            # CUDA tensors. Order: [true_1, false_1, true_2, false_2].
            scores = [prediction_1[1].item(), prediction_1[0].item(),
                      prediction_2[1].item(), prediction_2[0].item()]
            max_index = scores.index(max(scores))
            # "ending 1 is true" or "ending 2 is false" -> label 0; otherwise 1.
            predicted_label = 0 if max_index in (0, 3) else 1
            correct += int(predicted_label == gold_label)
            total += 1
    dev_acc = correct / total
    # 0.67 is the minimum accuracy worth checkpointing.
    if dev_acc > prev_dev_acc and dev_acc > 0.67:
        prev_dev_acc = dev_acc
        print('New Best Accuracy: {}'.format(dev_acc))
        acc = int(100 * dev_acc)
        torch.save(model.state_dict(), checkpoint + '-a' + str(acc) + '.pt')
    print('Development accuracy for epoch {}: {}'.format(epoch + 1, dev_acc))

torch.save(model.state_dict(), PATH + '-final.pt')

  0%|          | 3/2994 [00:00<01:41, 29.52it/s]

Initializing Model
Training started for epoch 1


100%|██████████| 2994/2994 [00:14<00:00, 210.18it/s]
  9%|▊         | 32/374 [00:00<00:01, 311.60it/s]

Training accuracy for epoch 1: 0.5156980627922512


100%|██████████| 374/374 [00:01<00:00, 334.40it/s]
  1%|          | 21/2994 [00:00<00:14, 205.23it/s]

Development accuracy for epoch 1: 0.6149732620320856
Training started for epoch 2


100%|██████████| 2994/2994 [00:14<00:00, 206.85it/s]
  9%|▉         | 34/374 [00:00<00:01, 331.32it/s]

Training accuracy for epoch 2: 0.5297261189044756


100%|██████████| 374/374 [00:01<00:00, 337.54it/s]
  1%|          | 22/2994 [00:00<00:13, 212.56it/s]

Development accuracy for epoch 2: 0.6310160427807486
Training started for epoch 3


100%|██████████| 2994/2994 [00:14<00:00, 211.64it/s]
  9%|▉         | 34/374 [00:00<00:01, 338.11it/s]

Training accuracy for epoch 3: 0.5534402137608551


100%|██████████| 374/374 [00:01<00:00, 340.98it/s]
  1%|          | 21/2994 [00:00<00:14, 209.17it/s]

Development accuracy for epoch 3: 0.6390374331550802
Training started for epoch 4


100%|██████████| 2994/2994 [00:14<00:00, 207.03it/s]
  9%|▉         | 35/374 [00:00<00:00, 346.96it/s]

Training accuracy for epoch 4: 0.5647962591850367


100%|██████████| 374/374 [00:01<00:00, 339.78it/s]
  1%|          | 22/2994 [00:00<00:14, 212.25it/s]

Development accuracy for epoch 4: 0.6256684491978609
Training started for epoch 5


100%|██████████| 2994/2994 [00:14<00:00, 211.59it/s]
  9%|▉         | 35/374 [00:00<00:00, 346.21it/s]

Training accuracy for epoch 5: 0.5801603206412825


100%|██████████| 374/374 [00:01<00:00, 341.66it/s]
  1%|          | 22/2994 [00:00<00:14, 211.72it/s]

Development accuracy for epoch 5: 0.6256684491978609
Training started for epoch 6


100%|██████████| 2994/2994 [00:14<00:00, 210.89it/s]
  9%|▉         | 35/374 [00:00<00:00, 341.97it/s]

Training accuracy for epoch 6: 0.5878423513694054


100%|██████████| 374/374 [00:01<00:00, 333.89it/s]
  1%|          | 22/2994 [00:00<00:13, 215.17it/s]

Development accuracy for epoch 6: 0.6203208556149733
Training started for epoch 7


100%|██████████| 2994/2994 [00:14<00:00, 213.39it/s]
  9%|▊         | 32/374 [00:00<00:01, 317.29it/s]

Training accuracy for epoch 7: 0.5881763527054108


100%|██████████| 374/374 [00:01<00:00, 339.97it/s]
  1%|          | 22/2994 [00:00<00:13, 216.87it/s]

Development accuracy for epoch 7: 0.6336898395721925
Training started for epoch 8


100%|██████████| 2994/2994 [00:14<00:00, 213.18it/s]
  9%|▉         | 34/374 [00:00<00:01, 331.95it/s]

Training accuracy for epoch 8: 0.5968603874415498


100%|██████████| 374/374 [00:01<00:00, 341.20it/s]
  1%|          | 22/2994 [00:00<00:13, 216.32it/s]

Development accuracy for epoch 8: 0.6363636363636364
Training started for epoch 9


100%|██████████| 2994/2994 [00:13<00:00, 214.19it/s]
  9%|▉         | 35/374 [00:00<00:00, 343.38it/s]

Training accuracy for epoch 9: 0.5988643954575819


100%|██████████| 374/374 [00:01<00:00, 338.37it/s]
  1%|          | 21/2994 [00:00<00:14, 205.75it/s]

Development accuracy for epoch 9: 0.6470588235294118
Training started for epoch 10


100%|██████████| 2994/2994 [00:13<00:00, 213.92it/s]
 10%|▉         | 36/374 [00:00<00:00, 353.70it/s]

Training accuracy for epoch 10: 0.6045424181696727


100%|██████████| 374/374 [00:01<00:00, 342.29it/s]
  1%|          | 22/2994 [00:00<00:13, 218.32it/s]

Development accuracy for epoch 10: 0.6577540106951871
Training started for epoch 11


100%|██████████| 2994/2994 [00:13<00:00, 216.77it/s]
  9%|▉         | 34/374 [00:00<00:01, 333.82it/s]

Training accuracy for epoch 11: 0.6052104208416834


100%|██████████| 374/374 [00:01<00:00, 345.50it/s]
  1%|          | 21/2994 [00:00<00:14, 207.48it/s]

Development accuracy for epoch 11: 0.660427807486631
Training started for epoch 12


100%|██████████| 2994/2994 [00:13<00:00, 215.93it/s]
 10%|▉         | 36/374 [00:00<00:00, 350.09it/s]

Training accuracy for epoch 12: 0.614562458249833


100%|██████████| 374/374 [00:01<00:00, 344.40it/s]
  1%|          | 22/2994 [00:00<00:13, 217.84it/s]

Development accuracy for epoch 12: 0.660427807486631
Training started for epoch 13


100%|██████████| 2994/2994 [00:13<00:00, 216.45it/s]
 10%|▉         | 36/374 [00:00<00:00, 359.75it/s]

Training accuracy for epoch 13: 0.6169004676018705


100%|██████████| 374/374 [00:01<00:00, 350.33it/s]
  1%|          | 21/2994 [00:00<00:14, 205.91it/s]

Development accuracy for epoch 13: 0.6684491978609626
Training started for epoch 14


100%|██████████| 2994/2994 [00:13<00:00, 215.81it/s]
 10%|▉         | 36/374 [00:00<00:00, 357.55it/s]

Training accuracy for epoch 14: 0.614562458249833


100%|██████████| 374/374 [00:01<00:00, 342.00it/s]
  1%|          | 20/2994 [00:00<00:15, 194.77it/s]

Development accuracy for epoch 14: 0.6657754010695187
Training started for epoch 15


100%|██████████| 2994/2994 [00:14<00:00, 211.62it/s]
  9%|▉         | 35/374 [00:00<00:00, 340.78it/s]

Training accuracy for epoch 15: 0.6212424849699398


100%|██████████| 374/374 [00:01<00:00, 339.17it/s]
  1%|          | 23/2994 [00:00<00:13, 226.05it/s]

Development accuracy for epoch 15: 0.6657754010695187
Training started for epoch 16


100%|██████████| 2994/2994 [00:14<00:00, 207.42it/s]
  9%|▉         | 33/374 [00:00<00:01, 329.32it/s]

Training accuracy for epoch 16: 0.6235804943219773


100%|██████████| 374/374 [00:01<00:00, 340.12it/s]
  1%|          | 22/2994 [00:00<00:13, 212.83it/s]

New Best Accuracy: 0.6818181818181818
Development accuracy for epoch 16: 0.6818181818181818
Training started for epoch 17


100%|██████████| 2994/2994 [00:13<00:00, 216.01it/s]
  9%|▉         | 35/374 [00:00<00:00, 348.87it/s]

Training accuracy for epoch 17: 0.6269205076820308


100%|██████████| 374/374 [00:01<00:00, 331.13it/s]
  1%|          | 21/2994 [00:00<00:14, 203.78it/s]

Development accuracy for epoch 17: 0.6764705882352942
Training started for epoch 18


100%|██████████| 2994/2994 [00:13<00:00, 217.78it/s]
 10%|▉         | 36/374 [00:00<00:00, 354.29it/s]

Training accuracy for epoch 18: 0.6255845023380093


100%|██████████| 374/374 [00:01<00:00, 349.94it/s]
  1%|          | 23/2994 [00:00<00:13, 222.67it/s]

Development accuracy for epoch 18: 0.6711229946524064
Training started for epoch 19


100%|██████████| 2994/2994 [00:13<00:00, 216.84it/s]
  9%|▉         | 35/374 [00:00<00:00, 342.79it/s]

Training accuracy for epoch 19: 0.6315965263861055


100%|██████████| 374/374 [00:01<00:00, 347.48it/s]
  1%|          | 23/2994 [00:00<00:13, 221.26it/s]

Development accuracy for epoch 19: 0.6684491978609626
Training started for epoch 20


100%|██████████| 2994/2994 [00:13<00:00, 216.45it/s]
 10%|▉         | 36/374 [00:00<00:00, 350.42it/s]

Training accuracy for epoch 20: 0.6429525718102872


100%|██████████| 374/374 [00:01<00:00, 345.70it/s]
  1%|          | 22/2994 [00:00<00:13, 214.05it/s]

Development accuracy for epoch 20: 0.6684491978609626
Training started for epoch 21


100%|██████████| 2994/2994 [00:13<00:00, 218.79it/s]
 10%|▉         | 37/374 [00:00<00:00, 369.67it/s]

Training accuracy for epoch 21: 0.6419505678022712


100%|██████████| 374/374 [00:01<00:00, 340.17it/s]
  1%|          | 21/2994 [00:00<00:14, 203.40it/s]

Development accuracy for epoch 21: 0.6657754010695187
Training started for epoch 22


100%|██████████| 2994/2994 [00:13<00:00, 217.31it/s]
 10%|▉         | 36/374 [00:00<00:00, 355.37it/s]

Training accuracy for epoch 22: 0.6409485637942551


100%|██████████| 374/374 [00:01<00:00, 351.23it/s]
  1%|          | 23/2994 [00:00<00:13, 221.72it/s]

Development accuracy for epoch 22: 0.6737967914438503
Training started for epoch 23


100%|██████████| 2994/2994 [00:13<00:00, 218.55it/s]
  9%|▉         | 35/374 [00:00<00:00, 347.71it/s]

Training accuracy for epoch 23: 0.6429525718102872


100%|██████████| 374/374 [00:01<00:00, 351.60it/s]
  1%|          | 23/2994 [00:00<00:13, 225.37it/s]

Development accuracy for epoch 23: 0.6737967914438503
Training started for epoch 24


100%|██████████| 2994/2994 [00:14<00:00, 212.55it/s]
  9%|▉         | 35/374 [00:00<00:00, 348.32it/s]

Training accuracy for epoch 24: 0.6499665998663995


100%|██████████| 374/374 [00:01<00:00, 350.47it/s]
  1%|          | 21/2994 [00:00<00:14, 206.82it/s]

Development accuracy for epoch 24: 0.6631016042780749
Training started for epoch 25


100%|██████████| 2994/2994 [00:13<00:00, 216.33it/s]
 10%|▉         | 37/374 [00:00<00:00, 362.00it/s]

Training accuracy for epoch 25: 0.654308617234469


100%|██████████| 374/374 [00:01<00:00, 345.74it/s]
  1%|          | 21/2994 [00:00<00:14, 205.27it/s]

Development accuracy for epoch 25: 0.6577540106951871
Training started for epoch 26


100%|██████████| 2994/2994 [00:13<00:00, 215.65it/s]
 10%|▉         | 37/374 [00:00<00:00, 363.69it/s]

Training accuracy for epoch 26: 0.6492985971943888


100%|██████████| 374/374 [00:01<00:00, 351.64it/s]
  1%|          | 23/2994 [00:00<00:13, 226.86it/s]

Development accuracy for epoch 26: 0.6550802139037433
Training started for epoch 27


 70%|██████▉   | 2090/2994 [00:09<00:04, 219.56it/s]


KeyboardInterrupt: ignored