In [None]:
import spacy
get_pos = spacy.load("en_core_web_sm")
from gensim.models import Word2Vec
from collections import Counter
import pandas as pd
from nltk.tokenize import word_tokenize
import torch
import numpy as np
import torch.nn as nn
from torch.nn import init
import torch.optim as optim
import math
import random
import os
import time
from tqdm import tqdm
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()


# Model / training hyperparameters shared by the cells below.
EMBED_DIM, HIDDEN_DIM = 32, 16
LAYERS, EPOCHS, LR = 1, 1000, 3e-4

# Read the three splits in a fixed order: train, dev, test.
data = [pd.read_csv(csv_path, encoding='latin-1')
        for csv_path in ('./train.csv', './dev.csv', './test.csv')]
train_data, dev_data, test_data = data

In [99]:
def get_sentiment_scores(sentence):
    '''
    Return the polarity scores of ``sentence`` as a plain list.

    The module-level ``analyser`` is queried exactly once (the previous
    version re-ran the analysis for every key). Values come back in the
    order of the dictionary the analyser returns, which the original author
    documented as:

    [negative, neutral, positive, compound]
    '''
    scores = analyser.polarity_scores(sentence)
    return list(scores.values())

In [101]:
# data = replace_sparse_words(train_data, dev_data, test_data)

In [240]:
# Token texts of every sentence seen by tag_pos; filled as a side effect so
# the same tokenisation can later be fed to Word2Vec.
list_of_sentences = []
def tag_pos(data):
    '''
    Part-of-speech tag the sentence columns of every DataFrame in ``data``.

    For each row, columns at positions 1..6 are run through the module-level
    ``get_pos`` pipeline (presumably four story sentences followed by two
    candidate endings -- confirm against the CSV header).

    Side effect: appends one list of token texts per sentence to the
    module-level ``list_of_sentences``.

    Returns a tuple ``(all_pos, pos_counts)`` where
    ``all_pos[split][row][sentence]`` is the list of tags for that sentence
    and ``pos_counts`` is a Counter over every tag seen.
    '''
    all_pos = []
    pos_counts = Counter()

    for df in data:
        df_pos = []
        # Plain tuples give purely positional access; indexing a
        # label-indexed Series with an integer is deprecated in pandas.
        for values in df.itertuples(index=False, name=None):
            row_pos = []
            for i in range(1, 7):
                parts_of_speech = get_pos(values[i])
                list_of_sentences.append([token.text for token in parts_of_speech])
                sentence_pos = [token.pos_ for token in parts_of_speech]
                row_pos.append(sentence_pos)
                pos_counts.update(sentence_pos)
            df_pos.append(row_pos)
        all_pos.append(df_pos)

    return all_pos, pos_counts

In [241]:
# Tag all three splits once; pos_data is the (all_pos, pos_counts) tuple
# returned by tag_pos and is read by get_one_hot below.
pos_data = tag_pos(data)

In [243]:
# Persist the string form of the tagging results for later inspection.
with open('save.txt', 'w') as file:
    print(pos_data, end='', file=file)

In [214]:
# One-off training step, kept for reference: fit Word2Vec on the token lists
# collected in list_of_sentences and save the model under a name that
# encodes EMBED_DIM.
# NOTE(review): `size=` looks like the gensim 3.x keyword (4.x uses
# `vector_size`) -- confirm against the installed gensim before re-enabling.
# model = Word2Vec(list_of_sentences, size=EMBED_DIM, min_count=1)
# name = 'new_word2vec' + str(EMBED_DIM) + '.model'
# model.save(name)

# Load the previously saved model whose file name matches EMBED_DIM.
w2v = 'new_word2vec' + str(EMBED_DIM) + '.model'
WORD2VEC = Word2Vec.load(w2v)

In [245]:
# Spot-check the nesting: all_pos -> split -> row -> sentence -> token tag.
pos_data[0][0][0][0][0]

'PROPN'

In [247]:
def get_one_hot(group, row, sentence, word):
    '''
    One-hot encode the part-of-speech tag of a single token.

    ``group``, ``row``, ``sentence`` and ``word`` index into the nested tag
    lists stored in ``pos_data[0]`` (split, row, sentence, token). The
    vector has one slot per distinct tag in ``pos_data[1]``, in that
    mapping's iteration order, with a 1 at the slot of the token's tag.
    '''
    pos = pos_data[0][group][row][sentence][word]
    # Walk the tags directly instead of rebuilding the key list per slot.
    return [1 if pos == tag else 0 for tag in pos_data[1]]

In [253]:
def new_word_tokenize(sentence):
    '''
    Split ``sentence`` into token texts using the module-level ``get_pos``
    pipeline, so tokens line up with the tags produced by tag_pos.
    '''
    token_texts = []
    for token in get_pos(sentence):
        token_texts.append(token.text)
    return token_texts

In [260]:
def create_vector(i, j, row, sentence, word, group):
    '''
    Build the feature vector of one token: its Word2Vec embedding followed
    by the one-hot encoding of its part-of-speech tag.

    ``row`` is an ``(index, Series)`` pair as yielded by ``iterrows``; only
    the index is used. ``i`` is the 1-based sentence column and ``j`` the
    token position. ``sentence`` is accepted but not read.
    '''
    embedding = list(WORD2VEC.wv[word])
    row_index = row[0]
    pos_one_hot = get_one_hot(group, row_index, i - 1, j)
    return np.array(embedding + pos_one_hot)

In [261]:
def _embed_split(frame, group, with_label):
    '''
    Embed every row of one split.

    Each output sample is ``[first column, vectors of column 1, ..., vectors
    of column 6]`` with the value of column 7 appended when ``with_label``
    is true. ``group`` is the split index passed to create_vector.
    '''
    samples = []
    for row in frame.iterrows():
        # row is (index, Series); .iloc keeps the access positional whatever
        # the column labels are.
        values = row[1]
        sample = [values.iloc[0]]
        for i in range(1, 7):
            text = values.iloc[i]
            sample.append([create_vector(i, j, row, text, word, group)
                           for j, word in enumerate(new_word_tokenize(text))])
        if with_label:
            sample.append(values.iloc[7])
        samples.append(sample)
    return samples


def embed(train_data, dev_data, test_data):
    '''
    Turn the three DataFrames into lists of embedded samples.

    Train and dev samples carry the value of column 7 as their last
    element; test samples do not. Split indices 0, 1 and 2 are used for
    train, dev and test respectively when looking up POS tags.

    Returns ``(training_data, development_data, testing_data)``.
    '''
    training_data = _embed_split(train_data, 0, True)
    development_data = _embed_split(dev_data, 1, True)
    testing_data = _embed_split(test_data, 2, False)
    return training_data, development_data, testing_data

In [None]:
# Embed all three splits; data is [train_data, dev_data, test_data].
training_data, development_data, testing_data = embed(data[0], data[1], data[2])

In [252]:
# with open('training_embed.txt', 'w') as file:
#     file.write(str(training_data))
# with open('development_embed.txt', 'w') as file:
#     file.write(str(development_data))
# with open('testing_embed.txt', 'w') as file:
#     file.write(str(testing_data))

In [189]:
# Sanity check: printing the pipeline result shows the sentence text.
print(get_pos('Rick grew up in a troubled household.'))

Rick grew up in a troubled household.


In [None]:
class NSP(nn.Module):
    '''
    Two-GRU classifier over an embedded sample.

    ``gru1`` reads the word vectors of sample entries 1-5 and its final
    hidden state seeds ``gru2``, which reads entries 1-4 plus entry 6
    (presumably: shared context with the first and the second candidate
    ending -- confirm). The last output of ``gru2`` is projected to two
    log-probabilities.

    input_dim: width of one word vector. Defaults to
        ``EMBED_DIM + len(pos_data[1])`` because create_vector appends a POS
        one-hot to every Word2Vec embedding.
    '''
    def __init__(self, input_dim=None):
        super(NSP, self).__init__()
        if input_dim is None:
            input_dim = EMBED_DIM + len(pos_data[1])
        # `bidirectional` was misspelled, which made nn.GRU raise TypeError.
        self.gru1 = nn.GRU(input_dim, HIDDEN_DIM, LAYERS, batch_first=True, bidirectional=False)
        self.gru2 = nn.GRU(input_dim, HIDDEN_DIM, LAYERS, batch_first=True, bidirectional=False)
        # forward() feeds the bare GRU output, so the layer takes HIDDEN_DIM
        # inputs. NOTE(review): the original size was HIDDEN_DIM+18+4,
        # presumably reserving room for POS/sentiment features that are never
        # concatenated; widen again once those features are wired in.
        self.linear = nn.Linear(HIDDEN_DIM, 2)
        self.softmax = nn.LogSoftmax(dim=0)
        self.criterion = nn.NLLLoss()
        self.optimizer = optim.Adam(self.parameters(), lr=LR)

    @staticmethod
    def _to_sequence(sentences):
        '''Flatten sentences (lists of word vectors) into a (1, words, dim) float tensor.'''
        words = [vector for sentence in sentences for vector in sentence]
        return torch.as_tensor(np.stack(words, axis=0), dtype=torch.float32).unsqueeze(0)

    def setup(self, data):
        '''
        Build the two GRU inputs from one sample.

        Sentences have different lengths, so their word vectors are joined
        along the time axis instead of being stacked as equal-shaped arrays.
        Returns ``(input_1, input_2)``, each of shape (1, total_words, dim).
        '''
        input_1 = self._to_sequence(data[1:6])
        input_2 = self._to_sequence(data[1:5] + [data[6]])
        return input_1, input_2

    def compute_Loss(self, predicted_vector, gold_label):
        '''Negative log-likelihood of ``gold_label`` under ``predicted_vector``.'''
        return self.criterion(predicted_vector, gold_label)

    def forward(self, data):
        '''Return a length-2 tensor of log-probabilities for ``data``.'''
        input_1, input_2 = self.setup(data)
        # Follow the module's own device so inputs, state and weights agree.
        device = next(self.parameters()).device
        input_1 = input_1.to(device)
        input_2 = input_2.to(device)
        h_0 = torch.zeros((LAYERS, 1, HIDDEN_DIM), device=device)
        __, h_n = self.gru1(input_1, h_0)
        output, __ = self.gru2(input_2, h_n)
        # Last time step of the single batch element.
        x = output[0][-1]
        x = self.linear(x)
        x = self.softmax(x)
        return x

In [186]:
def main():
    '''
    Train an NSP model for EPOCHS epochs and evaluate it on the dev set
    after every epoch.

    Reads the module-level ``training_data`` / ``development_data`` lists
    (each sample: seven inputs followed by the gold label) and shuffles
    them in place. A checkpoint is written whenever dev accuracy beats both
    the previous best and 0.67; the final weights are always saved.

    NOTE(review): ``PATH`` (checkpoint prefix) is not defined in the visible
    cells and must exist before calling this.
    NOTE(review): NLLLoss over two classes needs labels in {0, 1}; the
    commented-out preprocessing subtracts 1 from the label column, so the
    raw labels may need the same shift -- confirm.
    '''
    print('Initializing Model')
    model = NSP()
    prev_dev_acc = 0.0
    for epoch in range(EPOCHS):
        checkpoint = PATH + '-e' + str((epoch + 1))

        # ---- training pass ----
        model.train()
        correct = 0
        total = 0
        print('Training started for epoch {}'.format(epoch + 1))
        random.shuffle(training_data)
        for sample in tqdm(training_data):
            model.optimizer.zero_grad()
            input_vector = sample[:7]
            gold_label = sample[7]
            predicted_vector = model(input_vector)
            predicted_label = torch.argmax(predicted_vector)
            correct += int(predicted_label == gold_label)
            total += 1
            # Build the target on the prediction's device so the loss never
            # mixes devices.
            target = torch.tensor([gold_label], device=predicted_vector.device)
            loss = model.compute_Loss(predicted_vector.view(1, -1), target)
            loss.backward()
            model.optimizer.step()
        print('Training accuracy for epoch {}: {}'.format(epoch + 1, correct / total))

        # ---- development pass ----
        correct = 0
        total = 0
        random.shuffle(development_data)
        model.eval()
        # The model consumes the full seven-element sample (setup reads
        # data[6]), so dev samples are scored exactly like training samples;
        # the former six-element slices could not be processed by setup.
        with torch.no_grad():
            for sample in tqdm(development_data):
                gold_label = sample[7]
                predicted_label = int(torch.argmax(model(sample[:7])))
                correct += int(predicted_label == gold_label)
                total += 1
        dev_acc = correct / total
        if dev_acc > prev_dev_acc and dev_acc > 0.67:
            prev_dev_acc = dev_acc
            print('New Best Accuracy: {}'.format(dev_acc))
            acc = int(100 * dev_acc)
            torch.save(model.state_dict(), checkpoint + '-a' + str(acc) + '.pt')
        print('Development accuracy for epoch {}: {}'.format(epoch + 1, correct / total))

    torch.save(model.state_dict(), PATH + '-final.pt')

In [108]:
# def replace_sparse_words(train_data, dev_data, test_data):
#     train_data = train_data.to_numpy()
#     dev_data = dev_data.to_numpy()
#     test_data = test_data.to_numpy()
#     # Count words

#     seen_vocab = {}
#     for row in train_data:
#         for i in range(1, 7):
#             for word in word_tokenize(row[i]):
#                 if seen_vocab.get(word) is None:
#                     seen_vocab[word] = 1
#                 else:
#                     seen_vocab[word] += 1
#     for row in dev_data:
#         for i in range(1, 7):
#              for word in word_tokenize(row[i]):
#                 if seen_vocab.get(word) is None:
#                     seen_vocab[word] = 1
#                 else:
#                     seen_vocab[word] += 1
#     for row in test_data:
#         for i in range(1, 7):
#              for word in word_tokenize(row[i]):
#                 if seen_vocab.get(word) is None:
#                     seen_vocab[word] = 1
#                 else:
#                     seen_vocab[word] += 1

#     # Replace words
#     new_train = []
#     for row in train_data:
#         new_pos = [row[0]]
#         new_neg = [row[0]]
#         for i in range(1, 5):
#             new_sentence = []
#             for word in word_tokenize(row[i]):
#                 new_sentence.append(word if seen_vocab[word] > 3 else '<UNK>')
#             new_pos.append(new_sentence)
#             new_neg.append(new_sentence)
#         pos_sentence = []
#         neg_sentence = []
#         if row[7] == 1:
#             for word in word_tokenize(row[5]):
#                 pos_sentence.append(word if seen_vocab[word] > 3 else '<UNK>')
#             for word in word_tokenize(row[6]):
#                 neg_sentence.append(word if seen_vocab[word] > 3 else '<UNK>')
#         else:
#             for word in word_tokenize(row[6]):
#                 pos_sentence.append(word if seen_vocab[word] > 3 else '<UNK>')
#             for word in word_tokenize(row[5]):
#                 neg_sentence.append(word if seen_vocab[word] > 3 else '<UNK>')
#         new_pos.append(pos_sentence)
#         new_neg.append(neg_sentence)
#         new_pos.append(row[7]-1)
#         new_neg.append(row[7]-1)
#         new_train.append(new_pos)
#         new_train.append(new_neg)

#     new_dev = []
#     for row in dev_data:
#         new_sample = [row[0]]
#         for i in range(1, 7):
#             new_sentence = []
#             for word in word_tokenize(row[i]):
#                 new_sentence.append(word if seen_vocab[word] > 3 else '<UNK>')
#             new_sample.append(new_sentence)
#         new_sample.append(row[7]-1)
#         new_dev.append(new_sample)

#     new_test = []
#     for row in test_data:
#         new_sample = [row[0]]
#         for i in range(1, 7):
#             new_sentence = []
#             for word in word_tokenize(row[i]):
#                 new_sentence.append(word if seen_vocab[word] > 3 else '<UNK>')
#             new_sample.append(new_sentence)
#         new_test.append(new_sample)

#     return pd.DataFrame(new_train), pd.DataFrame(new_dev), pd.DataFrame(new_test)
