# **GOOGLE DRIVE**

In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive so that
# files stored on Drive can be reached through ordinary paths.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Make the homework folder on the mounted Drive the working directory; the
# relative data-file names passed to loadData later are resolved against it.
%cd '/content/gdrive/MyDrive/Class/Year2/sem2/CSE354: Natural Language Processing/Homework4'

/content/gdrive/MyDrive/Class/Year2/sem2/CSE354: Natural Language Processing/Homework4


In [None]:
pip install torch==1.4.0 torchvision==0.5.0



# **Part 1**

## **1.1**



In [None]:
import torch
import torch.nn as nn  #pytorch
import json #for reading json encoded files into objects
import sys
import re #regular expressions
import numpy as np
import pandas as pd
import csv
from pprint import pprint
import gensim.downloader as api
from gensim.utils import tokenize
import torch.nn.functional as F
# Pretrained GloVe word vectors obtained through gensim's downloader API. The
# model name suggests 50-dimensional vectors, which is consistent with the
# embedding_dim=50 passed to the GRU model in this notebook.
word_embs = api.load('glove-wiki-gigaword-50')
# Disabled: would redirect everything printed afterwards into the file below.
# sys.stdout = open('a4_lastname_id_OUTPUT.txt', 'w')

In [None]:
def loadData(filename):
  """Read a JSON file and return the decoded Python object.

  Args:
    filename: path of the JSON file to read.

  Returns:
    Whatever `json.load` produces for the file's contents.

  Raises:
    OSError: if the file cannot be opened.
    json.JSONDecodeError: if the contents are not valid JSON.
  """
  # Decode explicitly as UTF-8 so the result does not depend on the platform's
  # default locale encoding (which can mis-decode or reject non-ASCII text).
  with open(filename, 'r', encoding='utf-8') as infile:
    return json.load(infile)

## **1.2**

In [None]:
def tokenize_data(data):
  """Attach lowercase token lists to every record, in place.

  For each record, the text under 'question' and 'passage' is passed through
  `tokenize` with lowercase=True and the resulting tokens are stored as lists
  under 'question_toks' and 'passage_toks'.

  Args:
    data: iterable of dict-like records holding 'question' and 'passage'.

  Returns:
    The same `data` object, with its records updated.
  """
  # (source key, destination key) pairs, processed in this order per record.
  fields = (('question', 'question_toks'), ('passage', 'passage_toks'))
  for entry in data:
    for text_key, toks_key in fields:
      entry[toks_key] = list(tokenize(entry[text_key], lowercase=True))
  return data

In [None]:
def get_embed(word):
  """Return the embedding vector for `word`, falling back to the 'unk' entry.

  Args:
    word: token to look up in the module-level `word_embs` vectors.

  Returns:
    The vector stored for `word` if it is present in `word_embs`, otherwise
    the vector stored for the token 'unk'.
  """
  # Membership tests and item access on the vectors object itself work on
  # both gensim 3.x and 4.x; the `.vocab` attribute was removed in gensim 4
  # and `word_vec` is deprecated there.
  key = word if word in word_embs else 'unk'
  return word_embs[key]

In [None]:
def get_inputs(data):
  """Convert every record into a tensor of word embeddings.

  For each record, the passage tokens followed by the question tokens are
  mapped through `get_embed`; the resulting vectors are collected into one
  array (one row per token) and wrapped as a torch tensor.

  Args:
    data: iterable of records holding 'passage_toks' and 'question_toks'.

  Returns:
    A list with one tensor per record, in the order of `data`.
  """
  def embed_record(record):
    # Passage first, then question, as a single token sequence.
    tokens = list(record['passage_toks']) + list(record['question_toks'])
    rows = [np.array(list(get_embed(tok))) for tok in tokens]
    return torch.from_numpy(np.array(rows))

  return [embed_record(record) for record in data]

## **1.3**

In [None]:
class GRU_RNN(nn.Module):
  """GRU encoder followed by a linear layer and a softmax over the labels.

  Every document is run through the GRU on its own (as a batch of one) and is
  represented by the GRU output at its last time step.
  """

  def __init__(self, embedding_dim=50, gru_hidden_dim=50, number_of_labels=2):
    """Create the GRU and the linear classification layer.

    Args:
      embedding_dim: input size of the GRU.
      gru_hidden_dim: hidden size of the GRU and input size of the classifier.
      number_of_labels: output size of the classifier.
    """
    super().__init__()
    self.gru = nn.GRU(embedding_dim, gru_hidden_dim)
    self.linearClassifier = nn.Linear(gru_hidden_dim, number_of_labels)

  def forward(self, X):
    """Compute label probabilities for a collection of documents.

    Args:
      X: iterable of per-document tensors; each one gets a batch axis inserted
        at position 1 before being passed to the GRU.

    Returns:
      Tensor with one row per document holding the softmax (over dim 1) of
      the classifier outputs.
    """
    def encode(doc):
      # Keep only the GRU output of the final time step.
      outputs, _ = self.gru(doc.unsqueeze(1))
      return outputs[-1]

    # Stack the per-document states and drop the singleton batch axis.
    final_states = torch.stack([encode(doc) for doc in X]).squeeze(1)
    logits = self.linearClassifier(final_states)
    return F.softmax(logits, dim=1)

In [None]:
# Load the train / dev / test splits from JSON files in the working directory.
train_data = loadData('music_QA_train.json')
trial_data = loadData('music_QA_dev.json')
test_data = loadData('music_QA_test.json')

# Add the 'question_toks' / 'passage_toks' token lists to every record.
train_data = tokenize_data(train_data)
trial_data = tokenize_data(trial_data)
test_data = tokenize_data(test_data)

# One embedding tensor per training record, plus one-hot targets:
# [0, 1] for a truthy 'label' and [1, 0] otherwise, converted to float
# because BCELoss compares them against the model's float probabilities.
train_input = get_inputs(train_data)
train_output = torch.from_numpy(np.array([[0,1] if x['label'] else [1,0] for x in train_data]))
train_output = train_output.type(torch.FloatTensor)


#Model setup:
# Seed PyTorch's RNG before building the model: the GRU and linear weights are
# randomly initialised, so without a fixed seed every run trains a different
# model and reports a different accuracy.
SEED = 42
torch.manual_seed(SEED)
learning_rate, epochs = 0.1, 10
model = GRU_RNN(50, 50, 2)
sgd = torch.optim.SGD(model.parameters(), lr=learning_rate)
lossfunc = nn.BCELoss()

# # training loop:
# Each epoch is one full-batch gradient step over all training records.
for i in range(epochs):
  model.train()
  sgd.zero_grad()
  #forward pass:
  ypred = model(train_input)
  loss = lossfunc(ypred, train_output)
  #backward:
  loss.backward()
  sgd.step()

## **1.4**

In [None]:
# Embed the dev ("trial") records and build their labels: one-hot float rows
# in trial_output, integer class ids (1 for a truthy 'label', else 0) in trial_y.
trial_input = get_inputs(trial_data)
trial_output = torch.from_numpy(np.array([[0,1] if x['label'] else [1,0] for x in trial_data]))
trial_y = torch.from_numpy(np.array([1 if x['label'] else 0 for x in trial_data]))
trial_output = trial_output.type(torch.FloatTensor)

# Only the forward pass needs gradient tracking switched off.
with torch.no_grad():
  trialpred_prob = model(trial_input).numpy()

# Predicted class = column holding the largest probability in each row;
# np.argmax returns the first such column when there is a tie. Kept as a list
# so the variable has the same type as before.
trialpred_class = list(np.argmax(trialpred_prob, axis=1))
count = np.sum(np.array(trialpred_class) == np.array(trial_y))
print("TRIAL DATA CORRECT: ", count, " out of ", len(trial_y))
print("TRIAL DATA ACCURACY: ", count/len(trial_y)*100, "%")

TRIAL DATA CORRECT:  61  out of  85
TRIAL DATA ACCURACY:  71.76470588235294 %
