# IMPORTS

In [None]:
import json
import gensim.downloader as api
from gensim.utils import tokenize
import numpy as np 

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

! pip install datasets transformers # uncomment to install if needed
from transformers import AutoTokenizer, AutoModelForSequenceClassification, default_data_collator, TrainingArguments, Trainer, pipeline
from datasets import Dataset, load_metric


Collecting datasets
[?25l  Downloading https://files.pythonhosted.org/packages/46/1a/b9f9b3bfef624686ae81c070f0a6bb635047b17cdb3698c7ad01281e6f9a/datasets-1.6.2-py3-none-any.whl (221kB)
[K     |████████████████████████████████| 225kB 8.5MB/s 
[?25hCollecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 14.1MB/s 
Collecting huggingface-hub<0.1.0
  Downloading https://files.pythonhosted.org/packages/32/a1/7c5261396da23ec364e296a4fb8a1cd6a5a2ff457215c6447038f18c0309/huggingface_hub-0.0.9-py3-none-any.whl
Collecting fsspec
[?25l  Downloading https://files.pythonhosted.org/packages/bc/52/816d1a3a599176057bf29dfacb1f8fadb61d35fbd96cb1bab4aaa7df83c0/fsspec-2021.5.0-py3-none-any.whl (111kB)
[K     |████████████████████████████████| 112kB 50.9MB/s 
Collecting xxhash
[?25l  Downloading https://files.pytho

In [None]:
# Load the pretrained 'glove-wiki-gigaword-50' vectors through gensim's downloader.
# get_word() below looks tokens up in this object.
word_embs = api.load('glove-wiki-gigaword-50')



------
# GRU MODEL

In [None]:
class GRU_RNN(nn.Module):
    """Document classifier: a GRU encoder followed by a linear layer and a softmax.

    Every document is run through the GRU on its own (batch size 1) and the GRU
    output at the last time step is used as the document vector.

    Args:
        embedding_dim: size of each input token embedding.
        gru_hidden_dim: size of the GRU hidden state.
        number_of_labels: number of output classes.
    """

    def __init__(self, embedding_dim, gru_hidden_dim, number_of_labels):
        super().__init__()

        self.gru = nn.GRU(embedding_dim, gru_hidden_dim)

        self.linearClassifier = nn.Linear(gru_hidden_dim, number_of_labels)

    def forward(self, X):
        """Return class probabilities for every document in ``X``.

        Args:
            X: a non-empty list of FloatTensors, one per document, each a
                (num_tokens, embedding_dim) matrix of input embeddings.

        Returns:
            FloatTensor of shape (len(X), number_of_labels); each row sums to 1.
        """
        doc_vecs = []
        for doc in X:
            # nn.GRU expects (seq_len, batch, features): add a batch axis of size 1.
            outputs, _ = self.gru(doc.unsqueeze(1))
            # Output at the last time step of the single batch element.
            doc_vecs.append(outputs[-1, 0])

        logits = self.linearClassifier(torch.stack(doc_vecs))

        # Normalise over the label axis explicitly; calling softmax without
        # `dim` relies on a deprecated implicit choice and emits a warning.
        return F.softmax(logits, dim=1)


In [None]:
def get_word(word):
    """Look up the embedding for `word`, using the 'unk' entry when it is missing.

    Reads the module-level `word_embs`; a KeyError from the lookup triggers
    the fallback.
    """
    try:
        vector = word_embs[word]
    except KeyError:
        vector = word_embs['unk']
    return vector

In [None]:
# Read the training and dev splits from JSON files in the working directory.
# `data` and `dev` are consumed by the tokenisation and embedding cells below.
data = []
dev = []
with open('music_QA_train.json', 'r') as infile:
    data = json.load(infile)
with open('music_QA_dev.json', 'r') as infile:
    dev = json.load(infile)

In [None]:
# Attach lowercased gensim token lists to every record of both splits (in place).
for split in (data, dev):
    for record in split:
        question_tokens = tokenize(record['question'], lowercase=True)
        passage_tokens = tokenize(record['passage'], lowercase=True)
        record['question_toks'] = list(question_tokens)
        record['passage_toks'] = list(passage_tokens)

In [None]:
def _embed_records(records):
    """Turn QA records into (list of embedding tensors, list of 0/1 labels).

    For each record the passage tokens followed by the question tokens are
    looked up with get_word(); the label is 1 when record['label'] == True.
    """
    tensors, labels = [], []
    for record in records:
        doc_emb = [get_word(word) for word in (record['passage_toks'] + record['question_toks'])]
        # Stack into a single ndarray first: building a tensor directly from a
        # list of numpy arrays is very slow (assumes get_word returns ndarrays).
        tensors.append(torch.tensor(np.array(doc_emb), dtype=torch.float32))
        labels.append(1 if record['label'] == True else 0)
    return tensors, labels


X, y = _embed_records(data)       # training inputs and labels
X_test, yt = _embed_records(dev)  # dev inputs and labels

# Float targets, as required by the BCELoss used in the training cell.
y_train = torch.FloatTensor(y)
y_test = torch.FloatTensor(yt)

In [None]:
# embedding_dim=50, gru_hidden_dim=50, two labels.
# (50 presumably matches the 'glove-wiki-gigaword-50' vectors — confirm.)
model = GRU_RNN(50, 50, 2)
sgd = torch.optim.SGD(model.parameters(), lr=0.1)
loss_func = nn.BCELoss() # MSELoss works better

# training loop: full-batch gradient descent, one forward/backward pass over all of X per epoch
epochs = 10
model.train()
for i in range(epochs):
    sgd.zero_grad()

    # forward pass: the model returns [P(label 0), P(label 1)] per document;
    # BCELoss needs one probability per example, so keep column 1.
    y_pred = model(X)[:, 1]

    loss = loss_func(y_pred, y_train)
    # backward:
    loss.backward()
    sgd.step()
    print("  epoch: %d, loss: %f" %(i, loss.item()))



  epoch: 0, loss: 0.698151
  epoch: 1, loss: 0.686873
  epoch: 2, loss: 0.681016
  epoch: 3, loss: 0.677850
  epoch: 4, loss: 0.676031
  epoch: 5, loss: 0.674891
  epoch: 6, loss: 0.674094
  epoch: 7, loss: 0.673473
  epoch: 8, loss: 0.672942
  epoch: 9, loss: 0.672459


In [None]:
# Predict on the held-out dev split (X_test is built from music_QA_dev.json).
model.eval()  # leave training mode before inference
with torch.no_grad():
    ytest_pred = model(X_test)



In [None]:
# Hard 0/1 decision per example: 1 when P(label 1) is strictly larger than P(label 0).
predictions = [int(pred[1] > pred[0]) for pred in ytest_pred]
# Count the examples whose prediction matches the gold label.
correct = sum(1 for guess, gold in zip(predictions, y_test) if guess == gold)
# Fraction of correct answers (displayed as the cell output).
correct / len(y_test)

0.7294117647058823

---------
# Transformer Models

In [None]:
# Reload the raw splits under new names so the transformer section starts from
# records that do not carry the token lists added for the GRU model.
train_data = []
dev_data = []
with open('music_QA_train.json', 'r') as infile:
    train_data = json.load(infile)
with open('music_QA_dev.json', 'r') as infile:
    dev_data = json.load(infile)

In [None]:
# Rewrite each record's label in place as an int (1 if truthy, else 0) and keep a
# parallel list with the same labels rendered as strings.
labels_train = []
labels_dev = []
for records, label_strings in ((train_data, labels_train), (dev_data, labels_dev)):
  for d in records:
    d['label'] = 1 if d['label'] else 0
    label_strings.append(str(d['label']))

In [None]:
# Drop the last three examples from each split and from the parallel label lists
# so records and labels stay aligned.
# NOTE(review): the reason for discarding these examples is not evident from the
# notebook — confirm. The cell is not idempotent: every re-run removes three more.
train_data = train_data[:-3]
dev_data = dev_data[:-3]
labels_train = labels_train[:-3]
labels_dev = labels_dev[:-3]

In [None]:
model_checkpoint = "bert-base-uncased"  # checkpoint name shared by the tokenizer and the model
batch_size = 8  # per-device train/eval batch size passed to TrainingArguments below
num_labels = 2  # size of the classification head (labels were mapped to 0/1 above)
# NOTE(review): this rebinds `loss_func` from the GRU section, and none of the
# Trainer cells below are given it — it appears to be unused.
loss_func = nn.BCEWithLogitsLoss()

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Rebinds `model` (previously the GRU classifier) to a sequence-classification model.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels = num_labels)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
# Pivot each list of row dicts into {column name: list of values}, using the keys
# of the first row, then wrap the columns in a `datasets.Dataset`.
train_columns = {column: [row[column] for row in train_data] for column in train_data[0]}
dev_columns = {column: [row[column] for row in dev_data] for column in dev_data[0]}

train_dataset = Dataset.from_dict(train_columns)
dev_dataset = Dataset.from_dict(dev_columns)

In [None]:
# Taken from Hugging Face [SQuAD|Text Classification] Tutorial
def preprocess_fn_dev(data):
  """Tokenize a batch of dev rows using the module-level `key_1` / `key_2`.

  With `key_2` set to None only `data[key_1]` is encoded; otherwise
  `data[key_1]` and `data[key_2]` are encoded as a sequence pair.

  The single-key branch used to pass `labels_dev` as the tokenizer's second
  sequence, which wrote the gold labels into the model input (label leakage).
  The encoding is now delegated to `preprocess_fn`, which never sees labels.
  """
  return preprocess_fn(data)

def preprocess_fn_train(data):
  """Tokenize a batch of training rows using the module-level `key_1` / `key_2`.

  With `key_2` set to None only `data[key_1]` is encoded; otherwise
  `data[key_1]` and `data[key_2]` are encoded as a sequence pair.

  The single-key branch used to pass `labels_train` as the tokenizer's second
  sequence, which wrote the gold labels into the model input (label leakage).
  The encoding is now delegated to `preprocess_fn`, which never sees labels.
  """
  return preprocess_fn(data)

def preprocess_fn(data):
  """Tokenize `data[key_1]`, paired with `data[key_2]` when `key_2` is not None.

  `key_1`, `key_2` and `tokenizer` are module-level names; truncation is enabled.
  """
  texts = (data[key_1],) if key_2 is None else (data[key_1], data[key_2])
  return tokenizer(*texts, truncation=True)

In [None]:
# GLUE/QNLI metric object; compute_metrics() below calls metric.compute on it.
metric = load_metric('glue', 'qnli')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1848.0, style=ProgressStyle(description…




In [None]:
def compute_metrics(eval_pred):
  """Score a (predictions, labels) pair with the module-level `metric`.

  The predictions array is reduced to class ids with an argmax over axis 1
  before being compared against the reference labels.
  """
  scores, references = eval_pred
  predicted_ids = torch.from_numpy(scores).argmax(dim=1)
  return metric.compute(predictions=predicted_ids, references=references)

### QUESTION-ONLY MODEL

In [None]:
# Question-only view of both splits: the 'passage' column is removed.
q_training_data = train_dataset.remove_columns('passage')
q_dev_data = dev_dataset.remove_columns('passage')

In [None]:
# Display the remaining columns and the row count.
q_training_data

Dataset({
    features: ['label', 'question', 'idx'],
    num_rows: 416
})

In [None]:
# Columns read by the preprocess_fn* functions: encode the question alone.
key_1 = 'question'
key_2 = None

In [None]:
# Tokenize both question-only splits; batched=True passes batches of rows to the
# mapping function instead of one row at a time.
encoded_train_dataset = q_training_data.map(preprocess_fn_train, batched=True)
encoded_dev_dataset = q_dev_data.map(preprocess_fn_dev, batched=True)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [None]:
# Taken from Hugging Face [SQuAD|Text Classification] Tutorial
# Training configuration for the question-only run.
args = TrainingArguments(
    "test-squad",  # output directory
    evaluation_strategy = "epoch",  # evaluate on eval_dataset after every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
)

# Fine-tunes the module-level `model`; compute_metrics supplies the accuracy column.
trainer = Trainer(
    model,
    args,
    train_dataset = encoded_train_dataset,
    eval_dataset = encoded_dev_dataset,
    tokenizer = tokenizer,
    compute_metrics = compute_metrics
)

In [None]:
# Fine-tune on the question-only encoding (updates `model` in place).
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.002553,1.0
2,No log,0.000815,1.0
3,No log,0.00055,1.0
4,No log,0.000448,1.0
5,No log,0.00042,1.0


TrainOutput(global_step=260, training_loss=0.038654151329627406, metrics={'train_runtime': 45.1228, 'train_samples_per_second': 5.762, 'total_flos': 21404516534112.0, 'epoch': 5.0, 'init_mem_cpu_alloc_delta': 1735163904, 'init_mem_gpu_alloc_delta': 439072256, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 11640832, 'train_mem_gpu_alloc_delta': 1319731712, 'train_mem_cpu_peaked_delta': 0, 'train_mem_gpu_peaked_delta': 122202624})

In [None]:
# Final metrics on the question-only dev encoding.
trainer.evaluate()

{'epoch': 5.0,
 'eval_accuracy': 1.0,
 'eval_loss': 0.0004199657996650785,
 'eval_mem_cpu_alloc_delta': 0,
 'eval_mem_cpu_peaked_delta': 0,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_gpu_peaked_delta': 6774784,
 'eval_runtime': 0.3826,
 'eval_samples_per_second': 214.328}

### PASSAGE-ONLY MODEL

In [None]:
# Passage-only view of both splits: the 'question' column is removed.
p_training_data = train_dataset.remove_columns('question')
p_dev_data = dev_dataset.remove_columns('question')

In [None]:
# Columns read by the preprocess_fn* functions: encode the passage alone.
key_1 = 'passage'
key_2 = None

In [None]:
# Tokenize both passage-only splits; this rebinds the encoded_* names used by
# the Trainer cell that follows.
encoded_train_dataset = p_training_data.map(preprocess_fn_train, batched=True)
encoded_dev_dataset = p_dev_data.map(preprocess_fn_dev, batched=True)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [None]:
# Taken from Hugging Face [SQuAD|Text Classification] Tutorial
# Start from a fresh copy of the pretrained checkpoint: `model` has already been
# fine-tuned by the preceding Trainer run, and reusing it would carry those
# weights into this experiment instead of training an independent baseline.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels = num_labels)

# Training configuration for the passage-only run.
args = TrainingArguments(
    "test-squad",  # output directory
    evaluation_strategy = "epoch",  # evaluate on eval_dataset after every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
)

trainer = Trainer(
    model,
    args,
    train_dataset = encoded_train_dataset,
    eval_dataset = encoded_dev_dataset,
    tokenizer = tokenizer,
    compute_metrics = compute_metrics
)

In [None]:
# Fine-tune on the passage-only encoding (updates `model` in place).
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,2.3e-05,1.0
2,No log,1.2e-05,1.0
3,No log,9e-06,1.0
4,No log,7e-06,1.0
5,No log,7e-06,1.0


TrainOutput(global_step=260, training_loss=5.38034257120811e-05, metrics={'train_runtime': 132.5473, 'train_samples_per_second': 1.962, 'total_flos': 364086989933664.0, 'epoch': 5.0, 'init_mem_cpu_alloc_delta': 0, 'init_mem_gpu_alloc_delta': 0, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': -1388544, 'train_mem_gpu_alloc_delta': 878710784, 'train_mem_cpu_peaked_delta': 1437696, 'train_mem_gpu_peaked_delta': 5898790400})

In [None]:
# Final metrics on the passage-only dev encoding.
trainer.evaluate()

{'epoch': 5.0,
 'eval_accuracy': 1.0,
 'eval_loss': 6.8850399657094385e-06,
 'eval_mem_cpu_alloc_delta': 12288,
 'eval_mem_cpu_peaked_delta': 0,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_gpu_peaked_delta': 123987456,
 'eval_runtime': 1.4276,
 'eval_samples_per_second': 57.438}

### COMBINED TRANSFORMER

In [None]:
# Columns read by the preprocess_fn* functions: encode (question, passage) pairs.
key_1 = 'question'
key_2 = 'passage'

In [None]:
# Tokenize the full datasets (both text columns present); this rebinds the
# encoded_* names used by the Trainer cell that follows.
encoded_train_dataset = train_dataset.map(preprocess_fn_train, batched=True)
encoded_dev_dataset = dev_dataset.map(preprocess_fn_dev, batched=True)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [None]:
# Taken from Hugging Face [SQuAD|Text Classification] Tutorial
# Start from a fresh copy of the pretrained checkpoint: `model` has already been
# fine-tuned by the earlier Trainer runs, and reusing it would carry those
# weights into the combined question+passage experiment.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels = num_labels)

# Training configuration for the combined run.
args = TrainingArguments(
    "test-squad",  # output directory
    evaluation_strategy = "epoch",  # evaluate on eval_dataset after every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
)

trainer = Trainer(
    model,
    args,
    train_dataset = encoded_train_dataset,
    eval_dataset = encoded_dev_dataset,
    tokenizer = tokenizer,
    compute_metrics = compute_metrics
)

In [None]:
# Fine-tune on the (question, passage) pair encoding (updates `model` in place).
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.645334,0.707317
2,No log,0.648377,0.695122
3,No log,0.831167,0.609756
4,No log,0.890333,0.670732
5,No log,0.913184,0.646341


TrainOutput(global_step=260, training_loss=0.4368588374211238, metrics={'train_runtime': 140.5366, 'train_samples_per_second': 1.85, 'total_flos': 375559138127616.0, 'epoch': 5.0, 'init_mem_cpu_alloc_delta': 0, 'init_mem_gpu_alloc_delta': 0, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': -2019328, 'train_mem_gpu_alloc_delta': 878712320, 'train_mem_cpu_peaked_delta': 2068480, 'train_mem_gpu_peaked_delta': 6065515008})

In [None]:
# Final metrics on the combined dev encoding.
trainer.evaluate()

{'epoch': 5.0,
 'eval_accuracy': 0.6463414634146342,
 'eval_loss': 0.9131839871406555,
 'eval_mem_cpu_alloc_delta': 12288,
 'eval_mem_cpu_peaked_delta': 0,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_gpu_peaked_delta': 129245696,
 'eval_runtime': 1.4661,
 'eval_samples_per_second': 55.932}

### TESTING ON TEST SET

In [None]:
# Load the test split from JSON; later cells read each record's 'question',
# 'passage' and 'idx' fields.
test_data = []
with open('music_QA_test.json', 'r') as infile:
    test_data = json.load(infile)

In [None]:
# One input string per test example: question, a space, then passage. The space
# keeps the last question word from fusing with the first passage word.
# Strings are cut to 512 characters (characters, not tokens).
# NOTE: `input` shadows the builtin, but the prediction cell below reads this
# name, so it is kept.
input = []
for d in test_data:
  text = d['question'] + ' ' + d['passage']
  input.append(text[:512])  # slicing leaves shorter strings unchanged


In [None]:
# Example ids in test-set order, for the submission file.
ids = [d['idx'] for d in test_data]

In [None]:
# Wrap the current `model` and `tokenizer` in a text-classification pipeline.
text_classifier_pipeline = pipeline(
    "text-classification",
    model = model,
    tokenizer = tokenizer,
    device=0  # device ordinal 0 — presumably the first CUDA GPU; requires one to be available
)

In [None]:
# Predict a 0/1 label for every test example.
# Each example is encoded as a (question, passage) pair with token-level
# truncation, the same way the preprocessing functions encode the data when
# key_1='question' and key_2='passage'. Feeding one concatenated,
# character-truncated string would not match that training-time encoding.
labels = []
model.eval()
with torch.no_grad():
  for d in test_data:
    encoded = tokenizer(d['question'], d['passage'], truncation=True, return_tensors='pt')
    encoded = encoded.to(model.device)  # same device as the fine-tuned model
    logits = model(**encoded).logits
    labels.append(int(logits.argmax(dim=-1)))

In [None]:
# NOTE(review): this import sits outside the notebook's import cell.
import pandas as pd

# Two-column submission table: example id and predicted label.
# NOTE: rebinds `data`, which earlier held the training records.
data = {'idx': ids, 'label': labels}
df = pd.DataFrame(data=data)

In [None]:
# Write the predictions to the working directory without the DataFrame index.
df.to_csv('solutions.csv', index=False)