## Q80

In [1]:
import pandas as pd
from pathlib import Path
from collections import Counter, defaultdict
import nltk
from nltk.tokenize import word_tokenize
import numpy as np

In [2]:
# Download NLTK's "punkt" package (a no-op when it is already installed, as the
# log below shows). Presumably this is the data word_tokenize relies on —
# confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ekupura/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Directory holding the train/valid/test splits, relative to the notebook.
dir_path = Path("./data")
# Column names assigned on load, because the files are read with header=None.
header = ["source", "target"]
# read_table uses a tab separator by default — assumes each line is
# "<text>\t<label>"; TODO confirm against the data files.
train = pd.read_table(dir_path / "train.txt", header=None, names=header)

In [4]:
def count_words(dataframe, head="source"):
    """Tally token frequencies over one text column.

    Each entry of ``dataframe[head]`` is split with ``word_tokenize`` and all
    resulting tokens are accumulated into a single ``Counter``.

    Args:
        dataframe: Object indexable by ``head`` that yields the texts to count.
        head: Name of the column holding the raw text.

    Returns:
        Counter mapping each token to its number of occurrences.
    """
    return Counter(
        token
        for sentence in dataframe[head]
        for token in word_tokenize(sentence)
    )

In [5]:
def generate_word2id(counter):
    """Assign frequency-ranked integer ids to words that occur more than once.

    Words are ordered by descending count and numbered from 1; ties keep the
    iteration order of ``counter`` because the sort is stable. Words with a
    count of 1 or less receive no entry, and since the result is a
    ``defaultdict(int)``, looking them up yields 0.

    Args:
        counter: Mapping from word to occurrence count.

    Returns:
        ``defaultdict(int)`` mapping word to id.
    """
    by_frequency = sorted(counter.items(), key=lambda item: item[1], reverse=True)
    # After the descending sort, the words with count > 1 form a prefix, so
    # numbering the filtered list from 1 gives each word its overall rank.
    frequent_words = [word for word, freq in by_frequency if freq > 1]

    word2id = defaultdict(int)
    for word_id, word in enumerate(frequent_words, start=1):
        word2id[word] = word_id
    return word2id

In [6]:
def generate_label2id(dataframe, head="target"):
    """Build a reproducible label -> class-index mapping.

    The distinct values of ``dataframe[head]`` are numbered 0..K-1 in sorted
    order. Enumerating a bare ``set`` (as was done before) gives an order that
    depends on string hashing, so the same data could get different ids on
    different interpreter runs.

    Args:
        dataframe: Object indexable by ``head`` that yields the labels.
        head: Name of the label column.

    Returns:
        ``defaultdict(int)`` mapping each distinct label to its index.
    """
    labels = list(dataframe[head])
    try:
        ordered_labels = sorted(set(labels))
    except TypeError:
        # Labels that cannot be compared with each other (mixed types): fall
        # back to order of first appearance, which is still deterministic.
        ordered_labels = list(dict.fromkeys(labels))

    label2id = defaultdict(int)
    for idx, target in enumerate(ordered_labels):
        label2id[target] = idx
    return label2id

In [7]:
# Token frequencies over the training texts only.
counter = count_words(train)
# Frequency-ranked ids (1 = most frequent); words seen once get no id and
# resolve to 0 on lookup.
word2id = generate_word2id(counter)
# Class indices for the labels present in the training split.
label2id = generate_label2id(train)

In [8]:
def make_converted_df(dataframe, word2id, label2id, source_head="source", target_head="target"):
    """Encode a text/label frame as word-id sequences and class indices.

    Each text is split with ``word_tokenize`` and every token is replaced by
    its id from ``word2id``; tokens without an id become 0. Each label is
    replaced by its index from ``label2id``.

    Lookups never modify the mappings that are passed in. Indexing a
    ``defaultdict`` with ``[]`` would insert every unseen token into the
    vocabulary and would silently encode an unseen label as class 0.

    Args:
        dataframe: Object indexable by ``source_head`` and ``target_head``.
        word2id: Mapping from token to integer id (plain dict or defaultdict).
        label2id: Mapping from label to class index.
        source_head: Name of the text column.
        target_head: Name of the label column.

    Returns:
        ``pd.DataFrame`` with the same two column names, holding lists of
        word ids and integer class indices.

    Raises:
        KeyError: If a label in ``dataframe`` has no entry in ``label2id``.
    """
    converted_sources, converted_targets = [], []
    for source, target in zip(dataframe[source_head], dataframe[target_head]):
        if target not in label2id:
            raise KeyError("label {!r} has no entry in label2id".format(target))
        # .get() keeps the 0-for-unknown convention without growing word2id.
        converted_sources.append([word2id.get(word, 0) for word in word_tokenize(source)])
        converted_targets.append(label2id[target])

    return pd.DataFrame({source_head: converted_sources, target_head: converted_targets})

In [9]:
# Encode the training split as word-id lists and class indices.
train_df = make_converted_df(train, word2id, label2id)

In [10]:
# The validation split is encoded with the vocabulary and label ids built from
# the training split, so words not kept from training map to 0.
valid = pd.read_table(dir_path / "valid.txt", header=None, names=header)
valid_df = make_converted_df(valid, word2id, label2id)

In [11]:
# The test split is encoded with the same training-derived vocabulary and
# label ids.
test = pd.read_table(dir_path / "test.txt", header=None, names=header)
test_df = make_converted_df(test, word2id, label2id)

In [12]:
# Inspect the encoded training frame (0 entries are words without an id).
train_df

Unnamed: 0,source,target
0,"[3439, 126, 5118, 115, 6632, 37, 3440, 1261, 5...",0
1,"[9, 6633, 644, 266, 2, 5120, 5121, 4126, 22, 7...",2
2,"[94, 96, 295, 0, 6, 146, 20, 0, 0]",2
3,"[679, 2272, 11, 679, 2272, 6, 0, 13, 0, 5, 776]",2
4,"[1795, 1, 1640, 224, 2, 6634, 1001, 27, 0, 10,...",0
...,...,...
10667,"[6351, 69, 35, 3492, 0, 5, 18, 0, 22, 420, 182...",2
10668,"[3110, 1734, 0, 11, 7517, 17, 55, 8885, 7, 0, ...",1
10669,"[9, 0, 0, 447, 12, 849, 111, 18, 1448, 300]",0
10670,"[6322, 1047, 0, 465, 2451, 12, 0, 127]",0


In [13]:
# Inspect the encoded validation frame.
valid_df

Unnamed: 0,source,target
0,"[126, 59, 0, 0, 5, 13, 0, 329, 5533]",0
1,"[2990, 1508, 7502, 7503, 0, 1177, 411, 171, 50...",2
2,"[2448, 8199, 29, 28, 6610, 1592, 0]",2
3,"[2778, 528, 19, 1797, 319, 2457, 41, 33, 3087,...",2
4,"[1284, 0, 3464, 0, 2, 655, 1641, 5194]",0
...,...,...
1329,"[500, 233, 64, 1655, 590, 1351, 21, 1811, 592,...",2
1330,"[328, 4669, 15, 4366, 992, 38, 20, 47, 23, 1034]",3
1331,"[0, 282, 1, 256, 4466, 852, 1, 0, 1001, 7, 4203]",0
1332,"[9, 0, 2444, 2, 0, 399, 6, 849, 1350, 853]",0


In [14]:
# Inspect the encoded test frame.
test_df

Unnamed: 0,source,target
0,"[738, 0, 4, 1854, 915, 44, 17, 3158, 0]",1
1,"[1352, 770, 4510, 19, 71, 0, 3234, 1, 43, 2344...",0
2,"[912, 466, 1398, 58, 203, 12, 71, 134, 6, 341,...",3
3,"[2470, 3549, 19, 2102, 1, 2225, 115, 21, 1708,...",0
4,"[377, 419, 0, 2589, 22, 8264, 5, 36, 252, 842]",3
...,...,...
1329,"[3028, 167, 5, 0, 473, 482, 2131, 27, 2578, 0]",0
1330,"[1028, 317, 5305, 0, 316, 293, 852]",3
1331,"[1717, 7391, 0, 1559, 4822, 1, 3034, 59, 172, ...",3
1332,"[2103, 4620, 664, 2, 437, 3838, 2131, 41, 20, ...",3


## Q81

In [26]:
# Target device for the model and batches; read as a global by the cells below.
device = "cpu"

In [27]:
import torch
import torch.nn.utils.rnn as rnn
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, f1_score

In [28]:
class RNN(torch.nn.Module):
    """Single-layer, unidirectional RNN classifier over word-id sequences.

    Pipeline: embedding lookup -> ``nn.RNN`` -> linear layer applied to the
    final hidden state. No masking or packing is performed, so every position
    of every row (padding included) is fed through the recurrence.

    Args:
        batch_size: Nominal batch size. It is stored for reference only;
            ``forward`` sizes its state from the actual input.
        output_size: Number of classes (width of the returned logits).
        hidden_size: Width of the RNN hidden state.
        vocab_size: Number of rows in the embedding table; must exceed the
            largest word id.
        embedding_length: Width of each word embedding.
    """

    def __init__(self, batch_size, output_size, hidden_size, vocab_size, embedding_length):
        super().__init__()

        self.batch_size = batch_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.embedding_length = embedding_length

        self.word_embeddings = nn.Embedding(vocab_size, embedding_length)
        self.rnn = nn.RNN(embedding_length, hidden_size, num_layers=1, bidirectional=False)
        self.label = nn.Linear(hidden_size, output_size)

    def forward(self, input_sentences, batch_size=None):
        """Compute class logits for a batch of word-id sequences.

        Args:
            input_sentences: Integer tensor of shape ``(batch, seq_len)``.
            batch_size: Optional explicit batch size for the initial hidden
                state. When omitted it is read from ``input_sentences``, so
                batches of any size work, including a short final batch.

        Returns:
            Tensor of shape ``(batch, output_size)`` with unnormalized logits.
        """
        # (batch, seq, emb) -> (seq, batch, emb): nn.RNN is time-major by default.
        input_embeddings = self.word_embeddings(input_sentences).permute(1, 0, 2)

        if batch_size is None:
            batch_size = input_embeddings.size(1)
        # new_zeros inherits device and dtype from the embeddings, so no
        # module-level `device` variable is needed here.
        h_0 = input_embeddings.new_zeros(1, batch_size, self.hidden_size)

        _, h_n = self.rnn(input_embeddings, h_0)

        # h_n is (num_layers=1, batch, hidden) -> (batch, hidden).
        h_n = h_n.permute(1, 0, 2).squeeze(1)
        logits = self.label(h_n)

        return logits

In [29]:
# First encoded training sentence as a batch of one: unsqueeze(0) adds the
# leading batch dimension.
sample_input = torch.tensor(train_df["source"][0]).unsqueeze(0)

In [30]:
# Embedding table size: largest assigned word id plus one, leaving row 0 for
# the words that resolve to id 0.
vocab_size = max(word2id.values()) + 1

In [31]:
# Untrained model; positional arguments follow RNN.__init__:
# batch_size=1, output_size=4, hidden_size=200, vocab_size, embedding_length=200.
model = RNN(1, 4, 200, vocab_size, 200)
# Forward pass on the single sample; the result is one row of 4 raw logits.
model(sample_input)

tensor([[ 0.2211,  0.1122, -0.1986,  0.0815]], grad_fn=<AddmmBackward>)

## Q82

In [364]:
from torch.utils.data import Dataset, DataLoader
import copy

In [366]:
class MyDataset(torch.utils.data.Dataset):
    """In-memory dataset of (word-id sequence, class index) pairs.

    Args:
        sources: Iterable of word-id sequences, one per example. Sequences
            may have different lengths.
        targets: Iterable of integer class indices aligned with ``sources``.

    Attributes are kept as tensors: ``sources`` is a list of 1-D long
    tensors and ``targets`` is one 1-D long tensor.
    """

    def __init__(self, sources, targets):
        self.len = len(sources)
        # Explicit dtype: torch.tensor([]) would otherwise infer float32 for a
        # sentence with no tokens, which nn.Embedding cannot index with.
        self.sources = [torch.tensor(source, dtype=torch.long) for source in sources]
        # list(...) reads the values by iteration, so a pandas Series with a
        # non-default index is handled the same way as a plain list.
        self.targets = torch.tensor(list(targets), dtype=torch.long)

    def __len__(self):
        """Return the number of examples."""
        return self.len

    def __getitem__(self, idx):
        """Return ``(source_tensor, target_tensor)`` for position ``idx``."""
        return self.sources[idx], self.targets[idx]

In [367]:
def collate_fn(data):
    """Merge a list of dataset items into one padded batch.

    Args:
        data: Sequence of items whose element 0 is a 1-D token-id tensor and
            whose element 1 is the label.

    Returns:
        Tuple ``(sources, targets)``: ``sources`` has shape
        ``(batch, longest_sequence)`` with shorter rows right-padded by
        ``pad_sequence``'s default value 0; ``targets`` has shape ``(batch,)``.
    """
    token_seqs = []
    labels = []
    for item in data:
        token_seqs.append(item[0])
        labels.append(item[1])

    # pad_sequence returns (longest_sequence, batch); transpose to batch-major.
    time_major = nn.utils.rnn.pad_sequence(token_seqs)
    sources = time_major.t()
    targets = torch.tensor(labels)
    return sources, targets

In [393]:
class Trainer:
    """Train and evaluate a classifier with Adam and cross-entropy loss.

    Batches whose first dimension differs from ``batch_size`` (typically a
    short final batch) are skipped. Epoch metrics are averaged over the
    batches that were actually processed.

    Args:
        model: ``torch.nn.Module`` mapping a batch of inputs to class logits.
        batch_size: The only batch size that will be processed.
    """

    def __init__(self, model, batch_size):
        self.model = model
        self.batch_size = batch_size
        # Only parameters that require gradients are handed to the optimizer.
        self.optim = torch.optim.Adam(filter(lambda p: p.requires_grad, self.model.parameters()), lr=0.001)
        self.loss_fn = F.cross_entropy

    @staticmethod
    def score(true, pred):
        """Return ``(accuracy, macro_f1)`` for one batch.

        Args:
            true: Tensor of gold class indices.
            pred: Tensor of logits; the predicted class is the argmax over
                dimension 1.
        """
        pred_cpu = torch.argmax(pred, dim=1).long().detach().cpu()
        true_cpu = true.long().detach().cpu()
        acc = accuracy_score(true_cpu, pred_cpu)
        f1 = f1_score(true_cpu, pred_cpu, average="macro")
        return acc, f1

    @staticmethod
    def print_progress(prefix, epoch, loss, acc, f1):
        """Print one metrics line; ``epoch`` is zero-based and shown one-based."""
        args = (prefix, epoch + 1, loss, acc, f1)
        print("Type: {}, Epoch: {}, Loss: {:.4f}, Acc: {:.4f}, F1: {:.4f}".format(*args))

    @staticmethod
    def to_gpu(source, target):
        """Move a batch to the notebook-level ``device``."""
        return source.to(device), target.to(device)

    @staticmethod
    def _mean(total, n_batches):
        """Average ``total`` over ``n_batches``; NaN when nothing was processed."""
        return total / n_batches if n_batches else float("nan")

    def train_model(self, train_iter, epoch):
        """Run one optimization pass over ``train_iter``.

        Args:
            train_iter: Iterable of ``(source, target)`` batches.
            epoch: Zero-based epoch index (accepted for interface
                compatibility; not used here).

        Returns:
            ``(mean_loss, mean_accuracy, mean_f1)`` over the processed batches.
        """
        total_epoch_loss, total_epoch_acc, total_epoch_f1, steps = 0, 0, 0, 0
        self.model.train()
        for source, target in train_iter:
            # Compare by value: `is not` tests object identity and is only
            # accidentally right for small cached integers.
            if source.size(0) != self.batch_size:
                continue
            source, target = self.to_gpu(source, target)

            self.optim.zero_grad()
            prediction = self.model(source)
            loss = self.loss_fn(prediction, target)
            loss.backward()
            self.optim.step()

            steps += 1
            acc, f1 = self.score(target, prediction)

            total_epoch_loss += loss.item()
            total_epoch_acc += acc
            total_epoch_f1 += f1

        # Divide by the batches that contributed, not by len(train_iter),
        # which would also count the skipped ones.
        return (
            self._mean(total_epoch_loss, steps),
            self._mean(total_epoch_acc, steps),
            self._mean(total_epoch_f1, steps),
        )

    def eval_model(self, val_iter):
        """Evaluate on ``val_iter`` without updating the model.

        Args:
            val_iter: Iterable of ``(source, target)`` batches.

        Returns:
            ``(mean_loss, mean_accuracy, mean_f1)`` over the processed batches.
        """
        total_epoch_loss, total_epoch_acc, total_epoch_f1, steps = 0, 0, 0, 0

        self.model.eval()
        with torch.no_grad():
            for source, target in val_iter:
                if source.size(0) != self.batch_size:
                    continue
                source, target = self.to_gpu(source, target)
                prediction = self.model(source)
                loss = self.loss_fn(prediction, target)
                acc, f1 = self.score(target, prediction)

                steps += 1
                total_epoch_loss += loss.item()
                total_epoch_acc += acc
                total_epoch_f1 += f1

        return (
            self._mean(total_epoch_loss, steps),
            self._mean(total_epoch_acc, steps),
            self._mean(total_epoch_f1, steps),
        )

    def train(self, train, valid, test, n_iter=30):
        """Train for ``n_iter`` epochs and report test metrics of the best model.

        After every epoch the train and validation metrics are printed. The
        weights with the lowest validation loss are restored before the test
        split is evaluated and printed.

        Args:
            train: Iterable of training batches.
            valid: Iterable of validation batches.
            test: Iterable of test batches.
            n_iter: Number of epochs.

        Returns:
            The ``state_dict`` with the lowest validation loss, or ``None`` if
            no epoch produced one (for example ``n_iter=0``).
        """
        min_val_loss = 1e+10
        best_state_dict = None
        epoch = -1  # keeps the final report well-defined when n_iter == 0

        self.model.to(device)
        for epoch in range(n_iter):
            train_loss, train_acc, train_f1 = self.train_model(train, epoch)
            self.print_progress("train", epoch, train_loss, train_acc, train_f1)
            val_loss, val_acc, val_f1 = self.eval_model(valid)
            self.print_progress("valid", epoch, val_loss, val_acc, val_f1)

            # save best model
            if min_val_loss > val_loss:
                min_val_loss = val_loss
                best_state_dict = copy.deepcopy(self.model.state_dict())

        if best_state_dict is not None:
            self.model.load_state_dict(best_state_dict)
        test_loss, test_acc, test_f1 = self.eval_model(test)
        # Report the test metrics themselves (previously the last validation
        # metrics were printed under the "test_" prefix).
        self.print_progress("test_", epoch, test_loss, test_acc, test_f1)

        return best_state_dict


In [394]:
# Hyper-parameters for the per-example (batch of one) run.
batch_size = 1          # examples per optimizer step
output_size = 4         # number of target classes
hidden_size = 200       # RNN hidden-state width
embedding_length = 200  # word-embedding width

In [395]:
# Wrap each encoded split in a Dataset and a DataLoader; collate_fn pads the
# sequences of a batch to a common length.
# NOTE(review): the training loader is not shuffled, so examples are seen in
# file order every epoch — confirm this is intended.
train_set = MyDataset(train_df["source"], train_df["target"])
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
valid_set = MyDataset(valid_df["source"], valid_df["target"])
valid_loader = DataLoader(valid_set, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_set = MyDataset(test_df["source"], test_df["target"])
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [396]:
# Recomputed here (same expression as in Q81): largest word id plus one.
vocab_size = max(word2id.values()) + 1

In [397]:
# Fresh, untrained model and a trainer that processes batches of exactly
# `batch_size` examples.
model = RNN(batch_size=batch_size, output_size=output_size, hidden_size=hidden_size, vocab_size=vocab_size, embedding_length=embedding_length)
trainer = Trainer(model, batch_size=batch_size)

In [None]:
# Train for Trainer.train's default of 30 epochs, printing train and
# validation metrics after each one.
trainer.train(train_loader, valid_loader, test_loader)

Type: train, Epoch: 1, Loss: 0.9528, Acc: 0.6606, F1: 0.6606
Type: valid, Epoch: 1, Loss: 0.7597, Acc: 0.7309, F1: 0.7309
Type: train, Epoch: 2, Loss: 0.6456, Acc: 0.7737, F1: 0.7737
Type: valid, Epoch: 2, Loss: 0.6662, Acc: 0.7511, F1: 0.7511
Type: train, Epoch: 3, Loss: 0.4915, Acc: 0.8237, F1: 0.8237
Type: valid, Epoch: 3, Loss: 0.6673, Acc: 0.7789, F1: 0.7789


In [None]:
## Q83

In [None]:
# Hyper-parameters for the mini-batch run.
batch_size = 32         # examples per optimizer step
output_size = 4         # number of target classes
hidden_size = 200       # RNN hidden-state width
embedding_length = 200  # word-embedding width

In [None]:
# Rebuild the loaders with the new batch_size; collate_fn right-pads every
# sequence in a batch with 0 up to the longest one.
# NOTE(review): the training loader is not shuffled, and the Trainer skips any
# batch whose size differs from batch_size (the short final batch) — confirm
# both are intended.
train_set = MyDataset(train_df["source"], train_df["target"])
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
valid_set = MyDataset(valid_df["source"], valid_df["target"])
valid_loader = DataLoader(valid_set, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_set = MyDataset(test_df["source"], test_df["target"])
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [None]:
# New model instance for the mini-batch run; vocab_size is reused from the
# earlier cell.
model = RNN(batch_size=batch_size, output_size=output_size, hidden_size=hidden_size, vocab_size=vocab_size, embedding_length=embedding_length)
trainer = Trainer(model, batch_size=batch_size)

In [None]:
# Train with mini-batches for the default 30 epochs.
trainer.train(train_loader, valid_loader, test_loader)