# Mount Google Drive

In [25]:
from google.colab import drive
import sys
import os
drive.mount('/content/gdrive/')
# if not os.path.exists('/content/data'):
!cp -r /content/gdrive/MyDrive/Aicup_spring/data /content
# if not os.path.exists('/content/stopword'):
!cp -r /content/gdrive/MyDrive/Aicup_spring/stopword /content
# if not os.path.exists('/content/dataset'):
#   !cp -r /content/gdrive/MyDrive/Aicup_spring/dataset /content


Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


# The Encoder from Transformer
---
paper link: \\
[Hierarchical Attention Networks for Document Classification](https://www.aclweb.org/anthology/N16-1174.pdf) \\
[Attention Is All You Need](https://papers.nips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf)



In [24]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import math


class PositionalEncoding(nn.Module):
    def __init__(self, d_emb: int, dropout: float = 0.1, max_len: int = 200):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        self.pe = torch.zeros(max_len, d_emb)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_emb, 2) *
                             -(math.log(10000.0) / d_emb))
        self.pe[:, 0::2] = torch.sin(position * div_term)
        self.pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = self.pe.unsqueeze(0)

    def forward(self, src):
        pe = self.pe.detach().to(src.device)
        output = src + pe[:, :src.size(1)]
        return self.dropout(output)


class MultiHeadAttention(nn.Module):
    def __init__(self, d_hid, n_head):
        super(MultiHeadAttention, self).__init__()
        self.n_head = n_head
        self.d_k = d_hid // n_head
        self.query = nn.Parameter(torch.randn(1, d_hid))
        self.key = nn.Linear(d_hid, d_hid)
        self.value = nn.Linear(d_hid, d_hid)
        self.linear = nn.Linear(d_hid, d_hid)

    def forward(self, x, batch_tk_mask):
        B = x.size(0)

        # Input  shape: `(1, Hid)`.
        # Output shape: `(Head, 1, K)`.
        q = self.query.view(-1, self.n_head, self.d_k).transpose(0, 1)

        # Transform temporal features to query, key and value features.
        # Input  shape: `(B, S, Hid)`.
        # Output shape: `(B, Head, S, K)`.
        k = self.key(x).view(B, -1, self.n_head, self.d_k).transpose(1, 2)
        v = self.value(x).view(B, -1, self.n_head, self.d_k).transpose(1, 2)

        # Calculate self attention scores with query and key features.
        # Self attention scores are scaled down by hidden dimension square
        # root to avoid overflow.
        # `q` Input  shape: `(Head, 1, K)`.
        # `k` Input  shape: `(B, Head, S, K)`.
        # Output shape: `(B, Head, 1, S)`.
        # print(q.shape)
        # print(k.shape)
        attn = q @ k.transpose(-1, -2) / math.sqrt(x.size(-1))

        # Mask parts of attention scores by replacing with large negative
        # values.
        # Input  shape: `(B, Head, 1, S)`.
        # Output shape: `(B, Head, 1, S)`.
        batch_tk_mask = batch_tk_mask.repeat(self.n_head, 1, 1, 1)

        batch_tk_mask = batch_tk_mask.transpose(0, 1)
        batch_tk_mask = batch_tk_mask.transpose(-1, -2)
        # print(attn.shape)
        # print(batch_tk_mask.shape)
        attn.masked_fill_(batch_tk_mask, -1e9)

        # Softmax normalize on attention scores.
        # Large negative values will be closed to zero after normalization.
        # Input  shape: `(B, Head, 1, S)`.
        # Output shape: `(B, Head, 1, S)`.
        attn = F.softmax(attn, dim=-1)

        # Use attention scores to calculate weighted sum on value features.
        # Then perform one more linear tranformation on weighted sum.
        # Finally dropout transformed features.
        # `attn` Input  shape: `(B, Head, 1, S)`.
        # `v` Input  shape: `(B, Head, S, k)`.
        # Output shape: `(B, Head, 1, K)`.
        output = attn @ v

        # Input  shape: `(B, Head, 1, K)`.
        # Output shape: `(B, 1, Hid)`.
        output = output.transpose(1, 2).contiguous()
        output = output.view(B, -1, self.n_head * self.d_k)

        # Output shape: `(B, Hid)`.
        return self.linear(output.squeeze(1))


class Encoder(nn.Module):
    def __init__(self, d_emb: int, p_hid: float):
        super().__init__()
        self.linear = nn.Linear(d_emb, d_emb)
        self.pe = PositionalEncoding(d_emb, p_hid)
        self.attn_emb = MultiHeadAttention(d_emb, 4)
        self.layernorm1 = nn.LayerNorm(d_emb)
        self.layernorm2 = nn.LayerNorm(d_emb)

    def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
        # Shape: [B, S, H]
        emb = self.layernorm1(self.linear(x))

        # Shape: [B, S, H]
        emb = self.pe(emb)

        # Shape: [B, H]
        emb = self.layernorm2(self.attn_emb(emb, mask))

        return emb


# Read Stopword

In [23]:
f=open(os.path.join('stopword','ch_stopword.txt'),mode='r')
stopword_list=f.read().splitlines()

# print(stopword_list)

# Dataset.py


In [22]:
import numpy as np
import csv
import json
import unicodedata
import re
import jieba
from torch.utils.data import Dataset, DataLoader
import random


'''
Here we will do preprocessing on the dataset.
Something needs to be done here :
1. Read the file in.
2. Separate the article, question, answer.
3. Used PAD to round each sentence into the same length
'''


def split_sent(sentence: str):
    first_role_idx = re.search(':', sentence).end(0)
    out = [sentence[:first_role_idx]]

    tmp = sentence[first_role_idx:]
    # print('sen',sentence)
    # print('tmp',tmp)
    # print('out',out)
    jieba.add_word("個管師")
    jieba.add_word('性行為')


    out2=list(jieba.cut(sentence))
    # print('out2',out2)
    for i in out2:
      if i in stopword_list:
        out2.remove(i)
      if i=='護理師' or i=='醫師' or i=='民眾' or i=='家屬' or i=='個管師':
        out2.remove(i)

    while tmp:
        res = re.search(
            r'(護理師[\w*]\s*:|醫師\s*:|民眾\s*:|家屬[\w*]\s*:|個管師\s*:)', tmp)
        if res is None:
            break

        idx = res.start(0)
        idx_end = res.end(0)
        out[-1] = list(out[-1] + tmp[:idx])
        out.append(tmp[idx:idx_end])
        tmp = tmp[idx_end:]

        # print("out:",tmp)
        # input()

    out[-1] = list(out[-1] + tmp)


    # Remove stopword
    for i in out:
      if (i in stopword_list):
        out.remove(i)
    # print('out2',out2)
    

    return out2

def _read_risk(risk_file: str):
    article = []
    risk = []

    # [[Sent_1], [Sent_2], ..., [Sent_n]]
    for i, line in enumerate(csv.reader(open(risk_file, "r", encoding="utf-8"))):
        if i == 0:
            continue
        text = unicodedata.normalize("NFKC", line[2]).replace(" ", "")
        article.append(split_sent(text))

        # print(risk_file)
        # input()

        if risk_file != "data/Develop_risk_classification.csv":
          risk.append(int(line[3]))
        else:
          risk.append(-1)

    return article, risk

def _read_qa(qa_file: str):
    qa = {}
    # [Question, [[Choice_1, Answer_1], [Choice_2, Answer_2], [Choice_3, Answer_3]]]
    for data in json.loads(unicodedata.normalize("NFKC", open(qa_file, "r", encoding="utf-8").read())):
        question = data["question"]
        choice_ans = []

        if qa_file != "data/Develop_QA.json":
          choice_ans = [(
                  list(choice["text"]),
                  int(choice["label"].strip() == data["answer"].strip())
              ) for choice in question["choices"]]
        else:
          choice_ans = [(
                  list(choice["text"]),
                  -1)
              for choice in question["choices"]]

        question_text = list(question["stem"])
        aid = data["article_id"]
        if aid in qa:
            qa[aid][1].append((question_text, choice_ans))
        else:
            qa[aid] = (split_sent(data['text'].replace(" ", "")), [(question_text, choice_ans)])

    return zip(*[v for _,v in sorted(qa.items(),key=lambda x:x[0])])


def encode_sent(w2id: dict, sentence: list, max_length: int):
    output = []
    for i, token in enumerate(sentence):
        if i >= max_length:
            break
        if token in w2id:
            output.append(w2id[token])
        else:
            output.append(0)
    padding_word = [0]
    sent_padding_size = max_length - len(output)
    output = output + padding_word*sent_padding_size

    return output


def encode_articles(article_text, max_doc_len, w2id, max_sent_len):
    article = []
    for document in article_text:
        article.append([])
        for i, sentence in enumerate(document):
            if i >= max_doc_len:
                break
            article[-1].append([])
            article[-1][-1] = encode_sent(w2id, sentence, max_sent_len)
        padding_sent = [[0]*max_sent_len]
        doc_padding_size = max_doc_len - len(article[-1])
        article[-1] = article[-1] + padding_sent*doc_padding_size
    return np.array(article)


class dataset_qa(Dataset):
    def __init__(
        self,
        vocab_path: str,
        qa_file: str,
        max_sent_len: int = 52,
        max_doc_len: int = 170,
        max_q_len: int = 20,
        max_c_len: int = 18
    ):
        super().__init__()
        with open(vocab_path, 'r', encoding='utf-8') as f_w2id:
            w2id = json.load(f_w2id)

        article_text, qa_pairs = _read_qa(qa_file)

        # `article` shape: [N, `max_doc_len`, `max_sent_len`]
        self.article = encode_articles(
            article_text, max_doc_len, w2id, max_sent_len)

        self.QA = []
        for idx, qa_pair in enumerate(qa_pairs):
            for question, choice_ans in qa_pair:
                choice, ans = zip(*choice_ans)
                
                self.QA.append({
                    "article": self.article[idx],
                    "question": np.array(encode_sent(w2id, question, max_q_len)),
                    "choice": np.array([encode_sent(w2id, x, max_c_len) for x in choice]),
                    "qa_answer": np.array(ans),
                })
        # print(self.QA['qa_answer'])
        # input()

    def __len__(self):
        return len(self.QA)

    def __getitem__(self, idx: int):
        return self.QA[idx]


class dataset_risk(Dataset):
    def __init__(
        self,
        vocab_path: str,
        risk_file: str,
        max_sent_len: int = 52,
        max_doc_len: int = 170,
    ):
        super().__init__()
        with open(vocab_path, 'r', encoding='utf-8') as f_w2id:
            w2id = json.load(f_w2id)
        # w2id = {"[PAD]": 0}
        article_text, risk = _read_risk(risk_file)

        # print(article_text)
        # input()

        # `risk` shape: [N]
        self.risk = np.array(risk, dtype=np.float32)

        # `article` shape: [N, `max_doc_len`, `max_sent_len`]
        self.article = encode_articles(
            article_text, max_doc_len, w2id, max_sent_len)
        
        # print(self.article)
        # input()

    def __len__(self):
        return len(self.risk)

    def __getitem__(self, idx: int):
        return {"article": self.article[idx], "risk_answer": self.risk[idx]}


# Risk Classificaion

## Take a look at the risk classification dataset

In [None]:
import os
import pandas as pd
import json
# from dataset import dataset_risk

risk_file=os.path.join("data", "SampleData_RiskClassification.csv")
print('risk data:')
print(pd.read_csv(risk_file,usecols=['article_id','text','label']))
print('\n---------------------------------')
print('vocab:')
print(list(json.load(open(os.path.join("data", "vocab.json"))).items())[:10])

dataset = dataset_risk(
    vocab_path=os.path.join("data", "vocab.json"),
    risk_file=risk_file,
)
d = next(iter(dataset))
print('\n---------------------------------')
print('text\n', d['article'])
print(d['article'].shape)
r = {v: k for k, v in json.load(open(os.path.join("data", "vocab.json"))).items()}
print(*[''.join(map(lambda x:r[x], i))[:40] for i in d['article'][:4,:] ], sep='\n')
print('\n---------------------------------')
print('answer:\n', d['risk_answer'])

Building prefix dict from the default dictionary ...


risk data:
    article_id                                               text  label
0            1  個管師：這個月還好嗎？民眾：蛤？個管師：這個月還好嗎？民眾：這個月還好。個管師：還好，還可以...      1
1            2  個管師：所以你這個月還OK？民眾：還OK啊。個管師：那有固定伴侶嗎？民眾：沒有。個管師：你一...      1
2            3  個管師：其實你這樣子的吃法是吃每天嗎？民眾：對我是吃每天。個管師：啊你有固定的時間嗎？民眾：...      0
3            4  個管師：等一下他先填問卷，不會有聲音。民眾：哈哈哈。個管師：等一下。個管師：這兩個月還好嗎？...      0
4            5  個管師：這兩個月都還好嗎？民眾：目前都還好，就除了你說的HPV的狀況。個管師：恩，那問一下你...      0
5            6  醫師：好啦，那所以我先講，你是前二個月有抽血？民眾：對。醫師：抽血狀況看起來還可以。民眾：是...      0
6            7  醫師：那所以我們現在吃兩個月了嘛。民眾：對。醫師：嘿。你現在還是天天吃？民眾：對，但到目前還...      1
7            8  醫師：還好嗎？還可以？民眾：嗯。醫師：你今天來得這麽早欸。民眾：對啊，今天車位比較好找。醫師...      0
8            9  醫師：好啦。那這個月還好嗎？民眾：就……狀態是還好，只是有HPV。醫師：嗯哼。喔你看皮膚科。...      1
9           10  醫師：你過去兩個月吃藥是，你還是天天吃嗎？民眾：對，天天吃。醫師：所以比方說像，兩個月以內幾...      0
10          11  醫師：主要是做、做發燒的一些檢查，初步看有抽血還有照相，那抽血的部分阿就是，其實你的發炎指數...      0
11          12  醫師：啊最近的狀況還好嗎？民眾：還好啦。醫師：還好啦齁，一樣肚子還是不舒服。民眾：肚子會悶，...      0
12          13  醫師：我們接下來會慢慢的把那個，因為吃很多藥都吃

Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.850 seconds.
Prefix dict has been built successfully.



---------------------------------
text
 [[ 108    0    0 ...    0    0    0]
 [ 204   49    0 ...    0    0    0]
 [1007    0    0 ...    0    0    0]
 ...
 [ 126    0    0 ...    0    0    0]
 [  92    0    0 ...    0    0    0]
 [ 826    0    0 ...    0    0    0]]
(170, 52)
月[PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD
還好[PAD][PAD][PAD][PAD][PAD][PAD][PAD][PA
蛤[PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD
月[PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD

---------------------------------
answer:
 1.0


## Risk Classification model

In [None]:
class Risk_Classifier(nn.Module):
    def __init__(self, d_emb: int, p_drop: float, n_layers: int):
        super().__init__()
        hid = []
        self.l0 = nn.Linear(d_emb, d_emb)
        for _ in range(n_layers):
            hid.append(nn.Linear(in_features=d_emb, out_features=d_emb))
            hid.append(nn.ReLU())
            hid.append(nn.Dropout(p=p_drop))
        self.hid = nn.Sequential(*hid)
        self.l1 = nn.Linear(d_emb, d_emb//2)
        self.dropout = nn.Dropout(p_drop)
        self.l2 = nn.Linear(d_emb//2, 1)

    def forward(self, document: torch.Tensor) -> torch.Tensor:
        output = document
        output = self.l0(output)

        output = self.hid(output)
        #　Linear layer
        # Input shape: `(B, E)`
        # Ouput shape: `(B, E//2)`
        # output = F.relu(self.l1(document))
        output = F.relu(self.l1(output))

        #　Dropout
        # Input shape: `(B, E//2)`
        # Ouput shape: `(B, E//2)`
        # output = self.dropout(output)

        #　Linear layer
        # Input shape: `(B, E//2)`
        # Ouput shape: `(B, 1)`
        output = torch.sigmoid(self.l2(output))

        return output.squeeze(-1)


class risk_model(nn.Module):
    def __init__(self, embedding_path: str, d_emb: int, n_cls_layers: int, p_drop: float):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(
            np.load(embedding_path)), freeze=True, padding_idx=0)
        self.word_encoder = Encoder(d_emb, p_drop)
        self.encoder = Encoder(d_emb, p_drop)
        self.risk = Risk_Classifier(d_emb, p_drop, n_cls_layers)

    def forward(self, document):
        # Embedding layer
        # Shape: [B, `max_doc_len`, `max_sent_len`, E]
        doc = self.embedding(document)
        w_mask, s_mask = self.create_mask(document)

        # Sentence embedding
        # Input shape: [B, `max_doc_len`, `max_sent_len`, E]
        # Output shape: [B, `max_doc_len`, E]
        doc = torch.stack([ self.word_encoder(d,w) for d,w in zip(doc, w_mask)])

        # Document embedding
        # Input shape: [B, `max_doc_len`, E]
        # Output shape: [B, E]
        doc = self.encoder(doc, s_mask)

        risk_output = self.risk(doc)

        return risk_output

    @staticmethod
    def create_mask(batch_prev_tkids: torch.Tensor) -> torch.Tensor:
        # Create padding self attention masks.
        # Shape: [B, `max_doc_len`, `max_sent_len`, 1]
        # Output dtype: `torch.bool`.
        w_pad_mask = batch_prev_tkids == 0
        w_pad_mask = w_pad_mask.unsqueeze(-1)

        s_pad_mask = batch_prev_tkids.sum(dim=-1)
        s_pad_mask = s_pad_mask == 0
        s_pad_mask = s_pad_mask.unsqueeze(-1)

        return w_pad_mask, s_pad_mask

    def loss_fn(self, document, risk):
        pred_risk = self(document)
        pred_risk = pred_risk.reshape(-1)
        risk = risk.reshape(-1)
        return F.binary_cross_entropy(pred_risk, risk)


## Training Risk Classifier

In [9]:
import csv
import os
import pathlib
import re

import numpy as np
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader
from sklearn.metrics import roc_auc_score

def risk_train(model_cfg, dataset, device,  # model and datasets
               p_drop, n_epoch, batch_size, learning_rate,  # training hyper parameter
               save_step, model_path):  # saving model

    model = risk_model(**model_cfg, p_drop=p_drop).train().to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    dataldr = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    def save_model(md, step):
      if step!=-1:
        torch.save(md.state_dict(), os.path.join(model_path, f"model-{step}.pt"))
      else:
        torch.save(md.state_dict(), os.path.join(model_path, f"final_model.pt"))

    # Train loop
    step = 0
    for epoch in range(n_epoch):
        tqdm_dldr = tqdm(dataldr)

        avg_loss = 0
        for epoch_step, batch_data in enumerate(tqdm_dldr):
            optimizer.zero_grad()

            batch_document = batch_data["article"].to(device)
            batch_risk = batch_data["risk_answer"].to(device)

            loss = model.loss_fn(batch_document, batch_risk)
            loss.backward()
            optimizer.step()

            step += 1
            avg_loss += loss
            tqdm_dldr.set_description(
                f"epoch:{epoch},{step} loss:{avg_loss / (epoch_step+1):.04f}")

            # if step % save_step == 0:
                # save_model(model, step)
            if avg_loss / (epoch_step+1) < 0.4 and step % save_step == 0 and avg_loss / (epoch_step+1) >= 0.3:
                save_model(model, step)
                break

    save_model(model, -1)


random_seed = 42
# Set random states for reproducibility
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_seed)
# Use cuda when possible
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Save training configuration
risk_model_path = os.path.join("exp", "risk")
pathlib.Path(risk_model_path).mkdir(parents=True, exist_ok=True)


model_cfg = {
    "embedding_path": os.path.join("data", "embeddings.npy"),
    "d_emb": 300,
    "n_cls_layers": 2,
}

dataset = dataset_risk(
    vocab_path=os.path.join("data", "vocab.json"),
    risk_file=os.path.join("data", "Train_risk_classification_ans.csv"),
)

risk_train(
    model_cfg=model_cfg,
    dataset=dataset,
    model_path=risk_model_path,
    device=device,
    # Hyperparameters
    batch_size=10,
    learning_rate=1e-4,
    n_epoch=40,
    save_step=100,
    p_drop=0.1,
)



Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.827 seconds.
Prefix dict has been built successfully.


NameError: ignored

## Test on trained model

In [13]:
from sklearn.metrics import accuracy_score

def save_result(output_path: str, data: list, ckpt: int):

  if ckpt!=-1:
    output = [["id", "label"]] + \
        [[i+1, label] for i, label in enumerate(data)]
    csv.writer(open(os.path.join(
        output_path, f"decision_{ckpt}.csv"), 'w', newline='')).writerows(output)
  else:
    if output_path.find('risk')!=-1:
      output = [["article_id", "probability"]] + \
        [[i+1, label] for i, label in enumerate(data)]
      csv.writer(open(os.path.join(
        output_path, f"decision.csv"), 'w', newline='')).writerows(output)
    elif output_path.find('qa')!=-1:
      output = [["id", "answer"]] + \
        [[i+1, 'A' if label==0 else 'B' if label==1 else 'C'] for i, label in enumerate(data)]
      csv.writer(open(os.path.join(
        output_path, f"qa.csv"), 'w', newline='')).writerows(output)
        
@torch.no_grad()
def risk_test(model_cfg, dataset, device, batch_size,
              model_path, output_path):
    dataldr = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, shuffle=False)

    # Load all checkpoints
    ckpts = sorted([
        (int(ckpt.group(1)), os.path.join(risk_model_path, ckpt.group(0)))
        for ckpt in map(lambda f:re.match(r'model-(\d+).pt', f), os.listdir(model_path))
        if ckpt is not None
    ], key=lambda x: x[0])

    for step, ckpt in ckpts:
        # Model
        model = risk_model(**model_cfg, p_drop=0.0)
        model.load_state_dict(torch.load(ckpt))
        model = model.eval().to(device)

        preds = []

        for batch_data in tqdm(dataldr):
            batch_document = batch_data["article"].to(device)
            preds += model(batch_document).tolist()
        print(f"\nroc_auc {step} : {roc_auc_score(dataset.risk, preds):.04f}", flush=True)
        save_result(output_path, preds, step)    

risk_output_path = os.path.join("output", "risk")
pathlib.Path(risk_output_path).mkdir(parents=True, exist_ok=True)

print("\nevaluate on training set...", flush=True)
risk_test(
    model_cfg=model_cfg,
    dataset=dataset,
    model_path=risk_model_path,
    device=device,
    batch_size=8,
    output_path=risk_output_path,
)



evaluate on training set...


## Risk Predict

In [None]:
def risk_predict(model_cfg, dataset, device, batch_size,
              model_path, output_path):
    dataldr = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, shuffle=False)
    
    model = risk_model(**model_cfg, p_drop=0.0)
    # print(os.path.join(risk_model_path,'final_model.pt'))
    # input()
    model.load_state_dict(torch.load('exp/risk/final_model.pt'))
    model = model.eval().to(device)

    preds = []

    for batch_data in tqdm(dataldr):
        batch_document = batch_data["article"].to(device)
        preds += model(batch_document).tolist()
        # print(batch_data["article"])
    save_result(output_path, preds, -1)

test_data=dataset_risk(
    vocab_path=os.path.join("data", "vocab.json"),
    risk_file=os.path.join("data", "Develop_risk_classification.csv"),
)
final_result_path=os.path.join("result", "risk")
pathlib.Path(final_result_path).mkdir(parents=True, exist_ok=True)


print("\nevaluate on test set...", flush=True)
risk_predict(
    model_cfg=model_cfg,
    dataset=test_data,
    model_path=risk_model_path,
    device=device,
    batch_size=8,
    output_path=final_result_path,
)


evaluate on test set...


100%|██████████| 13/13 [00:00<00:00, 36.16it/s]


# Question Answer

## About Question Answer dataset

In [28]:
from pprint import pprint
# from dataset import dataset_qa

qa_file=os.path.join("data", "SampleData_QA.json")

print('qa data:')
pprint(json.load(open(qa_file))[0])

dataset = dataset_qa(
    vocab_path=os.path.join("data", "vocab.json"),
    qa_file=qa_file,
)

d = next(iter(dataset))
print('\n---------------------------------')
print('encoded article')
print(d['article'])
print(d['article'].shape)
print('\n---------------------------------')
print('encoded question')
print(d['question'])
print(d['question'].shape)
print('\n---------------------------------')
print('encoded choice')
print(d['choice'])
print(d['choice'].shape)
print('\n---------------------------------')
print('answer')
print(d['qa_answer'])


qa data:
{'answer': 'C',
 'article_id': 1,
 'id': 1,
 'question': {'choices': [{'label': 'A', 'text': '有固炮'},
                          {'label': 'B', 'text': '民眾的固砲不一定會戴套'},
                          {'label': 'C', 'text': '覺得PrEP沒效'}],
              'stem': '下列關於民眾的敘述，何者有誤？'},
 'text': '個管師：這個月還好嗎？民眾：蛤？個管師：這個月還好嗎？民眾：這個月還好。個管師：還好，還可以有性行為嗎？民眾：有。個管師：所以是跟固定伴侶嗎？民眾：對。個管師：固定伴侶喔？民眾：對阿。個管師：喔，你有固定伴侶囉？民眾：本來就有，只是比較忙，所以很少約。個管師：喔，所以一直都是那一個？民眾：恩，對。個管師：诶，不對阿，你後來有用他的，然後後來扮相，後來不是不跟他，沒有跟他了嗎？民眾：用他的什麼？個管師：那個阿，他的那個，身分。吃PrEP。民眾：喔，沒有，他是我朋友。個管師：對。民眾：阿我們沒有那個。個管師：好，對。民眾：對，沒有做。個管師：好，然後所以你那時候就一直是跟現在這個嗎？民眾：對對對。個管師：12月那時候？民眾：對，阿只是之前不夠的話會跟我朋友拿。阿我朋友都剛好沒有在吃，因為他就也很少在。個管師：OK。民眾：他只要說戴套的話就不會吃那個，所以一直沒有做。個管師：喔，所以戴套跟，戴套跟PrEP你們都會二擇一？民眾：恩，他是這樣子。個管師：啊那你？民眾：阿我是可能有時候沒有戴套的話。個管師：恩。民眾：會怕的，如果對方。個管師：所以你現在都是約的喔？阿你有固炮了還約？民眾：固炮，固炮是固炮，可是我，他跟我講說他在外面沒有幹嘛，你會相信嗎？我不會相信他。個管師：所以你不相信？民眾：對阿。個管師：所以你跟這個是固炮？你固炮多久了？民眾：五、六年了吧。個管師：喔你這個這麼久了？民眾：對阿，可是很少約啦。所以有時候他會約我也都是跟他說很忙很忙。可能七、八個月約一次，可是我也不曉得說七、八個月你都不會想幹嘛阿，對阿。個管師：那這七、八個月你會想幹嘛嗎？民眾：我不會想幹嘛耶。個管師：所以你也不會主動約他，都

## Question Answer Model

In [29]:
class QA_Classifier(nn.Module):
    def __init__(self, d_emb: int, p_hid: float, n_layers: int):
        super().__init__()
        self.l1 = nn.Linear(3*d_emb, d_emb)
        self.dropout = nn.Dropout(p_hid)

        hid = []
        for _ in range(n_layers):
            hid.append(nn.Linear(in_features=d_emb, out_features=d_emb))
            hid.append(nn.ReLU())
            hid.append(nn.Dropout(p=p_hid))
        self.hid = nn.Sequential(*hid)
        self.l2 = nn.Linear(d_emb, 1)

    def forward(
        self,
        document: torch.Tensor,
        question: torch.Tensor,
        choice: torch.Tensor
    ) -> torch.Tensor:
        # Concatenates `document embedding`, `question embedding`
        # and `choice embeding`
        # Input shape: `(B, E)`, `(B, E)`, `(B, E)`
        # Ouput shape: `(B, 3*E)`
        output = torch.cat((document, question, choice), -1)

        #　Linear layer
        # Input shape: `(B, 3*E)`
        # Ouput shape: `(B, E)`
        output = F.relu(self.l1(output))

        #　Dropout
        # Input shape: `(B, E)`
        # Ouput shape: `(B, E)`
        output = self.dropout(output)

        # Hidden layer
        output = self.hid(output)

        #　Linear layer
        # Input shape: `(B, E)`
        # Ouput shape: `(B, 1)`
        output = torch.sigmoid(self.l2(output))

        return output


class qa_model(nn.Module):
    def __init__(self, embedding_path: str, d_emb: int, n_cls_layers: int, p_drop: float):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(
            np.load(embedding_path)), freeze=True, padding_idx=0)
        self.word_encoder = Encoder(d_emb, p_drop)
        self.encoder = Encoder(d_emb, p_drop)
        self.qa = QA_Classifier(d_emb, p_drop, n_cls_layers)

    def forward(self, document, question, choice):
        # Embedding layer
        # Shape: [B, `max_doc_len`, `max_sent_len`, E]
        doc = self.embedding(document)
        # Shape: [B, `max_q_len`, E]
        qst = self.embedding(question)
        # Shape: [B, 3, `max_c_len`, E]
        chs = self.embedding(choice)

        # Sentence embedding
        # Shape: [B, `max_doc_len`, E]
        w_mask, s_mask = self.create_mask(document)
        
        doc = torch.stack([self.word_encoder(d, m) for d, m in zip(doc, w_mask)])

        # Shape: [B, E]
        w_mask, _ = self.create_mask(question)
        qst = self.word_encoder(qst, w_mask)

        # Document embedding
        # Input shape: [B, `max_doc_len`, E]
        # Output shape: [B, E]
        doc = self.encoder(doc, s_mask)

        # Input Shape: [3, B, E]
        # Output Shape: [[B],[B],[B]]
        chs = chs.transpose(0, 1)
        w_mask, _ = self.create_mask(choice.transpose(0, 1))
        qa_output = [self.qa(doc, qst, self.word_encoder(ci, wmi)) for ci, wmi in zip(chs, w_mask)]
        qa_output = torch.cat(qa_output, dim=-1)
        return qa_output

    def create_mask(self, batch_prev_tkids: torch.Tensor) -> torch.Tensor:
        # Create padding self attention masks.
        # Shape: [B, `max_doc_len`, `max_sent_len`, 1]
        # Output dtype: `torch.bool`.
        w_pad_mask = batch_prev_tkids == 0
        w_pad_mask = w_pad_mask.unsqueeze(-1)

        s_pad_mask = batch_prev_tkids.sum(dim=-1)
        s_pad_mask = s_pad_mask == 0
        s_pad_mask = s_pad_mask.unsqueeze(-1)

        return w_pad_mask, s_pad_mask

    def loss_fn(self, document, question, choice, qa):
        pred_qa = self(document, question, choice)
        pred_qa = pred_qa.reshape(-1)
        qa = qa.reshape(-1)
        return F.binary_cross_entropy(pred_qa, qa)


## Training Question Answer Model

In [34]:
def qa_train(model_cfg, dataset, device,  # model and datasets
             p_drop, n_epoch, batch_size, learning_rate,  # training hyper parameter
             save_step, model_path):  # saving model

    model = qa_model(**model_cfg, p_drop=p_drop).train().to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    dataldr = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    def save_model(md, step):
      if step != -1:
        torch.save(md.state_dict(), os.path.join(
            model_path, f"model-{step}.pt"))
      else:
        torch.save(md.state_dict(), os.path.join(
            model_path, f"final_model.pt"))

    # Train loop
    step = 0
    for epoch in range(n_epoch):
        tqdm_dldr = tqdm(dataldr)

        avg_loss = 0
        for epoch_step, batch_data in enumerate(tqdm_dldr):
            optimizer.zero_grad()
            loss = model.loss_fn(
                document=torch.LongTensor(batch_data['article']).to(device),
                question=batch_data["question"].to(device),
                choice=batch_data["choice"].to(device),
                qa=batch_data["qa_answer"].float().to(device))
            loss.backward()
            optimizer.step()

            step += 1
            avg_loss += loss
            tqdm_dldr.set_description(
                f"epoch:{epoch},step:{step}, loss:{avg_loss / (epoch_step+1):.04f}")

            if avg_loss / (epoch_step+1) < 0.35 and step % save_step == 0 and avg_loss / (epoch_step+1) >= 0.25:
                save_model(model, step)
                break

    save_model(model, -1)


random_seed = 42
# Set random states for reproducibility
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_seed)
# Use cuda when possible
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Save training configuration
qa_model_path = os.path.join("exp", "qa")
pathlib.Path(qa_model_path).mkdir(parents=True, exist_ok=True)

dataset = dataset_qa(
    vocab_path=os.path.join("data", "vocab.json"),
    qa_file=os.path.join("data", "Train_qa_ans.json"),
)

model_cfg = {
    "embedding_path": os.path.join("data", "embeddings.npy"),
    "d_emb": 300,
    "n_cls_layers": 2,
}

qa_train(
    model_cfg=model_cfg,
    dataset=dataset,
    model_path=qa_model_path,
    device=device,
    # Hyperparameters
    batch_size=10,
    learning_rate=1e-4,
    n_epoch=20,
    save_step=18,
    p_drop=0.0,
)



epoch:0,step:70, loss:0.6399: 100%|██████████| 70/70 [00:05<00:00, 13.58it/s]
epoch:1,step:140, loss:0.6329: 100%|██████████| 70/70 [00:05<00:00, 13.78it/s]
epoch:2,step:210, loss:0.6221: 100%|██████████| 70/70 [00:05<00:00, 13.80it/s]
epoch:3,step:280, loss:0.6100: 100%|██████████| 70/70 [00:05<00:00, 13.69it/s]
epoch:4,step:350, loss:0.6000: 100%|██████████| 70/70 [00:05<00:00, 13.67it/s]
epoch:5,step:420, loss:0.5873: 100%|██████████| 70/70 [00:05<00:00, 13.57it/s]
epoch:6,step:490, loss:0.5762: 100%|██████████| 70/70 [00:05<00:00, 13.57it/s]
epoch:7,step:560, loss:0.5621: 100%|██████████| 70/70 [00:05<00:00, 13.49it/s]
epoch:8,step:630, loss:0.5450: 100%|██████████| 70/70 [00:05<00:00, 13.48it/s]
epoch:9,step:700, loss:0.5185: 100%|██████████| 70/70 [00:05<00:00, 13.46it/s]
epoch:10,step:770, loss:0.4896: 100%|██████████| 70/70 [00:05<00:00, 13.44it/s]
epoch:11,step:840, loss:0.4625: 100%|██████████| 70/70 [00:05<00:00, 13.40it/s]
epoch:12,step:846, loss:0.3246:   6%|▌         | 4/

## Test on trained QA model

In [35]:
from sklearn.metrics import accuracy_score

def qa_test(model_cfg, dataset, device, batch_size,
            model_path, output_path):
    dataldr = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, shuffle=False)

    # Load all checkpoints
    ckpts = sorted([
        (int(ckpt.group(1)), os.path.join(model_path, ckpt.group(0)))
        for ckpt in map(lambda f:re.match(r'model-(\d+).pt', f), os.listdir(model_path))
        if ckpt is not None
    ], key=lambda x: x[0])

    for step, ckpt in ckpts:
        model = qa_model(**model_cfg, p_drop=0.0)
        model.load_state_dict(torch.load(ckpt))
        model = model.eval().to(device)

        answer = []
        preds = []
        for batch_data in tqdm(dataldr):
            answer += batch_data["qa_answer"].argmax(dim=-1).tolist()
            pred_qa = model(
                document=torch.LongTensor(batch_data['article']).to(device),
                question=batch_data["question"].to(device),
                choice=batch_data["choice"].to(device))
            preds += pred_qa.argmax(dim=-1).tolist()

        print(f"\nstep {step} accuracy: {accuracy_score(answer, preds):.04f}", flush=True)
        save_result(output_path, preds, step)

qa_output_path = os.path.join("output", "qa")
pathlib.Path(qa_output_path).mkdir(parents=True, exist_ok=True)

print("\nevaluate on training set...", flush=True)
qa_test(
    model_cfg=model_cfg,
    dataset=dataset,
    model_path=qa_model_path,
    device=device,
    batch_size=8,
    output_path=qa_output_path,
)




evaluate on training set...


100%|██████████| 87/87 [00:02<00:00, 38.08it/s]


step 846 accuracy: 0.7727



100%|██████████| 87/87 [00:02<00:00, 39.96it/s]


step 918 accuracy: 0.8259



100%|██████████| 87/87 [00:02<00:00, 39.97it/s]


step 990 accuracy: 0.8849



100%|██████████| 87/87 [00:02<00:00, 40.08it/s]


step 1008 accuracy: 0.9094



100%|██████████| 87/87 [00:02<00:00, 39.89it/s]


step 1026 accuracy: 0.8993



100%|██████████| 87/87 [00:02<00:00, 39.82it/s]


step 1044 accuracy: 0.9094





RuntimeError: ignored

## QA Predict

In [37]:
def qa_predict(model_cfg, dataset, device, batch_size,
            model_path, output_path):
    dataldr = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, shuffle=False)

    model = qa_model(**model_cfg, p_drop=0.0)
    model.load_state_dict(torch.load('exp/qa/final_model.pt'))
    model = model.eval().to(device)

    preds = []

    for batch_data in tqdm(dataldr):
      # answer += batch_data["qa_answer"].argmax(dim=-1).tolist()
      pred_qa = model(
          document=torch.LongTensor(batch_data['article']).to(device),
          question=batch_data["question"].to(device),
          choice=batch_data["choice"].to(device))
      preds += pred_qa.argmax(dim=-1).tolist()
    save_result(output_path, preds, -1)


test_data = dataset_qa(
    vocab_path=os.path.join("data", "vocab.json"),
    qa_file=os.path.join("data", "Develop_QA.json"),
)
final_result_path=os.path.join('result','qa')
pathlib.Path(final_result_path).mkdir(parents=True, exist_ok=True)

print("\nevaluate on test set...", flush=True)
qa_predict(
    model_cfg=model_cfg,
    dataset=test_data,
    model_path=qa_model_path,
    device=device,
    batch_size=8,
    output_path=final_result_path,
)


evaluate on test set...


100%|██████████| 24/24 [00:00<00:00, 36.23it/s]
