In [None]:
pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/0c/7d5950fcd80b029be0a8891727ba21e0cd27692c407c51261c3c921f6da3/transformers-4.1.1-py3-none-any.whl (1.5MB)
[K     |▏                               | 10kB 24.0MB/s eta 0:00:01[K     |▍                               | 20kB 31.1MB/s eta 0:00:01[K     |▋                               | 30kB 21.2MB/s eta 0:00:01[K     |▉                               | 40kB 24.9MB/s eta 0:00:01[K     |█                               | 51kB 23.5MB/s eta 0:00:01[K     |█▎                              | 61kB 26.2MB/s eta 0:00:01[K     |█▌                              | 71kB 17.6MB/s eta 0:00:01[K     |█▊                              | 81kB 18.8MB/s eta 0:00:01[K     |██                              | 92kB 17.5MB/s eta 0:00:01[K     |██▏                             | 102kB 17.5MB/s eta 0:00:01[K     |██▍                             | 112kB 17.5MB/s eta 0:00:01[K     |██▋                             | 

In [None]:
import numpy as np
import pandas as pd
import os
import time
import torch

In [None]:
from transformers import BertTokenizer

# Load the tokenizer for the chosen checkpoint and sanity-check it by encoding
# one (question, answer) sentence pair and printing the resulting token ids.
model_version = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_version)
encoded_input = tokenizer("How old are you?", "I'm 6 years old")
print(encoded_input["input_ids"])

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…


[101, 2129, 2214, 2024, 2017, 1029, 102, 1045, 1005, 1049, 1020, 2086, 2214, 102]


In [None]:
from torch.utils.data import Dataset
 
class BertDataset(Dataset):
    """Query/document pair dataset feeding a BERT multiple-choice re-ranker.

    In ``"train"`` mode every item is a group of ``self.data`` (4) text pairs:
    one pair read from the positive CSV plus ``self.data - 1`` pairs drawn at
    random from the negative CSV (each negative keeps the query text stored in
    its own CSV row). The label is the slot that holds the positive pair.
    In ``"val"`` / ``"test"`` mode every item is a single pair and no label is
    returned.

    Args:
        mode: One of ``"train"``, ``"val"`` or ``"test"``.
        tokenizer: Callable tokenizer (a ``BertTokenizer`` in this notebook)
            returning ``input_ids``, ``token_type_ids`` and ``attention_mask``
            tensors.
        max_length: Width every sequence is truncated and zero-padded to.
    """

    def __init__(self, mode, tokenizer, max_length=512):
        assert mode in ["train", "test", "val"]
        self.mode = mode
        self.tokenizer = tokenizer
        self.max_length = max_length
        # The CSVs are loaded fully into memory; empty cells become "".
        if mode == 'train':
            self.positive_df = pd.read_csv("/content/drive/MyDrive/IR/IR/data/Hw6/all_postive.csv").fillna("")
            self.negative_df = pd.read_csv("/content/drive/MyDrive/IR/IR/data/Hw6/BM25Top1000_negative.csv").fillna("")
            self.len = len(self.positive_df)
            self.data = 4  # text pairs per item: 1 positive + 3 negatives
        elif mode == 'val':
            self.val_df = pd.read_csv("/content/drive/MyDrive/IR/IR/data/Hw6/validation_df.csv").fillna("")
            self.len = len(self.val_df)
            self.data = 1
        else:
            self.test_df = pd.read_csv("/content/drive/MyDrive/IR/IR/data/Hw6/test_df.csv").fillna("")
            self.len = len(self.test_df)
            self.data = 1

    def _training_pairs(self, idx):
        """Return (list of [query, doc] pairs, label tensor) for training row ``idx``."""
        num_choices = self.data
        # The positive slot cycles with the row index so labels are spread over all slots.
        positive_slot = idx % num_choices
        # Columns 0 and 2 are read as query text and document text
        # (NOTE(review): schema assumed from usage, confirm against the CSV).
        positive_pair = [self.positive_df.iloc[idx, 0], self.positive_df.iloc[idx, 2]]

        # Draw the negatives with torch's RNG rather than NumPy's: the loaders in
        # this notebook use worker processes, and torch seeds its generator per
        # worker whereas a forked NumPy state can be identical in every worker.
        rows = torch.randint(self.negative_df.shape[0], (num_choices - 1,)).tolist()
        negative_queries = self.negative_df.iloc[rows, 0].values
        negative_docs = self.negative_df.iloc[rows, 2].values

        pairs = [[query, doc] for query, doc in zip(negative_queries, negative_docs)]
        pairs.insert(positive_slot, positive_pair)
        label_tensor = torch.tensor([positive_slot], dtype=torch.long)
        return pairs, label_tensor

    def __getitem__(self, idx):
        """Return one encoded item.

        Returns:
            ``(input_ids, token_type_ids, attention_mask, label)`` in train
            mode, ``(input_ids, token_type_ids, attention_mask)`` otherwise.
            The first three tensors have shape ``(self.data, max_length)``;
            ``label`` has shape ``(1,)``.

        Raises:
            IndexError: if ``idx`` is out of range (raised by ``iloc``). An
                out-of-range index is no longer silently mapped to row 0.
        """
        label_tensor = None
        if self.mode == "train":
            pairs, label_tensor = self._training_pairs(idx)
            encoded_input = self.tokenizer(pairs, truncation='longest_first',
                                           max_length=self.max_length,
                                           return_tensors="pt", padding=True)
        else:
            frame = self.test_df if self.mode == "test" else self.val_df
            # Columns 1 and 3 are read as query text and document text
            # (NOTE(review): schema assumed from usage, confirm against the CSV).
            encoded_input = self.tokenizer(frame.iloc[idx, 1], frame.iloc[idx, 3],
                                           truncation='longest_first',
                                           max_length=self.max_length,
                                           return_tensors='pt', padding=True)

        # The tokenizer pads only to the longest pair of this item; copy into
        # fixed-width zero buffers so every item has the same shape for batching.
        word_size = encoded_input['input_ids'].size(1)
        buffer_shape = [self.data, self.max_length]
        input_ids = torch.zeros(buffer_shape, dtype=torch.long)
        token_type_ids = torch.zeros(buffer_shape, dtype=torch.long)
        attention_mask = torch.zeros(buffer_shape, dtype=torch.long)
        input_ids[:, :word_size] = encoded_input['input_ids']
        token_type_ids[:, :word_size] = encoded_input['token_type_ids']
        attention_mask[:, :word_size] = encoded_input['attention_mask']

        if self.mode == 'train':
            return (input_ids, token_type_ids, attention_mask, label_tensor)
        return (input_ids, token_type_ids, attention_mask)

    def __len__(self):
        """Number of rows in the positive (train) or val/test CSV."""
        return self.len
    
    
# Build the training dataset. The tokenizer passed in is the one loaded earlier
# from the `bert-base-uncased` checkpoint (English word pieces).
train_data = BertDataset("train", tokenizer=tokenizer)

In [None]:
# Sanity-check the training set: its size, the token ids and label of the first
# item, and the first text pair of that item decoded back to word pieces.
print(len(train_data))
data = train_data[0]
print(data[0])
print(data[3])
#data1 = train_data[1]
print(tokenizer.convert_ids_to_tokens(data[0][0]))
#print(tokenizer.convert_ids_to_tokens(data[0][1]))

8614
tensor([[  101, 14955, 18994,  ...,  8186,  8315,   102],
        [  101,  4675, 21156,  ...,  2005,  1996,   102],
        [  101,  2304,  4562,  ...,  1050,  1032,   102],
        [  101,  2248,  2396,  ...,  2088,  1032,   102]])
tensor([0])
['[CLS]', 'pol', '##iom', '##ye', '##lit', '##is', 'and', 'post', '-', 'pol', '##io', '[SEP]', '[', "'", 'language', ':', '<', 'f', 'p', '=', '105', '>', 'chinese', '<', '/', 'f', '>', '\\', 'n', 'article', 'type', ':', 'cs', '##o', '\\', 'n', '\\', 'n', '<', 'f', 'p', '=', '106', '>', '[', 'article', 'by', 'zhao', 'zhu', '##lian', '##g', '(', '63', '##9', '##2', '45', '##54', '53', '##28', ')', 'of', 'the', 'central', '<', '/', 'f', '>', '\\', 'n', 'china', 'teachers', '\\', "'", 'college', ',', 'edited', 'by', 'xu', 'hong', '##hai', '(', '60', '##7', '##9', '134', '##7', '318', '##9', ')', ':', '\\', 'n', '"', 'the', 'state', 'of', 'china', '\\', "'", 's', 'physically', 'challenged', 'persons', 'in', 'the', '\\', 'n', 'course', 'of', 'mod

In [None]:
# Build the test and validation datasets with the same tokenizer and print
# how many query/document rows each one holds.
testset =  BertDataset("test", tokenizer=tokenizer)
validation_set = BertDataset("val", tokenizer=tokenizer)
print(len(testset))
print(len(validation_set))

80000
20000


In [None]:
# Shape of the input_ids tensor of the first test / validation item.
print(testset[0][0].size())
print(validation_set[0][0].size())

torch.Size([1, 512])
torch.Size([1, 512])


In [None]:
from torch.utils.data import DataLoader
TRAIN_BATCH_SIZE = 3
TEST_BATCH_SIZE = 10
# Evaluation loaders must score every row: with drop_last=True a dataset whose
# size is not a multiple of TEST_BATCH_SIZE would lose its trailing rows and the
# score array would no longer line up with the rows of the CSV.
testloader = DataLoader(testset, batch_size=TEST_BATCH_SIZE, drop_last=False, num_workers=2)
validationloader = DataLoader(validation_set, batch_size=TEST_BATCH_SIZE, drop_last=False, num_workers=2)
# Training batches are shuffled; drop_last=True keeps every batch the same size.
trainloader = DataLoader(train_data, batch_size=TRAIN_BATCH_SIZE, drop_last=True, num_workers=2, shuffle=True)

In [None]:
# Number of batches produced by each loader (test, train, validation).
print(len(testloader))
print(len(trainloader))
print(len(validationloader))
#print(len(validationloader))

8000
2871
2000


In [None]:
# Pull one training batch and show the shape and contents of its four parts:
# token ids, segment (token type) ids, attention masks and labels.
data = next(iter(trainloader))

tokens_tensors, segments_tensors, masks_tensors, label_ids = data[:4]
print(f"""
tokens_tensors.shape   = {tokens_tensors.shape} 
{tokens_tensors}
------------------------
segments_tensors.shape = {segments_tensors.shape}
{segments_tensors}
------------------------
masks_tensors.shape    = {masks_tensors.shape}
{masks_tensors}
------------------------
label_ids.shape        = {label_ids.shape}
{label_ids}
""")


tokens_tensors.shape   = torch.Size([3, 4, 512]) 
tensor([[[  101,  4238,  1011,  ...,     0,     0,     0],
         [  101,  3919,  5949,  ...,  1010,  2143,   102],
         [  101,  8673,  4288,  ...,  8347,  1998,   102],
         [  101, 24787,  6897,  ...,  2025,  2018,   102]],

        [[  101,  5850,  1010,  ...,  2011,  1032,   102],
         [  101,  4675, 21156,  ...,  2637,  1005,   102],
         [  101,  7115, 10186,  ...,  5876,  2000,   102],
         [  101, 11268, 16281,  ...,     0,     0,     0]],

        [[  101,  7227, 14338,  ...,     0,     0,     0],
         [  101, 24787,  6897,  ...,  4650,  1010,   102],
         [  101, 13730,  2075,  ..., 18250,  2566,   102],
         [  101, 18479, 26029,  ...,  1032,  1050,   102]]])
------------------------
segments_tensors.shape = torch.Size([3, 4, 512])
tensor([[[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1]],

        [[0, 0, 0, 

In [None]:
# Pull one test batch and print the size of each of its three tensors
# (token ids, segment ids, attention masks).
data = next(iter(testloader))
for part in range(3):
    print(data[part].size())

torch.Size([10, 1, 512])
torch.Size([10, 1, 512])
torch.Size([10, 1, 512])


In [None]:
from transformers import BertForMultipleChoice

# Directory holding the checkpoint to start from.
PRETRAINED_MODEL_NAME = "/content/drive/MyDrive/IR/IR/data/Hw6/bert_v2"

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
# Only ask for the GPU name when a GPU exists; the call fails on CPU-only machines.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))

model = BertForMultipleChoice.from_pretrained(
    PRETRAINED_MODEL_NAME)

model = model.to(device)
# High-level listing of the modules inside the model: the sub-modules of
# `bert` are listed by name only, every other child is printed in full.
print("""
name            module
----------------------""")
for name, module in model.named_children():
    if name == "bert":
        for n, _ in module.named_children():
            print(f"{name}:{n}")
    else:
        print("{:15} {}".format(name, module))
print(model.config)

device: cuda:0
Tesla T4

name            module
----------------------
bert:embeddings
bert:encoder
bert:pooler
dropout         Dropout(p=0.1, inplace=False)
classifier      Linear(in_features=768, out_features=1, bias=True)
BertConfig {
  "_name_or_path": "/content/drive/MyDrive/IR/IR/data/Hw6/bert_v2",
  "architectures": [
    "BertForMultipleChoice"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "type_vocab_size": 2,
  "vocab_size": 30522
}



In [None]:
# Switch to training mode.
model.train(mode = True)

# AdamW updates every parameter of the model.
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)
EPOCHS = 2
start = time.time()
for epoch in range(EPOCHS):
    s1 = time.time()
    running_loss = 0.0
    for batch_num ,data in enumerate(trainloader):

        if batch_num % 100 == 0:
          print('now batch_num : ' + str(batch_num))

        tokens_tensors = data[0].to(device)
        segments_tensors = data[1].to(device)
        masks_tensors = data[2].to(device)
        # Each item carries a label of shape (1,), so the batch is (B, 1);
        # flatten to (B,) without hard-coding the batch size.
        labels = data[3].view(-1).to(device)

        # Reset the gradients accumulated by the previous step.
        optimizer.zero_grad()

        # Forward pass; passing labels makes the model return the loss.
        outputs = model(input_ids=tokens_tensors,
                        token_type_ids=segments_tensors,
                        attention_mask=masks_tensors,
                        labels=labels)

        loss = outputs.loss
        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        # Accumulate this batch's loss.
        running_loss += loss.item()

    # NOTE: the value printed is the sum of the batch losses over the epoch.
    print('[epoch %d] loss: %.3f' %
          (epoch + 1, running_loss))
    s2 = time.time()
    print('this epoch costs :' +  str((s2 - s1) / 60) + 'mins')

end = time.time()
print('total time :' +  str((end - start) / 60) + 'mins')

now batch_num : 0
now batch_num : 100
now batch_num : 200
now batch_num : 300
now batch_num : 400
now batch_num : 500
now batch_num : 600
now batch_num : 700
now batch_num : 800
now batch_num : 900
now batch_num : 1000
now batch_num : 1100
now batch_num : 1200
now batch_num : 1300
now batch_num : 1400
now batch_num : 1500
now batch_num : 1600
now batch_num : 1700
now batch_num : 1800
now batch_num : 1900
now batch_num : 2000
now batch_num : 2100
now batch_num : 2200
now batch_num : 2300
now batch_num : 2400
now batch_num : 2500
now batch_num : 2600
now batch_num : 2700
now batch_num : 2800
[epoch 1] loss: 1216.338
this epoch costs :85.94273754755656mins
now batch_num : 0
now batch_num : 100
now batch_num : 200
now batch_num : 300
now batch_num : 400
now batch_num : 500
now batch_num : 600
now batch_num : 700
now batch_num : 800
now batch_num : 900
now batch_num : 1000
now batch_num : 1100
now batch_num : 1200
now batch_num : 1300
now batch_num : 1400
now batch_num : 1500
now batch_num 

In [None]:
# Save the fine-tuned weights. NOTE: this is the same directory that
# PRETRAINED_MODEL_NAME points to, so the checkpoint the model was loaded
# from is overwritten.
model.save_pretrained("/content/drive/MyDrive/IR/IR/data/Hw6/bert_v2")

In [None]:
def get_predictions(model, dataloader):
    """Score every item of ``dataloader`` with ``model`` and return the logits.

    The model is put into evaluation mode and left there. Batches are moved to
    whatever device the model's parameters live on.

    Args:
        model: Module called with ``input_ids``, ``token_type_ids`` and
            ``attention_mask`` keyword arguments whose output has a ``logits``
            attribute.
        dataloader: Iterable of batches whose first three elements are the
            token, segment and mask tensors.

    Returns:
        A flat 1-D NumPy array with the logits of all batches in order, or
        ``None`` when the dataloader yields no batch.
    """
    model.eval()
    device = next(model.parameters()).device

    batch_scores = []
    with torch.no_grad():
        for batch_num, data in enumerate(dataloader):
            if batch_num % 200 == 0:
                print('now batch :' + str(batch_num))
            # Name the arguments explicitly so tokens / segments / masks cannot be mixed up.
            outputs = model(input_ids=data[0].to(device),
                            token_type_ids=data[1].to(device),
                            attention_mask=data[2].to(device))
            # Flatten each batch so one batch and many batches give the same layout.
            batch_scores.append(outputs.logits.detach().cpu().numpy().ravel())

    if not batch_scores:
        return None
    # Single concatenation instead of growing the array batch by batch.
    return np.concatenate(batch_scores)

In [None]:
# Score every validation query/document pair with the fine-tuned model.
# The result is the array of raw logits; no accuracy is computed here.
bert_score = get_predictions(model, validationloader)

In [None]:
pip install ml_metrics

Collecting ml_metrics
  Downloading https://files.pythonhosted.org/packages/c1/e7/c31a2dd37045a0c904bee31c2dbed903d4f125a6ce980b91bae0c961abb8/ml_metrics-0.1.4.tar.gz
Building wheels for collected packages: ml-metrics
  Building wheel for ml-metrics (setup.py) ... [?25l[?25hdone
  Created wheel for ml-metrics: filename=ml_metrics-0.1.4-cp36-none-any.whl size=7849 sha256=16b7fa198fea554fa577a8fa2f1d108ee8bcec5fe7edeb77877aa3eb571d0c77
  Stored in directory: /root/.cache/pip/wheels/b3/61/2d/776be7b8a4f14c5db48c8e5451451cabc58dc6aa7ee3801163
Successfully built ml-metrics
Installing collected packages: ml-metrics
Successfully installed ml-metrics-0.1.4


In [None]:
import ml_metrics

# Sweep the weight `alpha` of the BERT score in `BM25 + alpha * BERT` and record,
# for every alpha, the average precision summed over the validation queries.
VAL_SCORE_PATH = '/content/drive/MyDrive/IR/IR/data/Hw6/scores/validation_score.npy'
# Prefer the cached scores when they exist; otherwise fall back to the
# `bert_score` array computed earlier in this session, so the cell also works
# on a fresh run where nothing has been saved yet.
if os.path.exists(VAL_SCORE_PATH):
    bert_score = np.load(VAL_SCORE_PATH)
queries = pd.read_csv("/content/drive/MyDrive/IR/IR/data/Hw6/train_20queries.csv").fillna("")
val_data = pd.read_csv("/content/drive/MyDrive/IR/IR/data/Hw6/validation_df.csv").fillna("")
topk = 1000          # candidate documents stored per query, in consecutive rows
NUM_VAL_QUERIES = 20  # number of validation queries
positive_docs = queries['pos_doc_ids']

alpha_arr = np.arange(0,5,0.1)
result = np.zeros(alpha_arr.shape)

for index, alpha in enumerate(alpha_arr):
    print(alpha)
    ap_total = 0.0  # not named `sum`, which would shadow the builtin
    for query_num in range(NUM_VAL_QUERIES):
        rows = slice(topk * query_num, topk * (query_num + 1))
        BM25_docs = val_data['relevant_docs'][rows].tolist()
        BM25_score = val_data['BM25_score'][rows].to_numpy()
        query_positive = positive_docs[query_num].split()
        new_score = BM25_score + alpha * bert_score[rows]

        # Map doc id -> combined score (a repeated doc id keeps its last score),
        # then rank the doc ids by descending score.
        res = dict(zip(BM25_docs, new_score))
        rescore_docs = sorted(res, key=res.get, reverse=True)

        # `apk` scores ONE query (list of relevant ids vs. ranked list of ids).
        # `mapk` expects a list of such lists; given flat lists it pairs up the
        # id strings and compares them character by character.
        ap_total += ml_metrics.apk(query_positive, rescore_docs, topk)
    # Stored as a sum over queries; divide by NUM_VAL_QUERIES for MAP.
    # The argmax over alpha is the same either way.
    result[index] = ap_total


0.0
0.1
0.2
0.30000000000000004
0.4
0.5
0.6000000000000001
0.7000000000000001
0.8
0.9
1.0
1.1
1.2000000000000002
1.3
1.4000000000000001
1.5
1.6
1.7000000000000002
1.8
1.9000000000000001
2.0
2.1
2.2
2.3000000000000003
2.4000000000000004
2.5
2.6
2.7
2.8000000000000003
2.9000000000000004
3.0
3.1
3.2
3.3000000000000003
3.4000000000000004
3.5
3.6
3.7
3.8000000000000003
3.9000000000000004
4.0
4.1000000000000005
4.2
4.3
4.4
4.5
4.6000000000000005
4.7
4.800000000000001
4.9


In [None]:
# Show the validation scores, the per-alpha totals from the sweep, and the
# alpha value with the highest total.
print(bert_score)
print(result)
print(alpha_arr[np.argmax(result)])

[ 2.5805001  0.5930148  1.1736164 ... -5.1854014 -2.352374  -5.742383 ]
[7.67803851 7.75981385 7.98230821 8.06469741 8.12944284 8.19857088
 8.16263364 8.11960409 8.10048649 8.18003132 8.16349967 8.16869207
 8.1237614  8.0392753  8.02806107 8.03430165 8.05758752 8.07203656
 8.05198078 7.92557411 7.92862077 7.99631964 7.98442845 7.98124033
 7.89143218 7.87725817 7.86601341 7.9044918  7.87159487 7.88008801
 7.89600454 7.82795096 7.83369022 7.8584135  7.88339822 7.91257075
 7.96558947 7.96731611 7.97364259 8.00030598 8.02946647 8.02929493
 8.05310422 8.04861403 8.08621117 8.05900091 8.04186507 8.06102167
 8.04261651 8.01422513]
0.5


In [None]:
# Score every test query/document pair with the fine-tuned model.
print(len(testloader))
score = get_predictions(model, testloader)

8000
now batch :0
now batch :200
now batch :400
now batch :600
now batch :800
now batch :1000
now batch :1200
now batch :1400
now batch :1600
now batch :1800
now batch :2000
now batch :2200
now batch :2400
now batch :2600
now batch :2800
now batch :3000
now batch :3200
now batch :3400
now batch :3600
now batch :3800
now batch :4000
now batch :4200
now batch :4400
now batch :4600
now batch :4800
now batch :5000
now batch :5200
now batch :5400
now batch :5600
now batch :5800
now batch :6000
now batch :6200
now batch :6400
now batch :6600
now batch :6800
now batch :7000
now batch :7200
now batch :7400
now batch :7600
now batch :7800


In [None]:
# Persist the validation and test scores to Drive.
# NOTE(review): the alpha-sweep cell earlier in the notebook reads
# validation_score.npy, which is only written here - confirm the intended cell order.
np.save('/content/drive/MyDrive/IR/IR/data/Hw6/scores/validation_score.npy',bert_score)
np.save('/content/drive/MyDrive/IR/IR/data/Hw6/scores/result_score.npy',score)

In [None]:
# Reload the saved test scores and check how many there are.
result_score = np.load('/content/drive/MyDrive/IR/IR/data/Hw6/scores/result_score.npy')
print(result_score.shape)

(80000,)


In [None]:
# Re-rank the candidate documents of every test query by `BM25 + alpha * BERT`
# and collect one row per query: the query id and its ranked doc ids.
test_data = pd.read_csv("/content/drive/MyDrive/IR/IR/data/Hw6/test_df.csv").fillna(0)
topk = 1000   # candidate documents stored per query, in consecutive rows
alpha = 2.8   # weight of the BERT score
num_queries = len(test_data) // topk

records = []
for query_num in range(num_queries):
    rows = slice(topk * query_num, topk * (query_num + 1))
    relevant_docs = test_data['relevant_docs'][rows].tolist()
    query_id = test_data['query_num'][query_num * topk].astype('int32')
    query_BM25_score = test_data['BM25_score'][rows].to_numpy()
    new_score = query_BM25_score + alpha * result_score[rows]

    # Map doc id -> combined score (a repeated doc id keeps its last score),
    # then rank the doc ids by descending score.
    res = dict(zip(relevant_docs, new_score))
    ranked_docs = sorted(res, key=res.get, reverse=True)

    # Every doc id is followed by one space, so the string ends with a space.
    text = "".join(doc + " " for doc in ranked_docs)
    records.append({"query_id": query_id, "ranked_doc_ids": text})

# Build the frame once instead of concatenating a one-row frame per query.
result_csv = pd.DataFrame(records, columns=["query_id", "ranked_doc_ids"])


In [None]:
# One row per test query, two columns (query_id, ranked_doc_ids).
print(result_csv.shape)

(80, 2)


In [None]:
# Write the ranking file without the index column.
# NOTE(review): the file name says alpha2.88 while the ranking cell sets
# alpha = 2.8 - confirm which value produced this file.
result_csv.to_csv("/content/drive/MyDrive/IR/IR/data/Hw6/result/result_alpha2.88_v2.csv", index=False)