# MonoBERT

In [1]:
import pandas as pd
import json
from tqdm import tqdm
import pickle
import torch

In [2]:
queries = pd.read_csv("../input/qrels2021.txt", sep = " ", header = None, names = ["qid", "iter", "doc_id", "rel"])

In [3]:
queries.drop("iter", axis = 1, inplace = True)

In [4]:
uniq_docs = queries.doc_id.unique().tolist()

In [9]:
with open("../input/trec-ct-json/trec21_content.json", "r") as f:
    docs = json.load(f) 

doc_dict = {}
for doc in tqdm(docs, total = len(docs)):
    if doc["id"] in uniq_docs:
        doc_dict[doc["id"]] = doc["contents"]

# with open("../input/trec-ct-json/trec2021.pk", "wb") as f:
#     pickle.dump(doc_dict, f)

100%|█████████████████████████| 375580/375580 [01:21<00:00, 4603.11it/s]


In [10]:
# with open("../input/trec-ct-json/trec2021.pk", "rb") as f:
#     trec21_docs = pickle.load(f)

# trec21_docs

data = {
    'doc_id': doc_dict.keys(),
    'k': doc_dict.values()
       } 

docs21 = pd.DataFrame.from_dict(data)

docs21.to_csv("../input/trec-ct-json/trec21-train.csv", index = False)

In [8]:
docs21 = pd.read_csv("../input/trec-ct-json/trec21.csv", index_col = 0).to_dict()["summary"]

In [10]:
df = pd.read_csv("../input/qrels2021.csv")

In [15]:
train_df = df[df.qid < 60]
test_df = df[df.qid >= 60]

In [19]:
train_df.rel.values

array([1, 1, 0, ..., 2, 2, 2])

In [35]:
label_weights = train_df.rel.value_counts()/train_df.shape[0]
torch.tensor(label_weights.values)

tensor([0.6706, 0.1688, 0.1606], dtype=torch.float64)

In [40]:
queries = pd.read_csv(
    "../input/ctqueries2021.tsv", sep="\t", index_col = 0, index = ["topic"]
)
queries

Unnamed: 0_level_0,"Patient is a 45-year-old man with a history of anaplastic astrocytoma of the spine complicated by severe lower extremity weakness and urinary retention s/p Foley catheter, high-dose steroids, hypertension, and chronic pain. The tumor is located in the T-L spine, unresectable anaplastic astrocytoma s/p radiation. Complicated by progressive lower extremity weakness and urinary retention. Patient initially presented with RLE weakness where his right knee gave out with difficulty walking and right anterior thigh numbness. MRI showed a spinal cord conus mass which was biopsied and found to be anaplastic astrocytoma. Therapy included field radiation t10-l1 followed by 11 cycles of temozolomide 7 days on and 7 days off. This was followed by CPT-11 Weekly x4 with Avastin Q2 weeks/ 2 weeks rest and repeat cycle."
1,Unnamed: 1_level_1
2,"48 M with a h/o HTN hyperlipidemia, bicuspid ..."
3,A 32 yo woman who presents following a severe...
4,This is a 44 year old female with PMH of PCOS...
5,"74M hx of CAD s/p CABG, EF 60% prior CVA (no ..."
6,Patient is a 55yo woman with h/o ESRD on HD a...
...,...
71,The patient is a 34-year-old obese woman who ...
72,The patient is a 16-year-old girl recently di...
73,The patient is a 3-day-old female infant with...
74,The patient is a 53-year-old man complaining ...


In [1]:
import torch.nn as nn
import torch

In [14]:
criterion = nn.CrossEntropyLoss(weight = torch.tensor([0.3, 0.2, 0.5], dtype = torch.float64).to('cuda'), reduction = "mean")

In [15]:
a = torch.tensor([[0.1, 0.2, 0.7], [0.2, 0.1, 0.7]], dtype = torch.float64).to('cuda')
b = torch.tensor([2, 2], dtype = torch.long).to('cuda')

In [16]:
criterion(a, b)

tensor(0.7679, device='cuda:0', dtype=torch.float64)

In [12]:
from transformers.models.bert.tokenization_bert import BertTokenizer 

In [20]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [29]:
enc = tokenizer.encode("Hello", "there", truncation="only_first")
tokenizer.decode(enc)

'[CLS] hello [SEP] there [SEP]'

# Visualize

In [1]:
import json
from tqdm import trange
with open("../input/trec-ct-json/trec_cl.jsonl", "r") as f:
    docs = [json.loads(doc) for doc in f.read().splitlines()]

In [2]:
cleaned_docs = []
for idx in trange(len(docs)):
    cleaned_docs.append({
        "id": docs[idx]["id"],
        "contents": docs[idx]["contents"]
    })

with open("../input/trec-ct-json/trec21_content.json", "w") as f:
    json.dump(cleaned_docs, f)

100%|██████████| 375580/375580 [00:00<00:00, 1639927.89it/s]


In [9]:
with open("../input/ctqueries2021.tsv", "r") as f:
    queries = f.read().splitlines()

In [10]:
queries = [queries[idx].split("\t")[1].strip()+'\n' for idx in range(len(queries))]

In [11]:
with open("../input/ctqueries2021.txt", "w") as f:
    f.writelines(queries)

# ElasticSearch

In [3]:
from elasticsearch import Elasticsearch
import numpy as np
from tqdm import trange
from glob import glob
from timeit import default_timer as timer
import json


class InvertedIndex:
    def __init__(self, user, passwd, index_name):
        self.user = user
        self.passwd = passwd
        self.index_name = index_name
        self._initialise_index()
        self._create_index()

    def _close_index(self):
        self.client.indices.close(index=self.index_name)

    def _open_index(self):
        self.client.indices.open(index=self.index_name)

    def _initialise_index(self):
        self.client = Elasticsearch(
            "http://localhost:9200", basic_auth=(self.user, self.passwd)
        )

    def _delete_index(self):
        self._close_index()
        self.client.indices.delete(index=self.index_name)

    def _create_index(self):
        if not self.client.indices.exists(index=self.index_name):
            self.client.indices.create(index=self.index_name)

        self._close_index()
        # create the inverted index
        self.client.indices.put_settings(
            settings={
                "index": {
                    "analysis": {
                        "analyzer": {
                            "my_analyzer": {
                                "type": "custom",
                                "tokenizer": "standard",
                                "filter": ["lowercase", "stop", "snowball"],
                            }
                        }
                    }
                }
            },
            index=self.index_name,
        )

        self.client.indices.put_mapping(
            index=self.index_name,
            properties={
                "contents": {"type": "text", "analyzer": "my_analyzer"},
            },
        )

        self._open_index()

    def _add_docs(self, path):
        # Get some documents
        with open(path, "r") as f:
            docs = json.load(f)

        # Add some documents
        for idx in trange(len(docs)):
            self.client.index(index=self.index_name, id=docs[idx]["id"], document={"contents": docs[idx]["contents"]})

    def _search_index(self, match_text):
        results = self.client.search(
            index=self.index_name,
            query={"match": 
                   {"contents": 
                    {"query": match_text, "fuzziness": 2}
                   }
                  },
            size=1000,
        )

        return results["hits"]


def search_el(DOCS_FILE, QUERY_FILE, WRITE_FILE):
    index = InvertedIndex("elastic", "123456", "ir")
    index._add_docs(DOCS_FILE)
    with open(QUERY_FILE, "r") as f:
        queries = f.readlines()

    start = timer()
    with open(WRITE_FILE, "w") as f:
        for idx, query in enumerate(queries):
            results = index._search_index(query)
            # Find the minimum and maximum scores in the results
            min_score = min(hit["_score"] for hit in results["hits"])
            max_score = max(hit["_score"] for hit in results["hits"])
            for res in results["hits"]:
                original_score = res["_score"]
                normalized_score = (original_score - min_score) / (max_score - min_score)
                f.write(f"{idx} Q0 {res['_id']} {idx+1} {normalized_score} BM25\n")
            
    print(f"Elapsed Time: {timer() - start:.3f}")
    index._delete_index()

if __name__ == "__main__":
    DOCS_FILE = "../input/trec-ct-json/trec21_content.json"
    QUERY_FILE = "../input/ctqueries2021.txt"
    WRITE_FILE = "../input/BM25_test.txt"
    search_el(DOCS_FILE, QUERY_FILE, WRITE_FILE)

100%|██████████| 10000/10000 [00:40<00:00, 245.93it/s]


Elapsed Time: 20.069


In [3]:
# index = InvertedIndex("elastic", "123456", "ir")
# index._delete_index()