In [None]:
import pandas as pd
from utils import tool, indexing
import numpy as np

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the project-local `utils` package) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

## Load Dataset

In [None]:
# Read the source Excel workbook (path is relative to the notebook's working
# directory) with pandas.
file_path = "static/data SOAP indo.xlsx"
rekmed_xl = pd.read_excel(file_path)

In [None]:
# Count missing values in the raw data before any preprocessing.
rekmed_xl.isnull().sum()

## Preprocessing

In [None]:
# Run the two project preprocessing helpers in sequence; each result is bound
# back to `rekmed_xl`, so the raw data is no longer reachable under this name.
# NOTE(review): judging by their names, the helpers presumably drop unusable
# rows and strip HTML markup - confirm in utils.tool.
rekmed_xl = tool.drop_unnecessary_samples(rekmed_xl)
rekmed_xl = tool.remove_html_tag(rekmed_xl)

In [None]:
# Re-check missing values now that the preprocessing helpers have run.
rekmed_xl.isnull().sum()

In [None]:
# Display the preprocessed data for a visual check.
# (Disabled alternative: rekmed_xl.reset_index(drop=False))
rekmed_xl

In [None]:
# Persist the preprocessed data to "rekmed_post_preprocess.pkl" so that it can
# be reloaded later without repeating the preprocessing above.
tool.save_py_obj("rekmed_post_preprocess.pkl", rekmed_xl)

In [None]:
# Reload the previously saved, preprocessed data from disk.
# NOTE(review): the ".pkl" suffix suggests pickle; only load files produced by
# this project, since unpickling untrusted data can execute arbitrary code.
rekmed_xl_pkl = tool.load_py_obj("rekmed_post_preprocess.pkl")

In [None]:
# Number of entries in the preprocessed data
# (disabled alternative: len(rekmed_xl_pkl) for the reloaded copy).
len(rekmed_xl)
# Recorded counts: 126483 -> 114058
# (presumably before -> after preprocessing - confirm).

## Generate Question & Answer

In [None]:
# Work on a small subset: the first 100 entries of the reloaded data.
sample_rekmed = rekmed_xl_pkl[:100]
n_sampled = len(sample_rekmed)
print(f"total samples = {n_sampled}")

# Tokenize the `diagnosa` field, then let the project helper filter the
# samples and their tokens together so the two stay aligned.
rekmed_l_tokens = indexing.tokenization(sample_rekmed.diagnosa)
sample_rekmed, rekmed_l_tokens = tool.drop_meaningless_tokens(
    sample_rekmed, rekmed_l_tokens
)
n_kept = len(sample_rekmed)
print(f"total samples after dropping some meaningless tokens = {n_kept}")

In [None]:
# Build the question/answer table from the filtered sample via the project helper.
QA = tool.qa_generator(sample_rekmed)

In [None]:
# Preview the first four generated rows.
QA.head(4)

In [None]:
# Group the generated QA rows with the project helper.
# NOTE(review): this rebinds `QA` to its own transformed value, so re-running
# the cell feeds already-grouped data back into the helper - confirm that is
# harmless, or bind the result to a new name.
QA = tool.grouping_qa(QA)

In [None]:
# Display the grouped QA table.
QA

In [None]:
# Gather the "entailment" examples from the QA table.
# NOTE(review): the trailing 0.8 is presumably the similarity target assigned
# to this label - confirm in utils.tool.
label_entailment = tool.gather_entailment(QA) # 0.8

In [None]:
# Gather the "neutral" examples from the QA table.
# NOTE(review): the trailing 0.4 is presumably the similarity target assigned
# to this label - confirm in utils.tool.
label_neutral = tool.gather_neutral(QA) # 0.4

In [None]:
# Gather the "contradiction" examples from the QA table.
# NOTE(review): the trailing 0 is presumably the similarity target assigned
# to this label - confirm in utils.tool.
label_contradiction = tool.gather_contradiction(QA) # 0

In [None]:
# Report the size of the QA table, then the size of each label group.
print(f"Total samples of QA : {len(QA)}")
for group_name, group_rows in (
    ("entailment", label_entailment),
    ("neutral", label_neutral),
    ("contradiction", label_contradiction),
):
    print(f"{group_name} : {len(group_rows)}")

In [None]:
# Merge the three label groups into a single collection of training examples.
train_examples = tool.concatenate_train_examples(
    label_entailment, label_neutral, label_contradiction
)

In [None]:
# Base checkpoint to fine-tune, wrapped by the project's STS model class.
# NOTE(review): `local=False` presumably means the checkpoint is fetched from
# the Hugging Face Hub rather than a local directory - confirm in utils.tool.
model_name = "microsoft/mpnet-base"
sts_model = tool.STSModel(model_name, local=False)

In [None]:
# Prepare the training and validation samples from the labelled examples.
sts_model.get_train_test_dataset(train_examples)

In [None]:
# Fine-tune the model on the samples prepared in the previous step.
sts_model.fit()

In [None]:
# Load a previously fine-tuned model from its output directory.
# NOTE(review): the directory name embeds a fixed timestamp; a fresh
# `sts_model.fit()` run presumably writes to a different directory, so this
# path may point at a stale model - confirm and update after retraining.
model_ft = "output/models/microsoft/mpnet-base-2022-11-23_03-50-13-mean-sts"
loadFtModel = tool.LoadFTModel(model_ft)

In [None]:
# Keep every row, and the columns from "question" through "answer"
# (label-based `.loc` slices include both endpoints).
qa_pairs = QA.loc[:, "question":"answer"]

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModel

# Directory of the fine-tuned checkpoint used for retrieval.
model_ckpt = "output/models/microsoft/mpnet-base-2022-11-23_03-50-13-mean-sts"

# Load the encoder and its matching tokenizer from that same checkpoint.
model = AutoModel.from_pretrained(model_ckpt)
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

import torch

# Prefer the GPU when one is available, but fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the loaded model onto the selected device; `get_embeddings` sends its
# inputs to the same device.
model.to(device)

def cls_pooling(model_output):
    """Pool a model output by keeping only the first position of each sequence.

    Args:
        model_output: Object exposing a `last_hidden_state` attribute that can
            be indexed as `[:, 0]`.

    Returns:
        `last_hidden_state[:, 0]`, i.e. the hidden state at index 0 along the
        second axis for every item along the first axis.
    """
    # NOTE(review): the checkpoint directory name contains "mean"; if the model
    # was fine-tuned with mean pooling, first-token pooling may not match the
    # training setup - confirm.
    hidden_states = model_output.last_hidden_state
    first_position = hidden_states[:, 0]
    return first_position

def get_embeddings(text_list):
    """Embed text with the module-level `tokenizer` and `model`.

    Args:
        text_list: Text (a string or a list of strings) handed to `tokenizer`
            with padding and truncation enabled.

    Returns:
        The tensor produced by `cls_pooling` on the model output, located on
        `device`. It is computed without gradient tracking, so it is already
        detached from any autograd graph (calling `.detach()` on it is still
        valid).
    """
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    # The model lives on `device`, so every input tensor must be moved there too.
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    # Inference only: disable autograd so no graph or intermediate activations
    # are kept alive for each embedded text.
    with torch.no_grad():
        model_output = model(**encoded_input)
    return cls_pooling(model_output)

# Convert the pandas selection into a Hugging Face `Dataset`.
# NOTE(review): this rebinds `qa_pairs` from a DataFrame to a Dataset, so
# re-running the cell without first re-running the one that builds the
# DataFrame passes a Dataset to `from_pandas` - consider a separate name.
qa_pairs = Dataset.from_pandas(qa_pairs)

In [None]:
def _embed_question(row):
    """Return the embedding of a row's `question` under the key `embeddings`."""
    pooled = get_embeddings(row["question"])
    # Bring the result to the CPU as a NumPy array and keep its first entry
    # (presumably a batch of one, since a single question is embedded per row).
    vector = pooled.detach().cpu().numpy()[0]
    return {"embeddings": vector}


# Add an `embeddings` column by embedding every question.
embeddings_dataset = qa_pairs.map(_embed_question)

# Build a FAISS index over the new column so nearest neighbours can be queried.
embeddings_dataset.add_faiss_index(column="embeddings")

In [None]:
# Query to search for.
# (Disabled alternative query: "terapi untuk penyakit bph adalah")
question = "Dermatitis Atopik dapat diobati dengan"

# Embed the query as a one-item batch and convert it to a NumPy array on the CPU.
query_tensor = get_embeddings([question])
question_embedding = query_tensor.cpu().detach().numpy()
question_embedding.shape

# Retrieve the 4 indexed rows closest to the query, with their scores.
nearest = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=4
)
scores, samples = nearest

In [None]:
import pandas as pd

# Collect the retrieved rows and their FAISS scores into one table.
samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
# The index was built with `add_faiss_index` defaults, i.e. an L2-distance
# index, so a SMALLER score means a closer match. Sort ascending so the best
# hit is printed first; reassigning instead of `inplace=True` keeps the cell
# safe to re-run.
samples_df = samples_df.sort_values("scores", ascending=True)

# Print each hit, best match first.
for _, row in samples_df.iterrows():
    print(f"Question : {row.question}")
    print(f"Answer : {row.answer}")
    print(f"SCORE: {row.scores}")
    print("=" * 50)
    print()