In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict

from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer

import torch
#from sentence_transformers import SentenceTransformer, InputExample, losses, models, evaluation
from torch.utils.data import DataLoader
#from sentence_transformers import LoggingHandler
#from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging
import numpy as np
import re

#from sklearn.metrics import roc_curve, auc

from tqdm.auto import tqdm
from tqdm.notebook import tqdm
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import roc_curve, auc


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

data_folder = "C:\\Users\\gckc1\\My Drive\\My programs\\Sierra\\full_auto_review_data\\20240115_for_model_training"
model_folder = "C:\\Users\\gckc1\\My Drive\\My programs\\Sierra\\model_training"
review_data_folder = "C:\\Users\\gckc1\\My Drive\\My programs\\Sierra\\review_list"


In [2]:
val_file_location = "20240417_val_set.csv"
val_file = pd.read_csv(data_folder + "\\" + val_file_location)

dev_file_location = "20240417_dev_set_shuffled.csv"
dev_file = pd.read_csv(data_folder + "\\" + dev_file_location)

val_heart_file_location = "20240417_val_heart_set.csv"
val_heart_file = pd.read_csv(data_folder + "\\" + val_heart_file_location) 

val_HIV_file_location = "20240417_val_HIV_set.csv"
val_HIV_file = pd.read_csv(data_folder + "\\" + val_HIV_file_location) 

In [3]:
recode_dict = {0:0, 0.5:1, 1:2}
dev_file["label"] = dev_file["label"].replace(recode_dict)


In [4]:
dev_file['label'].value_counts()

label
0.0    224711
1.0    161074
2.0     81190
Name: count, dtype: int64

In [5]:
dev_file["obj_sel"] = dev_file["obj_sel"].str.replace(r'^\[OA\]', '', regex=True)
dev_file["tit_abs"] = dev_file["tit_abs"].str.replace(r'^ \[OA\]', '', regex=True)

val_file["obj_sel"] = val_file["obj_sel"].str.replace(r'^\[OA\]', '', regex=True)
val_file["tit_abs"] = val_file["tit_abs"].str.replace(r'^ \[OA\]', '', regex=True)

val_heart_file["obj_sel"] = val_heart_file["obj_sel"].str.replace(r'^\[OA\]', '', regex=True)
val_heart_file["tit_abs"] = val_heart_file["tit_abs"].str.replace(r'^ \[OA\]', '', regex=True)

val_HIV_file["obj_sel"] = val_HIV_file["obj_sel"].str.replace(r'^\[OA\]', '', regex=True)
val_HIV_file["tit_abs"] = val_HIV_file["tit_abs"].str.replace(r'^ \[OA\]', '', regex=True)

dev_file["obj_sel"] = dev_file["obj_sel"].str.replace(r'^\[BG\]', ' [BG] ', regex=True)
val_file["obj_sel"] = val_file["obj_sel"].str.replace(r'^\[BG\]', ' [BG] ', regex=True)
val_heart_file["obj_sel"] = val_heart_file["obj_sel"].str.replace(r'^\[BG\]', ' [BG] ', regex=True)
val_HIV_file["obj_sel"] = val_HIV_file["obj_sel"].str.replace(r'^\[BG\]', ' [BG] ', regex=True)

In [7]:
val_file["obj_sel"][0]

' [BG] Abdominal decompression was developed as a means of pain relief during labour. It has also been used for complications of pregnancy, and in healthy pregnant women in an attempt to improve fetal wellbeing and intellectual development. [OBJ] The objective of this review was to assess the effects of antenatal abdominal decompression for maternal hypertension or impaired fetal growth, on perinatal outcome. [SEL] Randomised or quasi‐randomised trials comparing abdominal decompression with no decompression in women with pre‐eclampsia and/or fetuses thought to be compromised.'

In [8]:
dev_file["obj_sel"] = "[RTI] " + dev_file["Review_Title"] + " [OBJ] " + dev_file["Objective"].fillna("") + " [SEL] " + dev_file["Selection_criteria"].fillna("")

In [9]:
dev_file["obj_sel"][0]

'[RTI] Single‐dose intravenous ketorolac for acute postoperative pain in adults [OBJ] To assess the analgesic efficacy and adverse effects of single‐dose intravenous ketorolac, compared with placebo or an active comparator, for moderate to severe postoperative pain in adults. [SEL] Randomized double‐blind trials that compared a single postoperative dose of intravenous ketorolac with placebo or another active treatment, for treating acute postoperative pain in adults following any surgery.'

In [10]:
dev_file_1 = dev_file[0:100000].copy()
dev_file_2 = dev_file[100000:200000].copy()
dev_file_3 = dev_file[200000:300000].copy()
dev_file_4 = dev_file[300000:466975].copy()
#val_file = val_file.sample(10000).copy()
#val_file = val_file.sample(5000).copy()

In [11]:
dev_set_1 = Dataset.from_pandas(dev_file_1)
dev_set_2 = Dataset.from_pandas(dev_file_2)
dev_set_3 = Dataset.from_pandas(dev_file_3)
dev_set_4 = Dataset.from_pandas(dev_file_4)

In [12]:
model_ckpt = "dmis-lab/biobert-large-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

In [13]:
new_tokens = ["[RIT]", "[OBJ]", "[BG]", "[SEL]", "[TIT]", "[ABS]"]
num_added_toks = tokenizer.add_tokens(new_tokens)

In [14]:
# Check if the new tokens are in the tokenizer
for token in new_tokens:
    if token in tokenizer.get_vocab():
        print(f"Token {token} is in the vocabulary.")
    else:
        print(f"Token {token} is not in the vocabulary.")

# Check the size of the new embeddings
print(f"New embedding size: {model.get_input_embeddings().num_embeddings}")

model.resize_token_embeddings(len(tokenizer))

Token [RIT] is in the vocabulary.
Token [OBJ] is in the vocabulary.
Token [BG] is in the vocabulary.
Token [SEL] is in the vocabulary.
Token [TIT] is in the vocabulary.
Token [ABS] is in the vocabulary.
New embedding size: 58996


Embedding(59002, 1024)

In [15]:
def token(df_input):
    df = Dataset.from_pandas(df_input.copy())
    for part in ["obj_sel", "tit_abs"]:
        df = df.map(
            lambda x:  tokenizer(
                x[part], max_length = 512, padding = True, truncation = True
            ), batched = True, batch_size = None
        )
        for col in ['input_ids', 'attention_mask']:
            df = df.rename_column(
                col, part+"_"+col
            )
    return(df)

In [16]:
dev_set_1 = token(dev_file_1)
dev_set_2 = token(dev_file_2)
dev_set_3 = token(dev_file_3)
dev_set_4 = token(dev_file_4)

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/166975 [00:00<?, ? examples/s]

Map:   0%|          | 0/166975 [00:00<?, ? examples/s]

In [17]:
all_cols = ['label', 'obj_sel_input_ids', 'obj_sel_attention_mask', 'tit_abs_input_ids', 'tit_abs_attention_mask']


In [18]:
tmp1 = dev_set_1.to_pandas()
tmp2 = dev_set_2.to_pandas()
tmp3 = dev_set_3.to_pandas()
tmp4 = dev_set_4.to_pandas()

tmp = pd.concat([tmp1, tmp2, tmp3, tmp4], ignore_index = True)

dev_set = Dataset.from_pandas(tmp)

In [19]:
dev_set.set_format(type = 'torch', columns = all_cols)

In [20]:
batch_size = 32
loader = torch.utils.data.DataLoader(
    dev_set, batch_size=batch_size, shuffle = True)

In [21]:
def mean_pool(token_embeds, attention_mask):
    in_mask = attention_mask.unsqueeze(-1).expand(token_embeds.size()).float()
    pool = torch.sum(token_embeds * in_mask, 1) / torch.clamp(
        in_mask.sum(1), min=1e-9
    )
    return pool
    

In [22]:
ffnn = torch.nn.Linear(1024*3, 3)
loss_func = torch.nn.CrossEntropyLoss()

In [23]:
from transformers.optimization import get_linear_schedule_with_warmup

optim = torch.optim.Adam(model.parameters(), lr = 2e-5)
total_steps = int(len(dev_set) / batch_size)
warmup_steps = 500
scheduler = get_linear_schedule_with_warmup(
    optim, num_warmup_steps=warmup_steps,
    num_training_steps=total_steps - warmup_steps
)




In [24]:
import torch.nn as nn

model = nn.DataParallel(model)
ffnn = nn.DataParallel(ffnn)

In [25]:
model = model.to(device)
ffnn = ffnn.to(device)

In [26]:
for epoch in range(1):
    model.train()
    loop = tqdm(loader, leave= True)
    for batch in loop:
        optim.zero_grad()
        inputs_ids_a = batch['obj_sel_input_ids'].to(device)
        inputs_ids_b = batch['tit_abs_input_ids'].to(device)
        attention_a = batch['obj_sel_attention_mask'].to(device)
        attention_b = batch['tit_abs_attention_mask'].to(device)
        label = batch['label'].to(device).long()
        u = model(inputs_ids_a, attention_mask = attention_a)[0]
        v = model(inputs_ids_b, attention_mask = attention_b)[0]
        u = mean_pool(u, attention_a)
        v = mean_pool(v, attention_b)
        uv = torch.sub(u, v)
        uv_abs = torch.abs(uv)
        x = torch.cat([u, v, uv_abs], dim = -1)
        x = ffnn(x)
        loss = loss_func(x, label)
        loss.backward()
        optim.step()
        scheduler.step()
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  0%|          | 0/14593 [00:00<?, ?it/s]



In [27]:
torch.save(model, model_folder + "\\20240425_biobert_tit_obj_sel_logit\\" + '20240425_biobert_tit_obj_sel_logit.pth')
torch.save(ffnn, model_folder + "\\20240425_biobert_tit_obj_sel_logit\\" + '20240425_ffnn_tit_obj_sel_logit.pth')

In [None]:
####################################################################

In [2]:
model_ckpt = "dmis-lab/biobert-large-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)


In [3]:
new_tokens = ["[RIT]", "[OBJ]", "[BG]", "[SEL]", "[TIT]", "[ABS]"]
num_added_toks = tokenizer.add_tokens(new_tokens)

In [4]:
model = torch.load(model_folder + "\\20240425_biobert_tit_obj_sel_logit\\" + '20240425_biobert_tit_obj_sel_logit.pth')
ffnn = torch.load(model_folder + "\\20240425_biobert_tit_obj_sel_logit\\" + '20240425_ffnn_tit_obj_sel_logit.pth')

In [5]:
import torch.nn as nn

loss_func = torch.nn.CrossEntropyLoss()

model = model.to(device)
ffnn = ffnn.to(device)

In [6]:
def mean_pool(token_embeds, attention_mask):
    in_mask = attention_mask.unsqueeze(-1).expand(token_embeds.size()).float()
    pool = torch.sum(token_embeds * in_mask, 1) / torch.clamp(
        in_mask.sum(1), min=1e-9
    )
    return pool

In [7]:
val_1_file_location = "20240419_val_1_rti_bg_obj_sel_output.csv"
val_1_file = pd.read_csv(data_folder + "\\" + val_1_file_location)

val_2_file_location = "20240419_val_2_rti_bg_obj_sel_output.csv"
val_2_file = pd.read_csv(data_folder + "\\" + val_2_file_location)

val_heart_file_location = "20240419_val_heart_rti_bg_obj_sel_output.csv"
val_heart_file = pd.read_csv(data_folder + "\\" + val_heart_file_location) 

val_HIV_file_location = "20240419_val_HIV_rti_bg_obj_sel_output.csv"
val_HIV_file = pd.read_csv(data_folder + "\\" + val_HIV_file_location) 

In [8]:
val_HIV_file = val_HIV_file.drop(['obj_sel_input_ids', 'token_type_ids', "obj_sel_attention_mask", "tit_abs_input_ids","tit_abs_attention_mask"], axis=1).copy()
val_heart_file = val_heart_file.drop(['obj_sel_input_ids', 'token_type_ids', "obj_sel_attention_mask", "tit_abs_input_ids","tit_abs_attention_mask"], axis=1).copy()
val_1_file = val_1_file.drop(['obj_sel_input_ids', 'token_type_ids', "obj_sel_attention_mask", "tit_abs_input_ids","tit_abs_attention_mask"], axis=1).copy()
val_2_file = val_2_file.drop(['obj_sel_input_ids', 'token_type_ids', "obj_sel_attention_mask", "tit_abs_input_ids","tit_abs_attention_mask"], axis=1).copy()

In [9]:
val_1_file["obj_sel"] = "[RTI] " + val_1_file["Review_Title"] + " [OBJ] " + val_1_file["Objective"].fillna("") + " [SEL] " + val_1_file["Selection_criteria"].fillna("")
val_2_file["obj_sel"] = "[RTI] " + val_2_file["Review_Title"] + " [OBJ] " + val_2_file["Objective"].fillna("") + " [SEL] " + val_2_file["Selection_criteria"].fillna("")
val_heart_file["obj_sel"] = "[RTI] " + val_heart_file["Review_Title"] + " [OBJ] " + val_heart_file["Objective"].fillna("") + " [SEL] " + val_heart_file["Selection_criteria"].fillna("")
val_HIV_file["obj_sel"] = "[RTI] " + val_HIV_file["Review_Title"] + " [OBJ] " + val_HIV_file["Objective"].fillna("") + " [SEL] " + val_HIV_file["Selection_criteria"].fillna("")

In [10]:
val_1_set = Dataset.from_pandas(val_1_file)
val_2_set = Dataset.from_pandas(val_2_file)
val_heart_set = Dataset.from_pandas(val_heart_file)
val_HIV_set = Dataset.from_pandas(val_HIV_file)

In [11]:
def token(df_input):
    df = Dataset.from_pandas(df_input.copy())
    for part in ["obj_sel", "tit_abs"]:
        df = df.map(
            lambda x:  tokenizer(
                x[part], max_length = 512, padding = True, truncation = True
            ), batched = True, batch_size = None
        )
        for col in ['input_ids', 'attention_mask']:
            df = df.rename_column(
                col, part+"_"+col
            )
    return(df)

In [12]:
val_HIV_set = token(val_HIV_file)
val_1_set = token(val_1_file)
val_2_set = token(val_2_file)
val_heart_set = token(val_heart_file)

Map:   0%|          | 0/4400 [00:00<?, ? examples/s]

Map:   0%|          | 0/4400 [00:00<?, ? examples/s]

Map:   0%|          | 0/26892 [00:00<?, ? examples/s]

Map:   0%|          | 0/26892 [00:00<?, ? examples/s]

Map:   0%|          | 0/27051 [00:00<?, ? examples/s]

Map:   0%|          | 0/27051 [00:00<?, ? examples/s]

Map:   0%|          | 0/19069 [00:00<?, ? examples/s]

Map:   0%|          | 0/19069 [00:00<?, ? examples/s]

In [13]:
all_cols = ['label', 'obj_sel_input_ids', 'obj_sel_attention_mask', 'tit_abs_input_ids', 'tit_abs_attention_mask']

In [14]:
val_1_set.set_format(type = 'torch', columns = all_cols)
val_2_set.set_format(type = 'torch', columns = all_cols)
val_heart_set.set_format(type = 'torch', columns = all_cols)
val_HIV_set.set_format(type = 'torch', columns = all_cols)

In [16]:
from torch.utils.data import DataLoader
val_1_loader = DataLoader(val_1_set, batch_size = 64, shuffle = False)
val_2_loader = DataLoader(val_2_set, batch_size = 64, shuffle = False)
val_heart_loader = DataLoader(val_heart_set, batch_size = 64, shuffle = False)
val_HIV_loader = DataLoader(val_HIV_set, batch_size = 64, shuffle = False)

In [17]:
def evaluate_model(model, ffnn, data_loader, device):
    model.eval()
    ffnn.eval()
    all_outputs = []
    predictions = []
    all_labels = []
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating", leave = True):
            inputs_ids_a = batch['obj_sel_input_ids'].to(device)
            inputs_ids_b = batch['tit_abs_input_ids'].to(device)
            attention_a = batch['obj_sel_attention_mask'].to(device)
            attention_b = batch['tit_abs_attention_mask'].to(device)
            labels = batch['label'].to(device).long()

            u = model(inputs_ids_a, attention_mask = attention_a)[0]
            v = model(inputs_ids_b, attention_mask = attention_b)[0]
            u = mean_pool(u, attention_a)
            v = mean_pool(v, attention_b)
            uv = torch.sub(u, v)
            uv_abs = torch.abs(uv)
            outputs = torch.cat([u, v, uv_abs], dim= -1)
            outputs = ffnn(outputs)

            _, predicted = torch.max(outputs, 1)

            predictions.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            all_outputs.append(outputs.cpu().numpy())
    all_outputs = np.vstack(all_outputs)
    return predictions, all_labels, all_outputs

In [18]:
val_1_predictions, val_1_labels, val_1_outputs = evaluate_model(model, ffnn, val_1_loader, device)
val_2_predictions, val_2_labels, val_2_outputs = evaluate_model(model, ffnn, val_2_loader, device)
val_heart_predictions, val_heart_labels, val_heart_outputs = evaluate_model(model, ffnn, val_heart_loader, device)
val_HIV_predictions, val_HIV_labels, val_HIV_outputs = evaluate_model(model, ffnn, val_HIV_loader, device)


Evaluating:   0%|          | 0/421 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/423 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/298 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/69 [00:00<?, ?it/s]

In [19]:
def evaluate_prediction(df, predictions):
    df_tmp = df.copy()
    ct = pd.crosstab(df_tmp["Actual Labels"], df_tmp[predictions])
    print(ct)
    row_percentage = ct.div(ct.sum(axis = 1), axis = 0)
    print(row_percentage)

In [39]:
################# Evaluation 20240425 #########################

In [20]:
val_1_pd = val_1_set.to_pandas()
val_2_pd = val_2_set.to_pandas()
val_heart_set_pd = val_heart_set.to_pandas()
val_HIV_set_pd = val_HIV_set.to_pandas()

In [21]:
val_1_pd = val_1_pd.drop("Outputs", axis = 1)
val_2_pd = val_2_pd.drop("Outputs", axis = 1)
val_heart_set_pd = val_heart_set_pd.drop("Outputs", axis = 1)
val_HIV_set_pd = val_HIV_set_pd.drop("Outputs", axis = 1)

In [22]:
def combine_outputs(df, labels, predictions):
    predictions_df = pd.DataFrame({
        'Predictions': predictions,
    })  
    combined = pd.concat([df, predictions_df], axis = 1)
    return combined.copy()

In [23]:
val_1_pd = combine_outputs(val_1_pd, val_1_labels, val_1_predictions)
val_2_pd = combine_outputs(val_2_pd, val_2_labels, val_2_predictions)
val_heart_set_pd = combine_outputs(val_heart_set_pd, val_heart_labels, val_heart_predictions)
val_HIV_set_pd = combine_outputs(val_HIV_set_pd, val_HIV_labels, val_HIV_predictions)

In [26]:
#Only need to run it once#

val_1_pd['DOI'] = val_1_pd['Review_URL'].str.extract(r'(10\.\d{4,9}/[-._;()/:A-Z0-9]+)', flags=re.IGNORECASE)
val_1_pd['DOI'] = val_1_pd['DOI'].str.replace(r'/full$', '', regex=True)

val_2_pd['DOI'] = val_2_pd['Review_URL'].str.extract(r'(10\.\d{4,9}/[-._;()/:A-Z0-9]+)', flags=re.IGNORECASE)
val_2_pd['DOI'] = val_2_pd['DOI'].str.replace(r'/full$', '', regex=True)

#Only need to run it once#

val_heart_set_pd['DOI'] = val_heart_set_pd['Review_URL'].str.extract(r'(10\.\d{4,9}/[-._;()/:A-Z0-9]+)', flags=re.IGNORECASE)
val_heart_set_pd['DOI'] = val_heart_set_pd['DOI'].str.replace(r'/full$', '', regex=True)

val_HIV_set_pd['DOI'] = val_HIV_set_pd['Review_URL'].str.extract(r'(10\.\d{4,9}/[-._;()/:A-Z0-9]+)', flags=re.IGNORECASE)
val_HIV_set_pd['DOI'] = val_HIV_set_pd['DOI'].str.replace(r'/full$', '', regex=True)

In [27]:
review_record = pd.read_csv(review_data_folder + "\\" + "20231218_cochrane_review_expertise.csv")

In [28]:
val_1_pd["expertise"] = val_1_pd['DOI'].isin(review_record['DOI'])
val_2_pd["expertise"] = val_2_pd['DOI'].isin(review_record['DOI'])
val_heart_set_pd["expertise"] = val_heart_set_pd['DOI'].isin(review_record['DOI'])
val_HIV_set_pd["expertise"] = val_HIV_set_pd['DOI'].isin(review_record['DOI'])

In [29]:
val_1_pd.to_csv(data_folder + "\\" + "20240419_val_1_rti_obj_sel_logit.csv")
val_2_pd.to_csv(data_folder + "\\" + "20240419_val_2_rti_obj_sel_logit.csv")
val_heart_set_pd.to_csv(data_folder + "\\" + "20240419_val_heart_rti_obj_sel_logit.csv")
val_HIV_set_pd.to_csv(data_folder + "\\" + "20240419_val_HIV_rti_obj_sel_logit.csv")

In [None]:
val_1_pd = pd.read_csv(data_folder + "\\" + "20240419_val_1_rti_obj_sel_output.csv")
val_2_pd = pd.read_csv(data_folder + "\\" + "20240419_val_2_rti_obj_sel_output.csv")
val_heart_set_pd = pd.read_csv(data_folder + "\\" + "20240419_val_heart_rti_obj_sel_output.csv")
val_HIV_set_pd = pd.read_csv(data_folder + "\\" + "20240419_val_HIV_rti_obj_sel_output.csv")

In [24]:
def evaluate_prediction(df, predictions):
    df_tmp = df.copy()
    ct = pd.crosstab(df_tmp["Actual Labels"], df_tmp[predictions])
    print(ct)
    row_percentage = ct.div(ct.sum(axis = 1), axis = 0)
    print(row_percentage)

In [25]:
print("*************************************************")
print("Val 1")
print("*************************************************")
evaluate_prediction(val_1_pd, "Predictions")
print("*************************************************")
print("Val 2")
print("*************************************************")
evaluate_prediction(val_2_pd, "Predictions")
print("*************************************************")
print("Heart")
print("*************************************************")
evaluate_prediction(val_heart_set_pd, "Predictions")
print("*************************************************")
print("HIV")
print("*************************************************")
evaluate_prediction(val_HIV_set_pd, "Predictions")

*************************************************
Val 1
*************************************************
Predictions        0     1     2
Actual Labels                   
0.0            11417   747   261
0.5             1460  6809  1653
1.0              130  1983  2432
Predictions           0         1         2
Actual Labels                              
0.0            0.918873  0.060121  0.021006
0.5            0.147148  0.686253  0.166599
1.0            0.028603  0.436304  0.535094
*************************************************
Val 2
*************************************************
Predictions        0     1     2
Actual Labels                   
0.0            11670  1055   295
0.5             1270  6428  1750
1.0               92  1713  2778
Predictions           0         1         2
Actual Labels                              
0.0            0.896313  0.081029  0.022657
0.5            0.134420  0.680356  0.185224
1.0            0.020074  0.373773  0.606153
******************

In [54]:
def evaluate_cutoff(df, column, cutoff):
    df_tmp = df.copy()
    df_tmp["cutoff"] = (df_tmp[column] > cutoff)
    ct = pd.crosstab(df_tmp["Actual Labels"], df_tmp["cutoff"])
    
    print(ct)
    
    row_percentage = ct.div(ct.sum(axis = 1), axis = 0)
    
    print(row_percentage)
    


In [None]:
#2.8%: 0.425; 3% 0.442.

In [55]:
k = 0.442

In [56]:
evaluate_cutoff(val_1_pd, "Outputs", k)

cutoff         False  True 
Actual Labels              
0.0            11807    618
0.5             3004   6918
1.0              137   4408
cutoff            False     True 
Actual Labels                    
0.0            0.950262  0.049738
0.5            0.302762  0.697238
1.0            0.030143  0.969857


In [57]:
evaluate_cutoff(val_2_pd, "Outputs", k)

cutoff         False  True 
Actual Labels              
0.0            12220    800
0.5             2611   6837
1.0              178   4405
cutoff            False     True 
Actual Labels                    
0.0            0.938556  0.061444
0.5            0.276355  0.723645
1.0            0.038839  0.961161


In [58]:
evaluate_cutoff(val_heart_set_pd, "Outputs", k)

cutoff         False  True 
Actual Labels              
0.0             6540    434
0.5             2567   6862
1.0               94   2342
cutoff            False     True 
Actual Labels                    
0.0            0.937769  0.062231
0.5            0.272245  0.727755
1.0            0.038588  0.961412


In [59]:
evaluate_cutoff(val_HIV_set_pd, "Outputs", k)

cutoff         False  True 
Actual Labels              
0.0             2016    314
0.5              639    852
1.0               61    518
cutoff            False     True 
Actual Labels                    
0.0            0.865236  0.134764
0.5            0.428571  0.571429
1.0            0.105354  0.894646


In [None]:
def calculate_norm(df_relevance, output_column):
    df_tmp = df_relevance.copy()
    groups = [df1 for _, df1 in df_tmp.groupby("Objective")]
    for i in range(0, len(groups)):
        maximum = groups[i][output_column].max()
        minimum = groups[i][output_column].min()
        groups[i]["Normed relevance"] = (groups[i][output_column] - minimum)/(maximum - minimum)
        p95 = groups[i][output_column].quantile(0.95)
        p90 = groups[i][output_column].quantile(0.90)
        groups[i]["Normed relevance 95"] = (groups[i][output_column] - minimum)/(p95 - minimum)
        groups[i].loc[groups[i]["Normed relevance 95"] > 1, 'Normed relevance 95'] = 1 
        groups[i]["Normed relevance 90"] = (groups[i][output_column] - minimum)/(p90 - minimum)
        groups[i].loc[groups[i]["Normed relevance 90"] > 1, 'Normed relevance 90'] = 1
    df_norm = pd.concat(groups, ignore_index = True)
    return df_norm.copy()
    

In [None]:
val_1_pd_n9095 = calculate_norm(val_1_pd, "Outputs")
val_2_pd_n9095 = calculate_norm(val_2_pd, "Outputs")
val_heart_pd_n9095 = calculate_norm(val_heart_set_pd, "Outputs")
val_HIV_pd_n9095 = calculate_norm(val_HIV_set_pd, "Outputs")

In [None]:
len(val_heart_set_pd)

In [None]:
print("******************************************")
print("Val 1")
print("******************************************")
roc_analysis(val_1_pd_n9095, "Normed relevance", save = True)
print("******************************************")
print("Val 2")
print("******************************************")
roc_analysis(val_2_pd_n9095, "Normed relevance")
print("******************************************")
print("Val Heart")
print("******************************************")
roc_analysis(val_heart_pd_n9095, "Normed relevance")
print("******************************************")
print("Val HIV")
print("******************************************")
roc_analysis(val_HIV_pd_n9095, "Normed relevance")

In [None]:
evaluate_cutoff(val_1_pd_n9095, "Normed relevance", 0.566)

In [None]:
evaluate_cutoff(val_2_pd_n9095, "Normed relevance", 0.566)

In [None]:
evaluate_cutoff(val_heart_pd_n9095, "Normed relevance", 0.566)

In [None]:
evaluate_cutoff(val_HIV_pd_n9095, "Normed relevance", 0.566)