In [None]:
!pip install simpletransformers

In [None]:
import warnings
warnings.filterwarnings('ignore')
import logging
import pandas as pd
from simpletransformers.ner import NERModel, NERArgs
from sklearn.model_selection import KFold, GroupKFold, train_test_split, GroupShuffleSplit
import numpy as np
from sklearn.metrics import classification_report,  accuracy_score, precision_score, recall_score, f1_score

In [None]:
def _read_annotations(path):
    """Read a pipe-delimited annotation file into a list of field lists.

    Every non-blank line is split on "|"; the callers below name the resulting
    fields sentence_id, words and labels. Blank lines are skipped.
    """
    rows = []
    # Read-only access is sufficient, and the context manager closes the
    # handle even if parsing raises.
    with open(path, "r") as handle:
        for raw_line in handle:
            line = raw_line.rstrip("\n")
            # Lines keep their trailing newline when read, so emptiness has to
            # be tested after stripping; otherwise a blank line becomes a row
            # whose words/labels columns are padded with None.
            if not line.strip():
                continue
            rows.append(line.split("|"))
    return rows


TRAIN_PATH = "TRAININGDATA_similarity_sampled_sentences_BERT_10112022.txt"
TEST_PATH = "TESTINGDATA_07232022_BERT.txt"
ANNOTATION_COLUMNS = ["sentence_id", "words", "labels"]

# Training split: one row per annotated token.
train_rows = _read_annotations(TRAIN_PATH)
print ("Number of instances in training set: ", len(train_rows))
train_data = pd.DataFrame(train_rows, columns=ANNOTATION_COLUMNS)
display(train_data.head(10))

# Test split: same file layout as the training split.
test_rows = _read_annotations(TEST_PATH)
print ("Number of instances in testing set: ", len(test_rows))
test_data = pd.DataFrame(test_rows, columns=ANNOTATION_COLUMNS)
display(test_data.head(10))



In [None]:
# Build the label vocabulary from both splits so that every tag seen in either
# file is known to the model.
unique_train_labels = set(train_data["labels"])
print (len(unique_train_labels))
unique_test_labels = set(test_data["labels"])
print (len(unique_test_labels))
all_unique_labels = unique_train_labels | unique_test_labels
print (len(all_unique_labels))
# Sort instead of relying on set iteration order: string hashing is randomised
# per interpreter session, so list(set) would hand the model a different label
# order on every kernel restart. key=str keeps the sort well-defined even if a
# malformed row left a non-string (e.g. None) in the labels column.
all_unique_labels = sorted(all_unique_labels, key=str)
print (all_unique_labels)

In [None]:
# The whole training file is used for fine-tuning; no evaluation hold-out is
# carved out. The disabled split below (grouped by sentence_id) is kept for
# reference only:
#
# splitter = GroupShuffleSplit(test_size=.20, n_splits=2, random_state = 7)
# split = splitter.split(train_data, groups=train_data['sentence_id'])
# train_inds, eval_inds = next(split)
# train_df = train_data.iloc[train_inds]
# eval_df = train_data.iloc[eval_inds]
# print ("Evaluation set: ", len(eval_df))
# eval_df = eval_df[["sentence_id","words","labels"]]
# display(eval_df.head(3))

print ("Training set: ", len(train_data))
# Keep exactly the three columns used for training, in this order.
train_df = train_data[["sentence_id", "words", "labels"]]
display(train_df.head(3))

In [None]:
# Logging: INFO for everything, but only warnings and above from the
# "transformers" logger.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Configure the model: start from the library defaults and override the
# options below one by one (plain attribute assignment on the args object).
# NOTE(review): learning_rate is 3e-5 here, while the output file names written
# by later cells contain "lr55" -- confirm which one reflects the actual run.
# NOTE(review): save_steps = -1 together with save_model_every_epoch = False
# presumably disables intermediate checkpoints -- confirm against the
# simpletransformers documentation.
training_options = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "evaluate_during_training": False,
    "manual_seed": 4,
    "use_multiprocessing": True,
    "train_batch_size": 4,
    "eval_batch_size": 4,
    "use_auth_token": True,
    "num_train_epochs": 20,
    "learning_rate": 3e-5,
    "save_steps": -1,
    "save_model_every_epoch": False,
}
model_args = NERArgs()
for option_name, option_value in training_options.items():
    setattr(model_args, option_name, option_value)

# BERT-type token classifier initialised from the PubMedBERT checkpoint, with
# the label vocabulary collected from the train and test files.
model = NERModel(
    "bert",
    "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext",
    args=model_args,
    labels=all_unique_labels,
)


In [None]:
# Train the model
# Fine-tunes on the full training frame (columns sentence_id, words, labels);
# no evaluation set is passed because evaluate_during_training is False.
model.train_model(train_df)
    

In [None]:
# Score the model on the data it was trained on (a sanity check of the fit,
# not a measure of generalisation).
# NOTE(review): the third return value is named wrong_preds here, but the test
# evaluation cell treats the same slot as per-sentence predicted labels --
# confirm against the simpletransformers docs and consider renaming.
result, model_outputs, wrong_preds = model.eval_model(train_df)
print ("Results: ", result)

In [None]:
# Evaluate on the test frame. The third return value is kept as
# test_preds_labels; later cells index it per sentence and compare each entry
# with the list of gold labels of the corresponding sentence.
test_result, test_model_outputs, test_preds_labels = model.eval_model(test_data)
print (len(test_preds_labels))

In [None]:
display(test_data.head(2))
# Gold labels collected into one list per sentence. The resulting Series is
# indexed by sentence_id (strings, as read from the file), in groupby order.
test_true_labels = test_data.groupby('sentence_id')['labels'].apply(list)
print(type(test_true_labels.tolist()))
print (len(test_true_labels))

# Spot-check that gold and predicted label sequences have the same length for
# one sentence. Positional access is made explicit with .iloc (integer keys on
# a string-indexed Series only work through a deprecated fallback), and the
# position is clamped so that a smaller test set does not raise.
n_comparable = min(len(test_true_labels), len(test_preds_labels))
if n_comparable:
    spot_check_pos = min(1497, n_comparable - 1)
    print (len(test_true_labels.iloc[spot_check_pos]))
    print (len(test_preds_labels[spot_check_pos]))

In [None]:
# Evaluate the model
# Flatten gold and predicted labels to token level. Sentences are paired by
# position; a sentence is only used when both sequences have the same length,
# otherwise the token-level alignment would be wrong.
test_true_labels_lst = []
test_pred_labels_lst = []
count = 0
skipped = 0
for sentence_true_label, sentence_predict_label in zip(test_true_labels, test_preds_labels):
    if len(sentence_true_label) == len(sentence_predict_label):
        count += 1
        test_true_labels_lst.extend(sentence_true_label)
        test_pred_labels_lst.extend(sentence_predict_label)
    else:
        skipped += 1
print (count)
print ("Sentences skipped (length mismatch): ", skipped)
print ("True labels in eval set: ", len(test_true_labels_lst))
print ("Predicted labels in eval set: ", len(test_pred_labels_lst))

# Entity labels only: 'O' is excluded from precision/recall/F1. Filtering (not
# list.remove) avoids a ValueError when 'O' never occurs, and sorting gives a
# stable row order in the report.
unique_labels = sorted(label for label in set(test_true_labels_lst) if label != 'O')

# Accuracy is over all tokens (including 'O') and must compare gold labels with
# predictions; comparing the gold list with itself always yields 1.0.
print ('Accuracy:', accuracy_score(test_true_labels_lst,test_pred_labels_lst))
print ('Precision:', precision_score(test_true_labels_lst,test_pred_labels_lst, average = "micro", labels = unique_labels))
print ('Recall:', recall_score(test_true_labels_lst,test_pred_labels_lst,average = "micro", labels = unique_labels))
print ('F1 score:', f1_score(test_true_labels_lst,test_pred_labels_lst, average = "micro", labels = unique_labels))
print (classification_report(test_true_labels_lst,test_pred_labels_lst, labels=unique_labels ))


In [None]:
# Per-label classification report as a table: one row per label / average.
report = classification_report(test_true_labels_lst,test_pred_labels_lst, labels=unique_labels, output_dict=True)
report_df = pd.DataFrame(report).transpose()
# NOTE(review): the file names say "lr55" while the model cell sets
# learning_rate = 3e-5 -- confirm which is correct.
file_name = "test_set_lr55_epoch20_similarity_sampling_BERT_10222022.csv"
report_df.to_csv(file_name)

# One line per sentence: the string form of its predicted label list. The
# context manager closes the file even if a write fails.
with open("outputs_lr55_epoch20_similarity_sampling_BERT_10222022.txt", "w") as textfile:
    textfile.writelines(str(element) + "\n" for element in test_preds_labels)


In [None]:
# One line per sentence: the string form of its gold label list. The context
# manager closes the file even if a write fails.
with open("true_labels_test_BERT_10222022.txt", "w") as textfile_true:
    textfile_true.writelines(str(element) + "\n" for element in test_true_labels)

In [None]:
#Document level
# Collapse the token-level test frame to one row per sentence.
# NOTE(review): this rebinds test_data to the sentence-level frame, so the cell
# cannot be re-run (and earlier cells no longer see token rows) without
# reloading the data.
tokens_by_sentence = test_data.groupby('sentence_id')

# Words of a sentence joined with single spaces, broadcast to every token row.
sentence_text = tokens_by_sentence['words'].transform(lambda tokens: ' '.join(tokens))
# Labels of a sentence joined with commas, broadcast the same way.
sentence_tags = tokens_by_sentence['labels'].transform(lambda tags: ','.join(tags))

# All token rows of a sentence are now identical in the three kept columns, so
# dropping duplicates leaves the first row of each sentence.
test_data = (
    test_data
    .assign(sentence=sentence_text, word_labels=sentence_tags)
    [["sentence_id", "word_labels", "sentence"]]
    .drop_duplicates(keep="first")
)
print (len(test_data))
display(test_data.head(10))

In [None]:
# Predict labels for every test sentence and store them next to the gold ones.
# test_data_test is an alias of test_data (not a copy), so the new column is
# visible under both names.
test_data_test = test_data

# One batched predict call instead of one call per row; each entry of
# `predictions` corresponds to the sentence at the same position.
test_sentences = test_data_test["sentence"].tolist()
if test_sentences:
    predictions, _ = model.predict(test_sentences)
else:
    predictions = []

# Each predicted token is a mapping whose values are the predicted label(s);
# flatten them into one label list per sentence.
sentence_predictions = [
    [label for token in sentence_tokens for label in token.values()]
    for sentence_tokens in predictions
]

# Assign the whole column at once. Writing to the row yielded by iterrows()
# is not guaranteed to reach the DataFrame (the row may be a copy), which can
# leave the column empty.
test_data_test["pred_labels"] = sentence_predictions
display(test_data_test)
    

In [None]:
# Persist the sentence-level test frame (gold labels, sentence text and the
# pred_labels column) to CSV, including the DataFrame index.
# NOTE(review): the file name says "lr55" while the model cell sets
# learning_rate = 3e-5 -- confirm which is correct.
test_data_test.to_csv("prediction_TEST_SET_lr55_epoch20_random_similarity_sampling_BERT_10222022.csv")

In [None]:
# Label the user-study sentences (one sentence per line in the input file).
prediction_file = "USERSTUDY_PREDICTIONS_SENTENCES_10302022.txt"
# The context manager closes the file once the lines are read.
with open(prediction_file, "r") as f:
    prediction_sentences = f.readlines()
print (len(prediction_sentences))

# Drop only the trailing newline so that it does not end up in the printed
# output or in the exported sentences; blank lines are kept so that positions
# still match the input file.
sentences_list = [sentence.rstrip("\n") for sentence in prediction_sentences]

# One batched predict call; entry i of the result belongs to sentence i.
if sentences_list:
    predictions, _ = model.predict(sentences_list)
else:
    predictions = []

# Each predicted token is a mapping whose values are the predicted label(s);
# flatten them into one label list per sentence. Distinct names are used so
# that the list of input lines is not overwritten by a prediction.
predictions_list = [
    [label for token in sentence_tokens for label in token.values()]
    for sentence_tokens in predictions
]

for sentence, sentence_prediction in zip(sentences_list, predictions_list):
    print (sentence, sentence_prediction)

In [None]:
# Pair every user-study sentence with its predicted label list; the frame gets
# the default integer column names (0 = sentence, 1 = predictions).
prediction_rows = list(zip(sentences_list, predictions_list))
prediction_df = pd.DataFrame(prediction_rows)
prediction_df.to_csv("prediction_df.csv")