## Install dependencies

In [None]:
!pip install transformers
!pip install sentencepiece
!pip install datasets
!pip install jiwer

## Import dependencies

In [2]:
from transformers import AutoModelWithLMHead, AutoTokenizer, AutoModelForSeq2SeqLM, BartTokenizer, BartForConditionalGeneration
import datasets

import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [3]:
def error_correct(text, model_name):
  """Generate candidate corrections for a transcript with a seq2seq model.

  The checkpoint is loaded through the ``Auto*`` classes, which select the
  architecture from the checkpoint's own configuration, so the same function
  serves both the BART and the T5 checkpoints used in this notebook.

  Args:
    text: the (possibly erroneous) transcript to correct.
    model_name: Hugging Face Hub id or local directory of the checkpoint.

  Returns:
    A list with the five decoded beam-search candidates, in the order
    returned by ``generate``.
  """
  # Dispatch on the checkpoint config instead of hard-coding the BART
  # classes, which cannot correctly load a T5 checkpoint.
  model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
  tokenizer = AutoTokenizer.from_pretrained(model_name)

  input_ids = tokenizer.encode(text, return_tensors='pt', add_special_tokens=True)

  # Beam search with 5 beams, returning all 5 hypotheses; the n-gram and
  # repetition penalties discourage the model from looping on its output.
  generated_ids = model.generate(input_ids=input_ids,
                                 num_return_sequences=5,
                                 num_beams=5,
                                 max_length=512,
                                 no_repeat_ngram_size=2,
                                 repetition_penalty=3.5,
                                 length_penalty=1.0,
                                 early_stopping=True
                                 )

  preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]

  return preds


## Trained Models
#### Please note that all the trained models are currently available on the Hugging Face Hub; however, due to the anonymity requirement, we share the trained models through the following Google Drive link instead: https://drive.google.com/drive/folders/1uowiKAgk3DW48QumeCXEoDzuijN8iqTV?usp=sharing

#### Tip for reproducing: download the models from Google Drive and extract them into the `models` directory of the codebase

In [None]:
# Checkpoint identifiers used throughout the notebook.
#
# The base models are Hugging Face Hub ids; every fine-tuned model is a
# directory under MODELS_DIR. Change MODELS_DIR if the checkpoints were
# extracted somewhere other than the default location.
MODELS_DIR = '../models'

# Base models - literature
bart_base = 'facebook/bart-base'
bart_large = 'facebook/bart-large'

# ---- BART fine-tuned models ----

# Standard Objective
bart_clinical = f'{MODELS_DIR}/bart-finetuned-pubmed'
bart_pubmed = f'{MODELS_DIR}/bart-paraphrase-pubmed-1.1'
bart_mlm = f'{MODELS_DIR}/bart-mlm-pubmed'

# Hybrid Objective
bart_mlm_paraphrasing = f'{MODELS_DIR}/bart-mlm-paraphrasing'
bart_paraphrasing_mlm = f'{MODELS_DIR}/bart-paraphrasing-mlm'

# Domain-specific Objective
bart_med_term = f'{MODELS_DIR}/bart-mlm-pubmed-medterm'
bart_cm = f'{MODELS_DIR}/bart-med-term-conditional-masking'
bart_cm_0 = f'{MODELS_DIR}/bart-med-term-conditional-masking-0'


# ---- T5 fine-tuned models ----

# Standard Objective
t5_clinical = f'{MODELS_DIR}/t5-small-finetuned-pubmed'
t5_pubmed = f'{MODELS_DIR}/t5-small-paraphrase-pubmed'
t5_mlm = f'{MODELS_DIR}/t5-small-mlm-pubmed'

# Hybrid Objective
t5_mlm_paraphrasing = f'{MODELS_DIR}/t5-small-mlm-paraphrasing'
t5_small_paraphrasing_mlm = f'{MODELS_DIR}/t5-small-paraphrasing-mlm'

# Domain-specific Objective
t5_small_med_term_mlm = f'{MODELS_DIR}/t5-small-med-term-mlm'
t5_small_cm = f'{MODELS_DIR}/t5-small-med-term-conditional-masking'
t5_small_cm_0 = f'{MODELS_DIR}/t5-small-med-term-conditional-masking-0'


## Test scenarios

In [None]:
# Test sentences taken from ASR outputs.
# Each pair is (gold reference, raw ASR transcript); a correct model output
# for trans_N is the matching ref_N. The misspellings and wrong words in the
# transcripts are the recognition errors under test and are intentional.
ref_1, trans_1 = (
    'Have you noticed any changes in your weight?',
    'Have you noticed any changes in your wit?',
)

ref_2, trans_2 = (
    'And I know that youve been on fortnightly Adalimumab',
    'Andi I know that youve been on fortnightly Adelaida map',
)

ref_3, trans_3 = (
    'Okay have you noticed any mucus in your bowel motions?',
    'Okay Have you noticed any mucus in your bible Moshe?',
)

## Predictions

In [None]:
def generate_predictions(ref, tran, model):
  """Print the reference, the ASR transcript and the model's corrections.

  Args:
    ref: gold reference sentence.
    tran: ASR transcript passed to ``error_correct``.
    model: checkpoint identifier forwarded to ``error_correct``.

  Returns:
    None; the three labelled sections are written to stdout, with the
    labels wrapped in ANSI bold escape codes.
  """
  bold_on, bold_off = '\033[1m', '\033[0m'

  # Run the model before printing anything, as the sections depend on it.
  candidates = error_correct(tran, model)

  for heading, body in (('Gold Reference: ', ref), ('ASR Output: ', tran)):
    print(bold_on + heading + bold_off)
    print(body)

  print(bold_on + 'Language Model Output:' + bold_off)
  for candidate in candidates:
    print(candidate)

In [None]:
# T5 paraphrasing-to-masking checkpoint (t5_small_paraphrasing_mlm):
# show its corrections for the second test pair (ref_2 / trans_2).
generate_predictions(ref_2, trans_2, t5_small_paraphrasing_mlm)
print()