## Pipeline Variant 1

In [None]:
!pip install transformers -q
!pip install --upgrade accelerate -q
!pip install datasets evaluate -q
!pip install sacrebleu rouge_score jiwer -q
!pip install --upgrade accelerate -q
!pip install wandb -Uqq
!pip install openai

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [3]:
import numpy as np
import pandas as pd
import torch

In [None]:
# Mount Google Drive at /content/drive; the paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')

### Create Pipeline

In [5]:
from google.colab import files

In [7]:
# Copy the project helper module to /content — presumably so that the
# `import cf_custom_functions` in the next cell can resolve it.
!cp /utils/cf_custom_functions.py /content

In [8]:
import os
import re
import json
import pandas as pd
import cf_custom_functions as cf
pd.set_option('display.max_colwidth', None)

### Configuration class

In [32]:
class cfc:
  """Configuration: Google Drive paths used by this pipeline variant."""

  # Root directory of the stored model predictions.
  pred_dir = "/content/drive/MyDrive/predictions/"
  # Directory of this pipeline variant; the prediction CSVs below are written here.
  pipeline_var_dir = "/content/drive/MyDrive/Experimente/PIPELINE/VAR1_LaTeX-OCR_NLG/"

  # CSV with the predictions of the fine-tuned LaTeX-OCR model (per the file name).
  # NOTE(review): spelled "LaTexOCR_..." here, while the pre-trained file loaded
  # further down is spelled "LaTeXOCR_..." — confirm the casing on disk.
  latex_ocr_file_path = "/content/drive/MyDrive/predictions/LatexOCR/LaTexOCR_image-to-latex-fine-tuned_predictions.csv"
  # JSON file that collects the evaluation metrics saved by the cells below.
  metrics_file_path = "/content/drive/MyDrive/Experimente/PIPELINE/VAR1_LaTeX-OCR_NLG/metrics_ocr_nlg.json"


### Functions

In [10]:
def remove_punctuation(latex_code):
    r"""Strip trailing separators from every line of a LaTeX string.

    At each line end, removes any run of ``\,.``, ``\,``, ``~``, ``.`` or
    whitespace, optionally followed by a single comma.
    """
    trailing_noise = re.compile(r"(\\,\.|\\,|~|\.|\s)*,?$", re.MULTILINE)
    return trailing_noise.sub('', latex_code)

In [11]:
def pre_processing_OCR(df_ocr:pd.DataFrame)-> pd.DataFrame:
  """Return a copy of ``df_ocr`` with a cleaned ``input_nlg`` column.

  Each value of the ``prediciton`` column (column name as spelled throughout
  this notebook) is passed through ``remove_punctuation``. The caller's frame
  is not modified.
  """
  result = df_ocr.copy()
  result["input_nlg"] = [remove_punctuation(raw) for raw in result["prediciton"]]
  return result

In [None]:
# NOTE(review): neither word_tokenize nor the 'punkt' data is referenced by any
# cell visible in this notebook; the tokenizer defined below is regex-based.
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')


def tokenize_prediction(prediction:str)-> str:
  """Split a LaTeX string into space-separated tokens.

  Kept tokens: LaTeX commands (a backslash followed by letters), runs of word
  characters, and the symbols + - ( ) { } ^ | [ ] < > /. Every other character
  is dropped.
  """
  token_re = re.compile(r"\\[a-zA-Z]+|[\w]+|[+\-]|\(|\)|\{|\}|\^|\||\[|\]|<|>|/")
  return ' '.join(token_re.findall(prediction)).strip()

In [27]:
def postprocess_latex_predictions(df_preds:pd.DataFrame,df_column:str) -> pd.DataFrame:
  """Return a copy of ``df_preds`` with a tokenized ``input_nlg`` column.

  Each value of ``df_column`` is run through ``tokenize_prediction``. The
  caller's frame is not modified.
  """
  result = df_preds.copy()
  result["input_nlg"] = [tokenize_prediction(latex) for latex in result[df_column]]
  return result

In [14]:
def generate_NLG_predictions_from_OCR(test_data:pd.DataFrame, model: object, tokenizer: object,input_column:str,output_column:str,prompt:str, max_length:int=50)-> pd.DataFrame:
    """Generate one text prediction per row with a seq2seq model.

    Arguments:
        test_data {pd.DataFrame} -- Input rows; not modified
        model {object} -- Model exposing ``generate(**encoded, max_length=...)``
        tokenizer {object} -- Callable tokenizer that also provides ``batch_decode``
        input_column {str} -- Column holding the model input text
        output_column {str} -- Column that receives the decoded prediction
        prompt {str} -- Prefix prepended to every input (may be empty)

    Keyword Arguments:
        max_length {int} -- Forwarded to ``model.generate`` (default: {50})

    Returns:
        pd.DataFrame -- Copy of ``test_data`` with ``output_column`` added
    """
    df = test_data.copy()
    y_preds = []

    for text in df[input_column]:
        model_input = prompt + text
        encoded_input = tokenizer(model_input, truncation=True, return_tensors="pt")
        output = model.generate(**encoded_input, max_length=max_length)
        decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)
        # batch_decode returns a list of strings; exactly one input is encoded
        # per iteration, so store that single string rather than the list.
        y_preds.append(decoded_output[0])

    df[output_column] = y_preds
    return df

In [15]:
def post_processing_multi_predictions_OCR(df_preds:pd.DataFrame, pp_column:str, pp_out_column:str)-> pd.DataFrame:
  """Normalise the text in ``pp_column`` and store it in ``pp_out_column``.

  Works on a copy of ``df_preds``. Each value is lower-cased, its " - " / " + "
  matches are replaced via ``cf.translate_sign``, the result is passed through
  ``cf.translate_numbers_to_text``, every character that is neither a word
  character nor whitespace becomes a space, and whitespace runs are collapsed.
  """

  def _normalise(text):
    lowered = text.lower()
    with_sign_words = re.sub(r' - | \+ ', cf.translate_sign, lowered)
    with_number_words = cf.translate_numbers_to_text(with_sign_words)
    without_punctuation = re.sub(r'[^\w\s]', ' ', with_number_words)
    return re.sub(r'\s+', ' ', without_punctuation).strip()

  result = df_preds.copy()
  result[pp_out_column] = [_normalise(value) for value in result[pp_column]]
  return result

In [16]:
def generate_GPT_predictions(test_data:pd.DataFrame, model: object, tokenizer: object, input_col:str, output_col:str, prompt:str, max_length:int=50, max_new_tokens:int=None)-> pd.DataFrame:
    """Generate one text prediction per row with a causal (GPT-style) model.

    The decoded sequence of such a model contains the prompt itself, so the
    model input is removed from the decoded text before it is stored.

    Arguments:
        test_data {pd.DataFrame} -- Input rows; not modified
        model {object} -- Model exposing ``generate``
        tokenizer {object} -- Callable tokenizer that also provides ``batch_decode``
        input_col {str} -- Column holding the model input text
        output_col {str} -- Column that receives the generated text
        prompt {str} -- Prefix prepended to every input

    Keyword Arguments:
        max_length {int} -- Forwarded to ``model.generate``; per the transformers
            generation docs this limit includes the prompt tokens (default: {50})
        max_new_tokens {int} -- When given, forwarded instead of ``max_length``
            so only newly generated tokens are limited (default: {None})

    Returns:
        pd.DataFrame -- Copy of ``test_data`` with ``output_col`` added
    """
    df = test_data.copy()
    if max_new_tokens is None:
        length_kwargs = {"max_length": max_length}
    else:
        length_kwargs = {"max_new_tokens": max_new_tokens}
    y_preds = []

    for text in df[input_col]:
        model_input = prompt + text
        encoded_input = tokenizer(model_input, return_tensors="pt")
        output = model.generate(**encoded_input, **length_kwargs)
        decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)
        # Remove the echoed model input from the decoded text.
        y_preds.append(decoded_output[0].replace(model_input, ""))

    df[output_col] = y_preds
    return df

### Load OCR predictions

In [38]:
# Predictions of the pre-trained LaTeX-OCR model (per the file name).
df_latex_ocr_pt = cf.load_predictions_from_file("/content/drive/MyDrive/predictions/LatexOCR/LaTeXOCR_image-to-latex-pre-trained_predictions.csv")

In [None]:
# OCR metrics on the raw predictions; "prediciton" is the column name as used throughout this notebook.
cf.compute_OCR_evaluation_metrics(df_latex_ocr_pt,"prediciton")

In [None]:
# Tokenize the pre-trained model's predictions into "input_nlg", then evaluate the tokenized form.
df_latex_ocr_pt_post = postprocess_latex_predictions(df_latex_ocr_pt, "prediciton")
cf.compute_OCR_evaluation_metrics(df_latex_ocr_pt_post,"input_nlg")

In [None]:
# Load the predictions of the best OCR model (file configured in cfc.latex_ocr_file_path).
df_latex_ocr = cf.load_predictions_from_file(cfc.latex_ocr_file_path)

In [None]:
# OCR metrics on the raw predictions of the best OCR model.
cf.compute_OCR_evaluation_metrics(df_latex_ocr,"prediciton")

In [None]:
# Tokenize the predictions into "input_nlg"; this frame is the input of every NLG model below.
df_input_nlg = postprocess_latex_predictions(df_latex_ocr, "prediciton")

In [None]:
# OCR metrics on the tokenized predictions; kept in a variable so the next cell can save them.
metrics_latexocr = cf.compute_OCR_evaluation_metrics(df_input_nlg,"input_nlg")
print(metrics_latexocr)

In [None]:
# Store the OCR metrics under the key "LatexOCR" in the shared metrics file.
cf.save_evaluation_metrics("LatexOCR",metrics_latexocr,cfc.metrics_file_path)

### Loading Testset for computing the truth

In [None]:
# Load the test set; its "formula" column is fed to the NLG models in the "truth" cells below.
df_test = pd.read_json("/data/datafiles/test_data.json")

## Natural Language Generation Models

### BART-Base

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Hub checkpoint; only the tokenizer is loaded from it.
checkpoint ="facebook/bart-base"
# Drive directory the model weights are loaded from (fine-tuned, per the directory name).
bard_model_dir = "/content/drive/MyDrive/models/NLG_models/Bart-Base_latex-to-text_finetuned-hyper"

# Name used for this model's metrics entry and prediction file.
bard_model_name = "Bard-Base_OCR_to_text"
# NOTE(review): tokenizer comes from the hub checkpoint, weights from the Drive
# directory — confirm the fine-tuned model uses an unchanged vocabulary.
bard_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
bard_model = AutoModelForSeq2SeqLM.from_pretrained(bard_model_dir)

In [None]:
# BART receives the tokenized LaTeX without a task prefix (empty prompt).
df_preds_bard = generate_NLG_predictions_from_OCR(df_input_nlg, bard_model, bard_tokenizer,"input_nlg","pred_from_ocr","")
df_preds_bard_pp = post_processing_multi_predictions_OCR(df_preds_bard,"pred_from_ocr","clean_pred_from_ocr")
# cfc.pipeline_var_dir holds the same directory string that was repeated here as a literal.
df_preds_bard_pp.to_csv(f"{cfc.pipeline_var_dir}{bard_model_name}_prediction.csv")

In [None]:
# Evaluate the cleaned predictions and store the metrics under the model's name.
metrics_bard = cf.compute_evaluation_metrics(df_preds_bard_pp,"clean_pred_from_ocr")
cf.save_evaluation_metrics(bard_model_name,metrics_bard,cfc.metrics_file_path)

#### Check the truth

In [None]:
# Same pipeline, but fed with the test set's "formula" column instead of OCR output;
# the metrics are stored under "<model name>-truth".
df_preds_bard_truth = generate_NLG_predictions_from_OCR(df_test, bard_model, bard_tokenizer,"formula","pred_from_truth","")
df_preds_bard_truth_pp = post_processing_multi_predictions_OCR(df_preds_bard_truth,"pred_from_truth","clean_pred_from_truth")
metrics_bard_truth = cf.compute_evaluation_metrics(df_preds_bard_truth_pp,"clean_pred_from_truth")
cf.save_evaluation_metrics(f"{bard_model_name}-truth",metrics_bard_truth,cfc.metrics_file_path)

### T5-Base

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Hub checkpoint; only the tokenizer is loaded from it.
t5_checkpoint = "t5-base"
# Drive directory the model weights are loaded from (fine-tuned, per the directory name).
t5_model_dir = "/content/drive/MyDrive/models/NLG_models/T5-Base-finetuned-latex-to-text-hypertuned"
# Name used for this model's metrics entry and prediction file.
t5_model_name = "T5-Base_OCR_to_text"

t5_tokenizer = AutoTokenizer.from_pretrained(t5_checkpoint)
t5_model = AutoModelForSeq2SeqLM.from_pretrained(t5_model_dir)

In [None]:
# T5 is prompted with the task prefix "translate Latex to Text: ".
df_preds_t5 = generate_NLG_predictions_from_OCR(df_input_nlg, t5_model, t5_tokenizer,"input_nlg","pred_from_ocr","translate Latex to Text: ")
df_preds_t5_pp = post_processing_multi_predictions_OCR(df_preds_t5,"pred_from_ocr","clean_pred_from_ocr")
# cfc.pipeline_var_dir holds the same directory string that was repeated here as a literal.
df_preds_t5_pp.to_csv(f"{cfc.pipeline_var_dir}{t5_model_name}_prediction.csv")
metrics_t5 = cf.compute_evaluation_metrics(df_preds_t5_pp,"clean_pred_from_ocr")
cf.save_evaluation_metrics(t5_model_name,metrics_t5,cfc.metrics_file_path)

In [None]:
# Same pipeline, but fed with the test set's "formula" column instead of OCR output;
# the metrics are stored under "<model name>-truth".
df_preds_t5_truth = generate_NLG_predictions_from_OCR(df_test, t5_model, t5_tokenizer,"formula","pred_from_truth","translate Latex to Text: ")
df_preds_t5_truth_pp = post_processing_multi_predictions_OCR(df_preds_t5_truth,"pred_from_truth","clean_pred_from_truth")
metrics_t5_truth = cf.compute_evaluation_metrics(df_preds_t5_truth_pp,"clean_pred_from_truth")
cf.save_evaluation_metrics(f"{t5_model_name}-truth",metrics_t5_truth,cfc.metrics_file_path)

### FLAN-T5-Base

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Hub checkpoint; only the tokenizer is loaded from it.
flant5_checkpoint = "google/flan-t5-base"
# Drive directory the model weights are loaded from (fine-tuned, per the directory name).
flant5_model_dir = "/content/drive/MyDrive/models/NLG_models/FLAN-T5-finetuned-latex-to-text_hypertuned"
# Name used for this model's metrics entry and prediction file.
flant5_model_name = "FLAN-T5-Base_OCR_to_text"

flant5_tokenizer = AutoTokenizer.from_pretrained(flant5_checkpoint)
flant5_model = AutoModelForSeq2SeqLM.from_pretrained(flant5_model_dir)

In [None]:
# FLAN-T5 is prompted with the task prefix "translate Latex to Text: ".
df_preds_flant5 = generate_NLG_predictions_from_OCR(df_input_nlg, flant5_model, flant5_tokenizer,"input_nlg","pred_from_ocr","translate Latex to Text: ")
df_preds_flant5_pp = post_processing_multi_predictions_OCR(df_preds_flant5,"pred_from_ocr","clean_pred_from_ocr")
# cfc.pipeline_var_dir holds the same directory string that was repeated here as a literal.
df_preds_flant5_pp.to_csv(f"{cfc.pipeline_var_dir}{flant5_model_name}_prediction.csv")
metrics_flant5 = cf.compute_evaluation_metrics(df_preds_flant5_pp,"clean_pred_from_ocr")
cf.save_evaluation_metrics(flant5_model_name,metrics_flant5,cfc.metrics_file_path)

#### Check the truth

In [None]:
# Same pipeline, but fed with the test set's "formula" column instead of OCR output;
# the metrics are stored under "<model name>-truth".
df_preds_flant5_truth = generate_NLG_predictions_from_OCR(df_test, flant5_model, flant5_tokenizer,"formula","pred_from_truth","translate Latex to Text: ")
df_preds_flant5_truth_pp = post_processing_multi_predictions_OCR(df_preds_flant5_truth,"pred_from_truth","clean_pred_from_truth")
metrics_flant5_truth = cf.compute_evaluation_metrics(df_preds_flant5_truth_pp,"clean_pred_from_truth")
cf.save_evaluation_metrics(f"{flant5_model_name}-truth",metrics_flant5_truth,cfc.metrics_file_path)

### ChatGPT (`text-davinci-003` completion model)

In [67]:
import os

import openai

# Set up OpenAI API credentials and parameters for GPT-3.
# The key is read from the OPENAI_API_KEY environment variable so it never has
# to be written into the notebook; "xxx" is the original placeholder, kept as
# the fallback value.
openai.api_key = os.environ.get("OPENAI_API_KEY", "xxx")
parameters = {"engine": "text-davinci-003", "temperature": 0.2, "max_tokens": 256}

def get_transcription(formula: str) -> list:
    """Ask the completion model to transcribe a formula into spoken text.

    Arguments:
        formula {str} -- Formula text appended to the instruction prompt

    Returns:
        list -- The stripped completion text of the first choice, split into lines
    """
    prompt_text = "Transcript the formula in spoken text:" + " " + formula
    # Pass the prompt per call instead of writing it into the shared
    # ``parameters`` dict, so no request leaves state behind for the next one.
    response = openai.Completion.create(**{**parameters, "prompt": prompt_text})
    return response.choices[0].text.strip().split("\n")

In [68]:
def create_transcriptions_OCR(df,input_col, output_col):
  """Transcribe every formula in ``input_col`` via ``get_transcription``.

  Arguments:
      df -- DataFrame with the formulas; not modified
      input_col -- Column holding the formula text
      output_col -- Column that receives the transcription

  Returns:
      Copy of ``df`` with ``output_col`` added
  """
  df = df.copy()
  y_preds = []

  for formula in df[input_col]:
      # get_transcription returns the answer as a list of lines. Join them into
      # one plain string; str(list) would store the Python list repr instead,
      # including brackets, quotes and escape sequences.
      y_preds.append(" ".join(get_transcription(formula)))
  df[output_col] = y_preds
  return df

In [None]:
# Transcribe the tokenized OCR output with the completion model, clean it, save it and evaluate it.
df_chatgpt = create_transcriptions_OCR(df_input_nlg,"input_nlg","pred_from_ocr")
df_chatgpt_pp = post_processing_multi_predictions_OCR(df_chatgpt,"pred_from_ocr","clean_pred_from_ocr")
# cfc.pipeline_var_dir holds the same directory string that was repeated here as a literal.
df_chatgpt_pp.to_csv(f"{cfc.pipeline_var_dir}ChatGPT_OCR-to-text_prediction.csv")
metrics_chatgpt = cf.compute_evaluation_metrics(df_chatgpt_pp,"clean_pred_from_ocr")
cf.save_evaluation_metrics("ChatGPT_OCR-to-text",metrics_chatgpt,cfc.metrics_file_path)

#### Check the truth

In [None]:
# Same pipeline, but fed with the test set's "formula" column instead of OCR output.
# NOTE(review): the columns are still named "*_from_ocr" here, while the BART/T5
# truth cells use "*_from_truth".
df_chatgpt_truth = create_transcriptions_OCR(df_test,"formula","pred_from_ocr")
df_chatgpt_pp_truth = post_processing_multi_predictions_OCR(df_chatgpt_truth,"pred_from_ocr","clean_pred_from_ocr")
metrics_chatgpt_truth = cf.compute_evaluation_metrics(df_chatgpt_pp_truth,"clean_pred_from_ocr")
cf.save_evaluation_metrics(f"ChatGPT_OCR-to-text-truth",metrics_chatgpt_truth,cfc.metrics_file_path)

### GPT-3.5 Turbo

In [74]:
import openai

def get_transcription_gptx(formula,model): #model = "gpt-4", "gpt-3.5-turbo"
  """Translate a LaTeX expression to English with an OpenAI chat model.

  Arguments:
      formula -- LaTeX string appended to the instruction
      model -- Chat model name, e.g. "gpt-4" or "gpt-3.5-turbo"

  Returns:
      The message content of the first returned choice
  """
  import os

  # Take the key from the environment instead of a literal in the notebook.
  # When the variable is unset, whatever key is already configured on the
  # ``openai`` module is left untouched rather than overwritten per call.
  env_key = os.environ.get("OPENAI_API_KEY")
  if env_key:
    openai.api_key = env_key

  response = openai.ChatCompletion.create(
    model=model,
    messages=[
      {"role": "user", "content": "Translate this LaTeX expression to English:"+ formula}],
    temperature = 0.2,
  )
  return response.choices[0].message.content

In [75]:
def generate_transcriptions_with_gptx(df,model,input_col, output_col):
  """Transcribe every formula in ``input_col`` with an OpenAI chat model.

  Arguments:
      df -- DataFrame with the formulas; not modified
      model -- Chat model name passed on to ``get_transcription_gptx``
      input_col -- Column holding the formula text
      output_col -- Column that receives the transcription

  Returns:
      Copy of ``df`` with ``output_col`` added
  """
  # Credentials are handled by get_transcription_gptx; no key is set here.
  df = df.copy()
  df[output_col] = [get_transcription_gptx(formula, model) for formula in df[input_col]]
  return df

In [None]:
# Transcribe the tokenized OCR output with gpt-3.5-turbo, clean it, save it and evaluate it.
df_gpt35t = generate_transcriptions_with_gptx(df_input_nlg,"gpt-3.5-turbo","input_nlg","pred_from_ocr")
df_gpt35t_pp = post_processing_multi_predictions_OCR(df_gpt35t,"pred_from_ocr","clean_pred_from_ocr")
# cfc.pipeline_var_dir holds the same directory string that was repeated here as a literal.
df_gpt35t_pp.to_csv(f"{cfc.pipeline_var_dir}GPT-35T_OCR-to-text_prediction.csv")
metrics_gpt35t = cf.compute_evaluation_metrics(df_gpt35t_pp,"clean_pred_from_ocr")
cf.save_evaluation_metrics("GPT-35T_OCR-to-text",metrics_gpt35t,cfc.metrics_file_path)

#### Checking the truth

In [None]:
# Same pipeline, but fed with the test set's "formula" column instead of OCR output.
# NOTE(review): the columns are still named "*_from_ocr" here, while the BART/T5
# truth cells use "*_from_truth".
df_gpt35t_truth = generate_transcriptions_with_gptx(df_test,"gpt-3.5-turbo","formula","pred_from_ocr")
df_gpt35t_pp_truth = post_processing_multi_predictions_OCR(df_gpt35t_truth,"pred_from_ocr","clean_pred_from_ocr")
metrics_gpt35t_truth = cf.compute_evaluation_metrics(df_gpt35t_pp_truth,"clean_pred_from_ocr")
cf.save_evaluation_metrics(f"GPT-35T_OCR-to-text-truth",metrics_gpt35t_truth,cfc.metrics_file_path)

### GPT-4

In [None]:
# Transcribe the tokenized OCR output with gpt-4, clean it, save it and evaluate it.
df_gpt4 = generate_transcriptions_with_gptx(df_input_nlg,"gpt-4","input_nlg","pred_from_ocr")
df_gpt4_pp = post_processing_multi_predictions_OCR(df_gpt4,"pred_from_ocr","clean_pred_from_ocr")
# cfc.pipeline_var_dir holds the same directory string that was repeated here as a literal.
df_gpt4_pp.to_csv(f"{cfc.pipeline_var_dir}GPT-4_OCR-to-text_prediction.csv")
metrics_gpt4 = cf.compute_evaluation_metrics(df_gpt4_pp,"clean_pred_from_ocr")
cf.save_evaluation_metrics("GPT-4_OCR-to-text",metrics_gpt4,cfc.metrics_file_path)

#### Checking the truth

In [None]:
# Same pipeline, but fed with the test set's "formula" column instead of OCR output.
# NOTE(review): the columns are still named "*_from_ocr" here, while the BART/T5
# truth cells use "*_from_truth".
df_gpt4_truth = generate_transcriptions_with_gptx(df_test,"gpt-4","formula","pred_from_ocr")
df_gpt4_pp_truth = post_processing_multi_predictions_OCR(df_gpt4_truth,"pred_from_ocr","clean_pred_from_ocr")
metrics_gpt4_truth = cf.compute_evaluation_metrics(df_gpt4_pp_truth,"clean_pred_from_ocr")
cf.save_evaluation_metrics(f"GPT-4_OCR-to-text-truth",metrics_gpt4_truth,cfc.metrics_file_path)

### GPT-2 Medium

In [None]:
# AutoModelWithLMHead is deprecated in transformers; AutoModelForCausalLM is the
# replacement for GPT-2. AutoTokenizer is imported here as well so the cell
# does not depend on an earlier cell's import.
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hub checkpoint; only the tokenizer is loaded from it.
gpt2_checkpoint = "gpt2-medium"
# Drive directory the model weights are loaded from (fine-tuned, per the directory name).
gpt2_model_dir = "/content/drive/MyDrive/models/NLG_models/GPT-2-Medium-latex-to-text-hypertuned"
# Name used for this model's metrics entry and prediction file.
gpt2_model_name = "GPT2-Medium_OCR_to_text"

gpt2_tokenizer = AutoTokenizer.from_pretrained(gpt2_checkpoint)
gpt2_model = AutoModelForCausalLM.from_pretrained(gpt2_model_dir)

In [None]:
# GPT-2 is prompted with "generate spoken text of: "; the prompt is removed from the generated text.
df_preds_gpt2 = generate_GPT_predictions(df_input_nlg, gpt2_model, gpt2_tokenizer,"input_nlg","pred_from_ocr","generate spoken text of: ")
df_preds_gpt2_pp = post_processing_multi_predictions_OCR(df_preds_gpt2,"pred_from_ocr","clean_pred_from_ocr")
# cfc.pipeline_var_dir holds the same directory string that was repeated here as a literal.
df_preds_gpt2_pp.to_csv(f"{cfc.pipeline_var_dir}{gpt2_model_name}_prediction.csv")
metrics_gpt2 = cf.compute_evaluation_metrics(df_preds_gpt2_pp,"clean_pred_from_ocr")
cf.save_evaluation_metrics(gpt2_model_name,metrics_gpt2,cfc.metrics_file_path)

### Summary

In [101]:
# Metric columns shown in the summary, in display order.
METRIC_COLUMNS = ["BLEU", "TER", "TER-ACC", "ROUGE-1", "ROUGE-2", "ROUGE-L"]

# The metrics file is transposed after loading so that each model becomes one
# row; the metric columns are then converted to numbers and the rows are
# ordered by BLEU, best first.
df_nlg1 = (
    pd.read_json(cfc.metrics_file_path)
    .transpose()[METRIC_COLUMNS]
    .apply(pd.to_numeric)
    .sort_values(by="BLEU", ascending=False)
)

# Show every metric with two decimals.
formatted_df = df_nlg1.style.format({column: "{:.2f}" for column in METRIC_COLUMNS})

formatted_df.set_table_styles([
    {'selector': 'tr', 'props': [('text-align', 'left')]},
    {'selector': 'td', 'props': [('text-align', 'right')]}
])

display(formatted_df)

Unnamed: 0,BLEU,TER,TER-ACC,ROUGE-1,ROUGE-2,ROUGE-L
LatexOCR,77.48,23.45,76.55,86.14,76.45,86.05
Bard-Base_OCR_to_text,59.14,36.14,63.86,74.99,55.45,72.49
T5-Base_OCR_to_text,58.27,36.91,63.09,74.11,54.59,71.08
FLAN-T5-Base_OCR_to_text,56.54,38.1,61.9,73.73,54.41,70.31
ChatGPT_OCR-to-text,55.75,44.4,55.6,75.17,59.26,72.48
GPT-4_OCR-to-text,52.48,41.04,58.96,75.22,55.49,72.38
GPT-35T_OCR-to-text,39.51,71.51,28.49,66.5,46.6,62.94
GPT2-Medium_OCR_to_text,38.74,64.93,35.07,53.45,36.77,50.52


In [None]:
# Render the sorted metrics table as LaTeX source and print it.
df_nlg1_latex = df_nlg1.to_latex(caption='Variante 1 - Experiment Pipeline')
print(df_nlg1_latex)