In [1]:
!pip install -q git+https://github.com/huggingface/transformers.git

In [2]:
!pip install -q datasets jiwer

In [3]:
import pandas as pd

# Read the transcription file as fixed-width text without a header row, then give
# the first two parsed columns meaningful names.
df = pd.read_fwf('/kaggle/input/fatima/output.txt', header=None).rename(
    columns={0: "file_name", 1: "text"}
)
df.head()

Unnamed: 0,file_name,text
0,a01-000u-00.png,A MOVE to stop Mr. Gaitskell from
1,a01-000u-01.png,nominating any more Labour life Peers
2,a01-000u-02.png,is to be made at a meeting of Labour
3,a01-000u-03.png,M Ps tomorrow . Mr. Michael Foot has
4,a01-000u-04.png,put down a resolution on the subject


In [4]:
import torch
from torch.utils.data import Dataset
from PIL import Image

class IAMDataset(Dataset):
    """Dataset pairing line images with their transcriptions.

    Each item is a dict with two tensors:

    * ``pixel_values`` - the processor's ``pixel_values`` for the image, with the
      leading batch axis removed.
    * ``labels`` - token ids of the text, padded and truncated to
      ``max_target_length``, with every pad position replaced by -100.

    Args:
        root_dir: String prefix prepended verbatim to each file name, so it must
            already end with a path separator.
        df: DataFrame with ``file_name`` and ``text`` columns.
        processor: Object that is callable on an image (returning something with
            a ``pixel_values`` attribute) and exposes a ``tokenizer``.
        max_target_length: Fixed length of the label sequence.
    """

    def __init__(self, root_dir, df, processor, max_target_length=128):
        self.root_dir = root_dir
        self.df = df
        self.processor = processor
        self.max_target_length = max_target_length

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the encoded image and label ids for the row at position ``idx``."""
        # Positional lookup: ``idx`` is a position in 0..len-1, which matches the
        # index *labels* only when the frame still has its default index. Using
        # .iloc keeps filtered or split frames working without reset_index().
        file_name = self.df['file_name'].iloc[idx]
        text = self.df['text'].iloc[idx]
        # some file names end with jp instead of jpg, the two lines below fix this
        if file_name.endswith('jp'):
            file_name = file_name + 'g'
        # prepare image (i.e. resize + normalize); the context manager closes the
        # file handle once the RGB copy has been made
        with Image.open(self.root_dir + file_name) as handle:
            image = handle.convert("RGB")
        pixel_values = self.processor(image, return_tensors="pt").pixel_values
        # add labels (input_ids) by encoding the text; truncation guarantees that
        # every label sequence has exactly max_target_length entries, so items can
        # always be stacked into a batch
        labels = self.processor.tokenizer(text,
                                          padding="max_length",
                                          truncation=True,
                                          max_length=self.max_target_length).input_ids
        # important: make sure that PAD tokens are ignored by the loss function
        pad_id = self.processor.tokenizer.pad_token_id
        labels = [-100 if token == pad_id else token for token in labels]

        # squeeze(0) drops only the batch axis added by return_tensors="pt"
        return {"pixel_values": pixel_values.squeeze(0), "labels": torch.tensor(labels)}


In [5]:

from transformers import TrOCRProcessor

# Processor for the handwritten TrOCR checkpoint, followed by the dataset that
# applies it to every image/text pair in `df`.
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-large-handwritten")
test_dataset = IAMDataset('/kaggle/input/iam-lines/IAM/image/', df, processor)

preprocessor_config.json:   0%|          | 0.00/224 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]



In [6]:
from torch.utils.data import DataLoader

# Serve the dataset in batches of 8; no shuffle argument is passed.
test_dataloader = DataLoader(test_dataset, batch_size=8)

In [7]:
# Pull a single batch from the loader so its contents can be inspected below.
batch = next(iter(test_dataloader))

In [8]:
# Show the shape of every tensor in the sample batch.
for name, tensor in batch.items():
    print(name, tensor.shape)

pixel_values torch.Size([8, 3, 384, 384])
labels torch.Size([8, 128])


In [9]:
from transformers import TrOCRProcessor

# NOTE(review): this repeats, with the same checkpoint name, the processor load
# already performed in the cell that builds `test_dataset`; it looks redundant.
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-large-handwritten")

In [10]:
# Put the pad token id back wherever the labels hold -100 (in place, on the
# batch's own tensor) so the ids can be decoded to text again.
labels = batch["labels"]
labels.masked_fill_(labels == -100, processor.tokenizer.pad_token_id)
label_str = processor.batch_decode(labels, skip_special_tokens=True)
label_str

['A MOVE to stop Mr. Gaitskell from',
 'nominating any more Labour life Peers',
 'is to be made at a meeting of Labour',
 'M Ps tomorrow. Mr. Michael Foot has',
 'put down a resolution on the subject',
 'and he is to be backed by Mr. Will',
 'Griffiths, M P for Manchester Exchange.',
 'A MOVE to stop Mr. Gaitskell from nominating']

In [11]:
from transformers import VisionEncoderDecoderModel
import torch

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the checkpoint and move it to the chosen device in one step; the bare
# `model` on the last line displays the module structure as the cell output.
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-large-handwritten").to(device)
model

config.json:   0%|          | 0.00/4.13k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.23G [00:00<?, ?B/s]

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-large-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 1024, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-23): 24 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=False)
              (key): Linear(in_features=1024, out_features=1024, bias=False)
              (value): Linear(in_features=1024, out_features=1024, bias=False)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Line

In [13]:
from datasets import load_metric

# Character error rate metric. Its loader script is custom code; passing
# trust_remote_code=True approves it up front so the cell does not stop at an
# interactive "[y/N]" prompt when the notebook is run top to bottom.
# NOTE(review): `load_metric` is deprecated in `datasets`; consider migrating to
# the `evaluate` package.
cer = load_metric("cer", trust_remote_code=True)


  cer = load_metric("cer")


Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

The repository for cer contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/cer.

You can avoid this prompt in future by passing the argument `trust_remote_code=True`.



Do you wish to run the custom code? [y/N] y


In [None]:
from tqdm.notebook import tqdm

print("Running evaluation...")

for batch in tqdm(test_dataloader):
    # Generate token ids for the batch's images on `device`, then decode to text.
    pixel_values = batch["pixel_values"].to(device)
    outputs = model.generate(pixel_values)
    pred_str = processor.batch_decode(outputs, skip_special_tokens=True)

    # Swap -100 back to the pad token id (in place) so the references decode cleanly.
    labels = batch["labels"]
    labels.masked_fill_(labels == -100, processor.tokenizer.pad_token_id)
    label_str = processor.batch_decode(labels, skip_special_tokens=True)

    # Accumulate this batch in the CER metric.
    cer.add_batch(predictions=pred_str, references=label_str)

# Score over everything that was accumulated.
final_score = cer.compute()

Running evaluation...


  0%|          | 0/1670 [00:00<?, ?it/s]




In [None]:
# Report the value most recently stored in `final_score`.
print("Character error rate on test set:", final_score)

Character error rate on test set: 0.020726050134059665


In [12]:
from datasets import load_metric

# Word error rate metric. Its loader script is custom code; passing
# trust_remote_code=True approves it up front so the cell does not stop at an
# interactive "[y/N]" prompt when the notebook is run top to bottom.
# NOTE(review): `load_metric` is deprecated in `datasets`; consider migrating to
# the `evaluate` package.
wer = load_metric("wer", trust_remote_code=True)

  wer = load_metric("wer")


Downloading builder script:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

The repository for wer contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/wer.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


In [13]:
from tqdm.notebook import tqdm

print("Running evaluation...")

# Decoded predictions and references for every sample, in loader order.
all_predictions = []
all_labels = []

for batch in tqdm(test_dataloader):
    # predict using generate
    pixel_values = batch["pixel_values"].to(device)
    outputs = model.generate(pixel_values)

    # decode
    pred_str = processor.batch_decode(outputs, skip_special_tokens=True)
    labels = batch["labels"]
    # restore the pad id where the labels hold -100 so they can be decoded
    labels[labels == -100] = processor.tokenizer.pad_token_id
    label_str = processor.batch_decode(labels, skip_special_tokens=True)

    # add batch to metric
    wer.add_batch(predictions=pred_str, references=label_str)

    # Store predictions and labels
    all_predictions.extend(pred_str)
    all_labels.extend(label_str)



final_score = wer.compute()
print(f"WER:{final_score}")
import sklearn
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# NOTE(review): these scores compare whole decoded lines as if each distinct
# string were a class label, so a line counts as correct only on an exact match.
# Strings that occur only as a reference or only as a prediction have an
# undefined precision/recall; zero_division=0 scores them as 0 (the same value
# the default uses) without emitting a warning for each call.
print(precision_score(all_labels, all_predictions, average='macro', zero_division=0))
print(recall_score(all_labels, all_predictions, average='macro', zero_division=0))
print(f1_score(all_labels, all_predictions, average='macro', zero_division=0))
print(accuracy_score(all_labels, all_predictions))

Running evaluation...


  0%|          | 0/1670 [00:00<?, ?it/s]



WER:0.05723846767446838


  _warn_prf(average, modifier, msg_start, len(result))


0.5871844178400843


  _warn_prf(average, modifier, msg_start, len(result))


0.5857130057117651
0.586205891028793
0.7467235827154947


In [14]:
import pandas as pd

# File names come straight from the transcription table; predictions and
# references are the lists filled by the evaluation loop.
file_names = df['file_name']

# One column per field, in the order they should appear in the CSV.
data = dict(
    file_name=file_names,
    prediction=all_predictions,
    actual_label=all_labels,
)

# Build the table and write it out without the index column.
df_results = pd.DataFrame(data)
df_results.to_csv('/kaggle/working/predictions_vs_labels.csv', index=False)

print("CSV file saved successfully!")

CSV file saved successfully!


In [14]:
from tqdm.notebook import tqdm

print("Running evaluation...")

# Decoded predictions and references for every sample, in loader order.
all_predictions = []
all_labels = []

for batch in tqdm(test_dataloader):
    # Generate token ids for the batch's images on `device`, then decode to text.
    pixel_values = batch["pixel_values"].to(device)
    outputs = model.generate(pixel_values)
    pred_str = processor.batch_decode(outputs, skip_special_tokens=True)

    # Swap -100 back to the pad token id (in place) so the references decode cleanly.
    labels = batch["labels"]
    labels.masked_fill_(labels == -100, processor.tokenizer.pad_token_id)
    label_str = processor.batch_decode(labels, skip_special_tokens=True)

    # Accumulate this batch in the CER metric and keep the decoded strings.
    cer.add_batch(predictions=pred_str, references=label_str)
    all_predictions.extend(pred_str)
    all_labels.extend(label_str)

# Exact-match accuracy: share of lines whose prediction equals its reference.
correct_predictions = sum(
    predicted == expected for predicted, expected in zip(all_predictions, all_labels)
)
accuracy = correct_predictions / len(all_predictions)

final_score = cer.compute()

print(f"CER: {final_score}")
print(f"Accuracy: {accuracy}")


Running evaluation...


  0%|          | 0/1670 [00:00<?, ?it/s]




CER: 0.020726050134059665

Accuracy: 0.7467235827154947


In [None]:
import matplotlib.pyplot as plt
from PIL import Image

# Show every image with its predicted and reference transcription in the title.
for i in range(len(all_predictions)):
    file_name = f"{df['file_name'].iloc[i]}"
    # Apply the same repair IAMDataset.__getitem__ uses: some file names end
    # with jp instead of jpg, which would otherwise fail to open here.
    if file_name.endswith('jp'):
        file_name = file_name + 'g'

    # Load the image; the context manager closes the file handle once the RGB
    # copy has been made.
    with Image.open("/kaggle/input/iam-lines/IAM/image/" + file_name) as handle:
        image = handle.convert("RGB")

    # Create a new figure for each plot
    fig = plt.figure(figsize=(10, 5))

    plt.imshow(image)
    plt.axis('off')  # Hide the axis
    plt.title(f"Prediction:{all_predictions[i]}     Actual:{all_labels[i]}")

    # Display the plot, then release the figure so they do not pile up in
    # memory over the course of the loop.
    plt.show()
    plt.close(fig)

The results of the cell above are available in the `predictions_vs_labels.csv` file.

In [None]:

# NOTE(review): this cell empties all_predictions / all_labels and refills them
# only from the current pred_str / label_str, i.e. whatever those variables hold
# from the most recently executed evaluation step (presumably the final batch).
# The accuracy printed here therefore covers only those entries, and any full
# lists collected earlier are lost - confirm this is intended.
all_predictions = []
all_labels = []
# Store predictions and labels
all_predictions.extend(pred_str)
all_labels.extend(label_str)

# Calculate accuracy outside the loop
correct_predictions = sum([p == l for p, l in zip(all_predictions, all_labels)])
accuracy = correct_predictions / len(all_predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 1.0


In [None]:
# Print the current contents of all_predictions in full.
print(all_predictions)

['that girl who said hullo to him in the garden?']


In [None]:
# Print the current contents of all_labels in full.
print(all_labels)

['that girl who said hullo to him in the garden?']
