In [1]:
import sys
import os
import torch
import torch.nn as nn
import torch.optim as optim
from datetime import datetime

# Make the project-local `utils` package importable from this notebook.
# The membership check keeps the cell idempotent: re-running it no longer appends
# the same directory to sys.path over and over.
PROJECT_ROOT = os.path.abspath("D:\\burtm\\Visual_studio_code\\PD_related_projects")
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from utils.model_utils import get_model, get_trainable_layers
from utils.data_loading import get_dataloaders
from utils.utils_transforms import get_transform  
from utils.training_utils import fine_tune_last_n_layers, train_model, get_criterion, get_optimizer, get_scheduler

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import accelerate
import peft
import bitsandbytes

In [3]:
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, PreTrainedModel, PretrainedConfig
from PIL import Image
import requests

# Load the processor published with the TrOCR small stage-1 checkpoint.
# use_fast=False is passed through unchanged from the original call.
processor = TrOCRProcessor.from_pretrained(
    "microsoft/trocr-small-stage1",
    use_fast=False,
)

In [4]:
# Load the full encoder-decoder model from the stage-1 checkpoint.
# Alternative checkpoint (currently disabled): 'microsoft/trocr-small-handwritten'
model = VisionEncoderDecoderModel.from_pretrained(
    "microsoft/trocr-small-stage1"
)

Config of the encoder: <class 'transformers.models.deit.modeling_deit.DeiTModel'> is overwritten by shared encoder config: DeiTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 384,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "model_type": "deit",
  "num_attention_heads": 6,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": true,
  "transformers_version": "4.48.2"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "relu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 384,
  "d_model": 256,
  "decoder_attention_heads": 8,
  "decoder_ffn_dim": 1024,
  "decode

In [5]:
# Keep only the vision encoder of the encoder-decoder model (a DeiTModel, per the
# structure printed further down); the text decoder is dropped.
# NOTE: this rebinds `model`, so the cell is not idempotent -- running it a second time
# would descend into the DeiTModel's inner `encoder` attribute instead of being a no-op.
model=model.encoder

In [9]:
# Show the module tree of the extracted encoder (the recorded output lists 12 DeiTLayer blocks).
print(model)

DeiTModel(
  (embeddings): DeiTEmbeddings(
    (patch_embeddings): DeiTPatchEmbeddings(
      (projection): Conv2d(3, 384, kernel_size=(16, 16), stride=(16, 16))
    )
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): DeiTEncoder(
    (layer): ModuleList(
      (0-11): 12 x DeiTLayer(
        (attention): DeiTSdpaAttention(
          (attention): DeiTSdpaSelfAttention(
            (query): Linear(in_features=384, out_features=384, bias=True)
            (key): Linear(in_features=384, out_features=384, bias=True)
            (value): Linear(in_features=384, out_features=384, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): DeiTSelfOutput(
            (dense): Linear(in_features=384, out_features=384, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (intermediate): DeiTIntermediate(
          (dense): Linear(in_features=384, out_features=1536, bias=True)
          (intermediate_act_fn)

In [10]:
# `model_utils` is otherwise only bound as a name by the module-reload helper cell that
# sits at the end of the notebook, so on a fresh top-to-bottom run this cell used to
# fail with a NameError. Importing it here makes the cell self-sufficient.
from utils import model_utils

# Wrap the DeiT encoder in the project's TruncatedDeiT module. Per the structure printed
# by the next cell, the wrapper keeps the embeddings and 10 of the 12 DeiTLayer blocks.
# NOTE: `model` is rebound, so re-running this cell would wrap an already-truncated model.
model = model_utils.TruncatedDeiT(model)


In [11]:
# Show the module tree again to confirm the truncation (the recorded output lists 10 DeiTLayer blocks).
print(model)

TruncatedDeiT(
  (embeddings): DeiTEmbeddings(
    (patch_embeddings): DeiTPatchEmbeddings(
      (projection): Conv2d(3, 384, kernel_size=(16, 16), stride=(16, 16))
    )
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): ModuleList(
    (0-9): 10 x DeiTLayer(
      (attention): DeiTSdpaAttention(
        (attention): DeiTSdpaSelfAttention(
          (query): Linear(in_features=384, out_features=384, bias=True)
          (key): Linear(in_features=384, out_features=384, bias=True)
          (value): Linear(in_features=384, out_features=384, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
        )
        (output): DeiTSelfOutput(
          (dense): Linear(in_features=384, out_features=384, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
      (intermediate): DeiTIntermediate(
        (dense): Linear(in_features=384, out_features=1536, bias=True)
        (intermediate_act_fn): GELUActivation()
      )
      (output): DeiTOutput(


In [None]:
# Sanity check: push one random 384x384 RGB image (batch size 1) through the truncated
# encoder and look at the shape of the result.
#
# The dummy input is created on whatever device the model's parameters currently live
# on, so the cell works both before and after `model.to(device)` has been executed
# (a CPU-only tensor would raise a device-mismatch error once the model is on the GPU).
model_device = next(model.parameters()).device
input_tensor = torch.randn(1, 3, 384, 384, device=model_device)

# Only the shape is inspected, so there is no need to record an autograd graph.
with torch.no_grad():
    output = model(input_tensor)

# The recorded run printed torch.Size([1, 578, 384]).
print("Output dimension:", output.shape)

Output dimension: torch.Size([1, 578, 384])
tensor([[[-0.5542, -0.0727, -3.7304,  ...,  0.2753, -4.6295, -1.5863],
         [ 2.3161, -3.4154, -5.5582,  ...,  2.1043, -5.9251, -0.5948],
         [ 0.7380, -0.9215, -5.0391,  ...,  1.7634, -1.4387,  2.2261],
         ...,
         [ 1.5165, -1.5304,  0.0706,  ..., -1.8089, -3.3499, -0.4321],
         [-4.1813,  1.7628, -5.0856,  ...,  0.9802, -1.0009,  2.6266],
         [-1.3333,  1.1212, -4.7238,  ..., -1.5523,  0.1651,  0.9828]]],
       device='cuda:0', grad_fn=<AddBackward0>)


# standard

How long does it take to fine-tune the whole model? Roughly 60 s per iteration, i.e. about 1 h per epoch.

In [13]:
# Run configuration for the cells below.
batch_size = 32
N_max = 282  # not referenced by the cells visible in this notebook
input_filename = "icdar_train_df_cc_5patches_perName.csv"  # CSV listing the patches to encode

# The Hugging Face processor doubles as the image transform; the `huggingface` flag
# tells the extraction loop to call it with the processor calling convention.
transform = processor
huggingface = True

In [14]:
# Select the compute device (GPU when available, CPU otherwise), report the choice,
# and move the model onto it.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device is: ",device)
model = model.to(device)

Device is:  cuda


In [15]:
import pandas as pd

# Build the path of the preprocessed patch table inside the project tree and load it.
# `input_filename` comes from the configuration cell.
source_path = "D:\\burtm\\Visual_studio_code\\PD_related_projects"
file_path = f"{source_path}\\outputs\\preprocessed_data\\{input_filename}"
train_df = pd.read_csv(file_path)

In [16]:
import shutil

# Output location: a sub-folder of the ICDAR dataset directory that will hold the HDF5
# file with the representations, a copy of the source CSV, and a small log file.
icdar_path = "D:\\download\\PD project\\datasets\\ICDAR 2013 - Gender Identification Competition Dataset\\"
directory_name = "extracted_representations_full"
save_path = icdar_path + directory_name
log_file_name = "log.txt"

# exist_ok=True replaces the separate "exists?" check: it is a no-op when the folder is
# already there and avoids the gap between checking and creating.
os.makedirs(save_path, exist_ok=True)

# (Re)create the log file with a header describing the folder contents.
# Mode "w" truncates any log left over from a previous run.
log_file_path = os.path.join(save_path, log_file_name)
with open(log_file_path, "w") as log_file:
    log_file.write("Log data about the files in the folder:\n")
    log_file.write("Each key in the hdf5 file is the full output of the trocr small stage 1 model truncated at the 10th transformer block \n")
    log_file.write("The dataset corresponding to the data is the one reported below. The index of the file corresponds to the index in the df\n")

# Keep a copy of the CSV next to the representations so the HDF5 keys can always be
# matched back to the rows they were computed from. `file_path` is defined by the cell
# that loads `train_df`.
source_file_path = file_path
destination_file_path = os.path.join(save_path, os.path.basename(file_path))
shutil.copy(source_file_path, destination_file_path)

# Record the copy in the log only after it has succeeded.
with open(log_file_path, "a") as log_file:
    log_file.write(f"Copied file: {os.path.basename(file_path)}\n")



In [21]:
import h5py
import numpy as np

# Encode every patch listed in `train_df` with the truncated encoder and store the result
# in an HDF5 file, one dataset per row, keyed by the zero-padded DataFrame index.
#
# Changes with respect to the first version of this cell:
#   * inference runs under torch.no_grad(), so no autograd graph is built per image;
#   * keys that already exist in the file are skipped, so the cell can be re-run or
#     resumed after an interruption (create_dataset raises on an existing name);
#   * each source image is opened in a `with` block so its file handle is released;
#   * the redundant f.close() after the `with` block is gone (the context manager
#     already closes the file).
model.eval()

with h5py.File(save_path+"\\representations.h5", "a") as f, torch.no_grad():
    for index, t in train_df.iterrows():
        # Dataset name for this row, e.g. index 42 -> "000042" (requires an integer index).
        key = f"{index:06d}"
        if key in f:
            # Already stored by a previous (possibly interrupted) run.
            continue

        # Load the page image and cut out the patch described by the row's bounding box.
        with Image.open(t['file_name']) as image:
            patch = image.convert("RGB").crop((t['x'], t['y'], t['x2'], t['y2']))

        if huggingface:
            # `transform` is a Hugging Face processor here: it returns a dict whose
            # 'pixel_values' entry carries a leading batch dimension, removed below.
            inputs = transform(images=patch, return_tensors="pt")
            patch = inputs['pixel_values'].squeeze(0)
        elif transform:
            # Otherwise `transform` is called directly on the PIL patch
            # (assumed to return a tensor -- TODO confirm).
            patch = transform(patch)

        patch = patch.to(device)
        # The model expects a batch dimension; add it back for the forward pass.
        output = model(patch.unsqueeze(0))

        # Drop the batch dimension and store the array gzip-compressed.
        rep_np = output.squeeze(0).detach().cpu().numpy()
        f.create_dataset(key, data=rep_np, compression="gzip")

        if index % 100 == 0:
            print(f"Processed {index} images, out of {len(train_df)}")

Processed 0 images, out of 5640
Processed 100 images, out of 5640
Processed 200 images, out of 5640
Processed 300 images, out of 5640
Processed 400 images, out of 5640
Processed 500 images, out of 5640
Processed 600 images, out of 5640
Processed 700 images, out of 5640
Processed 800 images, out of 5640
Processed 900 images, out of 5640
Processed 1000 images, out of 5640
Processed 1100 images, out of 5640
Processed 1200 images, out of 5640
Processed 1300 images, out of 5640
Processed 1400 images, out of 5640
Processed 1500 images, out of 5640
Processed 1600 images, out of 5640
Processed 1700 images, out of 5640
Processed 1800 images, out of 5640
Processed 1900 images, out of 5640
Processed 2000 images, out of 5640
Processed 2100 images, out of 5640
Processed 2200 images, out of 5640
Processed 2300 images, out of 5640
Processed 2400 images, out of 5640
Processed 2500 images, out of 5640
Processed 2600 images, out of 5640
Processed 2700 images, out of 5640
Processed 2800 images, out of 56

In [24]:
# Spot-check the stored file: read back the full dataset saved under key "000042"
# (the row with DataFrame index 42) and print it.
representations_file = save_path+"\\representations.h5"
with h5py.File(representations_file, "r") as f:
    stored_dataset = f["000042"]
    rep_42 = stored_dataset[:]
print(rep_42)


[[ 3.7826767  -0.99325603 -4.8960156  ... -0.38691512 -2.6435657
  -0.15363264]
 [ 3.6392565  -3.4912357  -6.4768376  ...  1.6145078  -5.1998186
  -0.57736003]
 [-2.5747104   0.26719785 -4.02846    ... -1.2227541  -4.754792
   1.9072549 ]
 ...
 [-2.774644    1.8033377  -5.8980827  ...  4.430866   -0.12699604
   2.834492  ]
 [-0.9595611  -1.8759494  -2.069882   ... -4.8516946  -1.341036
   3.092637  ]
 [-1.2976632   0.37103814 -4.6950593  ...  1.7340031  -1.1925094
   1.6214855 ]]


# easy access

In [8]:
def reload_modules():
    """Import and reload the project's `utils` submodules.

    Lets edits made to the helper modules on disk take effect without restarting
    the kernel.

    Returns:
        A 6-tuple of module objects, in this order:
        (data_loading, visualization, dataframes, utils_transforms,
        training_utils, model_utils).
    """
    import importlib

    # Order in which the modules are imported and returned.
    import_order = (
        "data_loading",
        "visualization",
        "dataframes",
        "utils_transforms",
        "training_utils",
        "model_utils",
    )
    # Order in which they are reloaded: model_utils comes before training_utils
    # (presumably because training_utils depends on it -- confirm).
    reload_order = (
        "data_loading",
        "visualization",
        "dataframes",
        "utils_transforms",
        "model_utils",
        "training_utils",
    )

    loaded = {}
    for short_name in import_order:
        loaded[short_name] = importlib.import_module(f"utils.{short_name}")

    for short_name in reload_order:
        importlib.reload(loaded[short_name])

    return tuple(loaded[short_name] for short_name in import_order)


# Bind the freshly reloaded modules to short names used elsewhere in the notebook.
data_loading, visualization, dataframes, u_transforms, training_utils, model_utils = reload_modules()