In [33]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Reshape, Dense, LSTM, Bidirectional, BatchNormalization, Input, TimeDistributed, Flatten, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
import cv2
import pytesseract

# Ensure Tesseract OCR is correctly configured
pytesseract.pytesseract.tesseract_cmd = "C://Program Files//Tesseract-OCR//tesseract.exe"

In [34]:

# Paths
DATASET_DIR = "E:/Workspace/dataset/dataset/training/prescriptions"
LABELS_FILE = "E:/Workspace/dataset/dataset/training/prescriptions/_annotations.csv"

In [35]:
# Load dataset
labels_df = pd.read_csv(LABELS_FILE)
labels_df.dropna(inplace=True)  # Remove missing values

In [36]:
# OCR Text Extraction Function
def extract_text(image_path):
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    img = cv2.resize(img, (128, 128))
    text = pytesseract.image_to_string(img, config='--psm 6')
    return text.strip()

In [37]:
# Prepare images and text labels
X, y = [], []
for _, row in labels_df.iterrows():
    image_path = os.path.join(DATASET_DIR, row["filename"])
    if os.path.exists(image_path):
        img = load_img(image_path, color_mode='grayscale', target_size=(128, 128))
        img = img_to_array(img) / 255.0
        X.append(img)
        y.append(extract_text(image_path))

X = np.array(X)

In [39]:

# Convert text labels to numerical values
tokenizer = Tokenizer(char_level=True, filters="")
tokenizer.fit_on_texts(y)
y_seq = tokenizer.texts_to_sequences(y)
max_length = max(len(seq) for seq in y_seq)
y_padded = tf.keras.preprocessing.sequence.pad_sequences(y_seq, maxlen=max_length, padding='post')

In [40]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y_padded, test_size=0.2, random_state=42)

In [41]:
# Build CNN+LSTM Model
inputs = Input(shape=(128, 128, 1))

In [42]:
x = Conv2D(32, (3, 3), activation='relu', padding='same')(inputs)
x = MaxPooling2D((2, 2), padding='same')(x)
x = BatchNormalization()(x)

x = Conv2D(64, (3, 3), activation='relu', padding='same')(x)
x = MaxPooling2D((2, 2), padding='same')(x)
x = BatchNormalization()(x)

x = Conv2D(128, (3, 3), activation='relu', padding='same')(x)
x = MaxPooling2D((2, 2), padding='same')(x)
x = BatchNormalization()(x)

In [57]:
x = Flatten()(x)  # Flatten before LSTM
x = Dense(512, activation='relu')(x)
x = Dropout(0.5)(x)

# Ensure the reshaped shape maintains total element count
if 512 % max_length == 0:
    x = Reshape((max_length, 512 // max_length))(x)
else:
    x = Dense(max_length * 128, activation='relu')(x)  # Adjust dimensions
    x = Reshape((max_length, 128))(x)  # Fixed reshape


In [58]:
# Reshape for LSTM
x = Reshape((max_length, -1))(x)

# BiLSTM layers
x = Bidirectional(LSTM(128, return_sequences=True))(x)
x = Bidirectional(LSTM(64, return_sequences=True))(x)

# Output layer
x = TimeDistributed(Dense(len(tokenizer.word_index) + 1, activation='softmax'))(x)
model = Model(inputs, x)

In [59]:
y_train = tf.cast(y_train, dtype=tf.int32)
y_test = tf.cast(y_test, dtype=tf.int32)


In [60]:
def ctc_loss(y_true, y_pred):
    y_true = tf.cast(y_true, dtype=tf.int32)
    batch_len = tf.shape(y_pred)[0]
    input_len = tf.fill([batch_len], tf.shape(y_pred)[1])
    label_len = tf.reduce_sum(tf.cast(tf.not_equal(y_true, 0), dtype=tf.int32), axis=1)

    return tf.nn.ctc_loss(
        labels=y_true,
        logits=y_pred,
        label_length=label_len,
        logit_length=input_len,
        logits_time_major=False,  # Ensure logits are batch-major
        blank_index=len(tokenizer.word_index)  # Adjust blank index
    )


In [61]:
# Compile Model
model.compile(optimizer=Adam(learning_rate=0.001), loss=ctc_loss, metrics=['accuracy'])


In [62]:
print(f"Model output shape: {model.output_shape}")  # Expected: (None, ?, vocab_size)
print(f"y_train shape: {y_train.shape}")  # Expected: (num_samples, ?, vocab_size)


Model output shape: (None, 48, 77)
y_train shape: (7267, 48)


In [63]:
# Training
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=20, batch_size=16
)


Epoch 1/20
[1m455/455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 278ms/step - accuracy: 5.5779e-07 - loss: 160.8454 - val_accuracy: 0.0000e+00 - val_loss: 158.7991
Epoch 2/20
[1m455/455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 265ms/step - accuracy: 8.4708e-06 - loss: 157.9327 - val_accuracy: 0.0000e+00 - val_loss: 158.7981
Epoch 3/20
[1m455/455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 272ms/step - accuracy: 3.3045e-06 - loss: 158.0893 - val_accuracy: 0.0000e+00 - val_loss: 158.7977
Epoch 4/20
[1m455/455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 276ms/step - accuracy: 7.2929e-06 - loss: 157.8031 - val_accuracy: 0.0000e+00 - val_loss: 158.7976
Epoch 5/20
[1m455/455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 277ms/step - accuracy: 9.8342e-07 - loss: 157.7803 - val_accuracy: 0.0000e+00 - val_loss: 158.7975
Epoch 6/20
[1m455/455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 280ms/step - accuracy: 1.3203e-