In [11]:
import os
import pandas as pd

In [12]:
# Read the labels file into one dataframe row per labelled text line.
file_path = "labels/lines.txt"

lineID = []
graylevel = []
transcription = []

with open(file_path, 'r') as labels:
    for line_number, raw_line in enumerate(labels, start=1):
        entry = raw_line.strip()
        # Blank lines and '#' comment lines carry no label data; indexing
        # into their fields would either raise or create garbage rows.
        if not entry or entry.startswith('#'):
            continue

        # Split off only the eight leading metadata fields so that any
        # whitespace inside the transcription field is kept, not deleted.
        fields = entry.split(maxsplit=8)
        if len(fields) < 9:
            raise ValueError(
                f"{file_path}:{line_number}: expected 9 whitespace-separated "
                f"fields, got {len(fields)}"
            )

        lineID.append(fields[0])
        # Stored as int straight away so every later consumer gets a number.
        graylevel.append(int(fields[2]))
        # '|' separates the words of a transcription in the labels file.
        transcription.append(fields[8].replace('|', ' '))

# Build the frame in one call; column order matches the original cell.
dataframe = pd.DataFrame({
    'line_id': lineID,
    'transcription': transcription,
    'graylevel': graylevel,
})


In [13]:
# Preview the first 10 rows of the labels dataframe.
dataframe.head(10)

Unnamed: 0,line_id,transcription,graylevel
0,a01-000u-00,A MOVE to stop Mr. Gaitskell from,154
1,a01-000u-01,nominating any more Labour life Peers,156
2,a01-000u-02,is to be made at a meeting of Labour,157
3,a01-000u-03,MPs tomorrow . Mr. Michael Foot has,156
4,a01-000u-04,put down a resolution on the subject,157
5,a01-000u-05,and he is to be backed by Mr. Will,156
6,a01-000u-06,"Griffiths , MP for Manchester Exchange .",159
7,a01-000x-00,A MOVE to stop Mr. Gaitskell from nominating,182
8,a01-000x-01,any more Labour life Peers is to be made at a,181
9,a01-000x-02,meeting of Labour 0MPs tomorrow . Mr. Michael,181


In [14]:
directory = "data"
directory_list = os.listdir(directory)

# Collect every PNG stored two directory levels below `directory`
# (data/<subdirectory>/<sub_subdirectory>/<image>).
path = []

for subdirectory in directory_list:
    subdirectory_path = f"{directory}/{subdirectory}"
    # A stray file at this level (e.g. OS metadata) would make os.listdir raise.
    if not os.path.isdir(subdirectory_path):
        continue
    for sub_subdirectory in os.listdir(subdirectory_path):
        sub_subdirectory_path = f"{subdirectory_path}/{sub_subdirectory}"
        if not os.path.isdir(sub_subdirectory_path):
            continue
        for image in os.listdir(sub_subdirectory_path):
            # Keep only PNG images so unrelated files never become rows.
            if image.lower().endswith('.png'):
                path.append(f"{sub_subdirectory_path}/{image}")

# Build the frame in one step instead of filling an empty DataFrame.
path_dataframe = pd.DataFrame({'path': path})
path_dataframe.head()


Unnamed: 0,path
0,data/g02/g02-062/g02-062-04.png
1,data/g02/g02-062/g02-062-07.png
2,data/g02/g02-062/g02-062-08.png
3,data/g02/g02-062/g02-062-09.png
4,data/g02/g02-062/g02-062-05.png


In [15]:
# Order the image paths alphabetically and renumber the rows from zero.

path_dataframe = path_dataframe.sort_values(by='path').reset_index(drop=True)
path_dataframe.head()

Unnamed: 0,path
0,data/a01/a01-000u/a01-000u-00.png
1,data/a01/a01-000u/a01-000u-01.png
2,data/a01/a01-000u/a01-000u-02.png
3,data/a01/a01-000u/a01-000u-03.png
4,data/a01/a01-000u/a01-000u-04.png


In [16]:
# Merge labels and image paths on the line id rather than by row position.
# A positional concat silently pairs the wrong text with an image as soon as
# the two tables differ in length or ordering.

# The file name without its extension is the line id, e.g.
# data/a01/a01-000u/a01-000u-00.png -> a01-000u-00
path_with_id = path_dataframe.assign(
    line_id=path_dataframe['path'].map(
        lambda image_path: os.path.splitext(os.path.basename(image_path))[0]
    )
)

# how='left' keeps every label row; a label without an image ends up with a
# missing path, which a null check on the result will reveal.
# validate='one_to_one' raises if a line id occurs twice on either side.
# Dropping a pre-existing 'path' column keeps the cell safe to re-run.
dataframe = dataframe.drop(columns=['path'], errors='ignore').merge(
    path_with_id, on='line_id', how='left', validate='one_to_one'
)
dataframe.tail(10)

Unnamed: 0,line_id,transcription,graylevel,path
13343,r06-137-06,"absolutely beastly , and I can't bear to think",180,data/r06/r06-137/r06-137-06.png
13344,r06-137-07,about it . ' And Philip said : ' But we 've got,185,data/r06/r06-137/r06-137-07.png
13345,r06-137-08,"to think about it , don't you see , because",182,data/r06/r06-137/r06-137-08.png
13346,r06-137-09,"if we don't it 'll just go on and on , don't",182,data/r06/r06-137/r06-137-09.png
13347,r06-137-10,you see ? ',184,data/r06/r06-137/r06-137-10.png
13348,r06-143-00,In the train going back to London I sat,186,data/r06/r06-143/r06-143-00.png
13349,r06-143-01,beside # Catherine . She had the stories open,185,data/r06/r06-143/r06-143-01.png
13350,r06-143-02,"in front of her , but she said : ' Philip 's a...",186,data/r06/r06-143/r06-143-02.png
13351,r06-143-03,I wish I went to that school . Did you notice,184,data/r06/r06-143/r06-143-03.png
13352,r06-143-04,that girl who said hullo to him in the garden ?,186,data/r06/r06-143/r06-143-04.png


In [17]:
# Remove the line_id column from the dataframe.

dataframe = dataframe.drop(columns=['line_id'])
dataframe.head()

Unnamed: 0,transcription,graylevel,path
0,A MOVE to stop Mr. Gaitskell from,154,data/a01/a01-000u/a01-000u-00.png
1,nominating any more Labour life Peers,156,data/a01/a01-000u/a01-000u-01.png
2,is to be made at a meeting of Labour,157,data/a01/a01-000u/a01-000u-02.png
3,MPs tomorrow . Mr. Michael Foot has,156,data/a01/a01-000u/a01-000u-03.png
4,put down a resolution on the subject,157,data/a01/a01-000u/a01-000u-04.png


In [18]:
# Count the missing values in each column.

dataframe.isna().sum()

transcription    0
graylevel        0
path             0
dtype: int64

In [19]:
# Hold out 20% of the rows for testing, then 20% of the remainder for
# validation; both splits use the same fixed seed.

from sklearn.model_selection import train_test_split

train_and_validation, test = train_test_split(dataframe, test_size=0.2, random_state=42)
train, validation = train_test_split(train_and_validation, test_size=0.2, random_state=42)

# Renumber each split from zero, discarding the original row labels.
train = train.reset_index(drop=True)
validation = validation.reset_index(drop=True)
test = test.reset_index(drop=True)

train.shape, validation.shape, test.shape

((8545, 3), (2137, 3), (2671, 3))

In [21]:
import cv2

def preprocess_image(image_path, graylevel, target_size=(128, 32)):
    """
    Load an image, binarize it with the given gray level and resize it.

    Parameters
    ----------
    image_path : str
        Location of the image file on disk.
    graylevel : int or str
        Threshold handed to cv2.threshold; converted with int(), so numeric
        strings are accepted.
    target_size : tuple of int, optional
        Size handed to cv2.resize as its dsize argument. Defaults to
        (128, 32), the size that was previously hard-coded.

    Returns
    -------
    The resized, thresholded single-channel image as returned by cv2.resize.

    Raises
    ------
    FileNotFoundError
        If OpenCV cannot read `image_path` (missing or unreadable file).
    """

    image = cv2.imread(image_path)
    # cv2.imread signals failure by returning None instead of raising, which
    # would otherwise surface as an opaque error inside cv2.cvtColor.
    if image is None:
        raise FileNotFoundError(f"Could not read image (missing or unreadable): {image_path}")

    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # THRESH_BINARY: pixels above the threshold become 255, all others 0.
    image = cv2.threshold(image, int(graylevel), 255, cv2.THRESH_BINARY)[1]
    # NOTE(review): resizing after thresholding uses cv2.resize's default
    # interpolation, so the result may contain values other than 0 and 255.
    # Confirm whether a strictly binary output is required.
    image = cv2.resize(image, target_size)
    return image

In [29]:
# helper that turns a split's image paths into preprocessed image arrays

def preprocess_dataframe(dataframe):
    """
    Return a preprocessed copy of `dataframe`.

    Each row's 'path' and 'graylevel' are passed to preprocess_image and the
    result is stored in a new 'image' column; the 'path' and 'graylevel'
    columns are then removed.

    The input frame is left untouched: the work happens on a copy, so the
    caller's data is not altered and the function never writes into a frame
    that is itself a slice of another one.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns 'path' and 'graylevel'.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with an added 'image' column and without the
        'path' and 'graylevel' columns.
    """
    processed = dataframe.copy()
    processed['image'] = processed.apply(
        lambda row: preprocess_image(row['path'], row['graylevel']), axis=1
    )
    return processed.drop(columns=['path', 'graylevel'])

In [30]:
# convert graylevel to int in every split so all three carry the same dtype
# (assign returns a new frame instead of writing into the existing one)

train = train.assign(graylevel=train['graylevel'].astype(int))
validation = validation.assign(graylevel=validation['graylevel'].astype(int))
test = test.assign(graylevel=test['graylevel'].astype(int))

In [31]:
# Run the image preprocessing over each of the three splits in turn.

train, validation, test = (
    preprocess_dataframe(split) for split in (train, validation, test)
)

In [35]:
# create a keras functional api model with cnn and rnn layers

from keras.models import Model
from keras.layers import (
    Input, Dense, Conv2D, MaxPooling2D, Dropout, BatchNormalization, Reshape,
    Lambda, Bidirectional, LSTM,
    # Also required by the model-building cell that follows.
    Flatten, Embedding, concatenate,
)


AttributeError: partially initialized module 'keras.src' has no attribute 'utils' (most likely due to a circular import)

In [None]:
    
# Builds, compiles, trains and evaluates a two-input Keras model: a small
# convolutional branch over the image and a bidirectional LSTM branch over an
# integer sequence, concatenated and fed to a softmax classifier.
#
# NOTE(review): this cell has no execution count and relies on names that no
# visible cell defines: height, width, max_sequence_length, vocab_size,
# embedding_dim, num_classes, train_data, val_data, test_data. It also needs
# Flatten, Embedding and concatenate from keras.layers — TODO confirm they
# are in scope before running.

# Define input shape
# NOTE(review): preprocess_image resizes with cv2.resize(image, (128, 32)),
# presumably width=128 and height=32 — confirm and define height/width.
input_shape = (height, width, 1)  # Assuming images are grayscale

# Build the CRNN model using the Functional API
# Three 3x3 ReLU convolutions (32, 64, 128 filters), each followed by 2x2
# max-pooling, then flattened into a single feature vector per image.
input_layer = Input(shape=input_shape, name='input_layer')
conv1 = Conv2D(32, (3, 3), activation='relu')(input_layer)
maxpool1 = MaxPooling2D(pool_size=(2, 2))(conv1)
conv2 = Conv2D(64, (3, 3), activation='relu')(maxpool1)
maxpool2 = MaxPooling2D(pool_size=(2, 2))(conv2)
conv3 = Conv2D(128, (3, 3), activation='relu')(maxpool2)
maxpool3 = MaxPooling2D(pool_size=(2, 2))(conv3)
flatten = Flatten()(maxpool3)

# CNN part
cnn = Model(inputs=input_layer, outputs=flatten, name='cnn_model')

# RNN part
# NOTE(review): this branch embeds a separate integer sequence input; the
# LSTM does not consume the CNN features. Confirm this is the intended design.
sequence_input = Input(shape=(max_sequence_length,), dtype='int32', name='sequence_input')
embedded_sequences = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_length)(sequence_input)
lstm = Bidirectional(LSTM(64))(embedded_sequences)

# Combine CNN and RNN
combined = concatenate([cnn.output, lstm])
dense1 = Dense(256, activation='relu')(combined)
batch_norm = BatchNormalization()(dense1)
output_layer = Dense(num_classes, activation='softmax')(batch_norm)

# Create the final model
model = Model(inputs=[cnn.input, sequence_input], outputs=output_layer)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
# NOTE(review): 'encoded_text' is passed both as a model input and as the
# target, so the labels are visible to the network. No visible cell creates an
# 'encoded_text' column, and the earlier splits are named train / validation /
# test rather than train_data / val_data / test_data — confirm.
# NOTE(review): the 'image' column presumably holds one array per row; verify
# that .values yields a batch array Keras accepts.
model.fit([train_data['image'].values, train_data['encoded_text'].values], train_data['encoded_text'].values, epochs=10, batch_size=32, validation_data=([val_data['image'].values, val_data['encoded_text'].values], val_data['encoded_text'].values))

# Evaluate the model on the test set
loss, accuracy = model.evaluate([test_data['image'].values, test_data['encoded_text'].values], test_data['encoded_text'].values)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')