In [53]:
# Connect to the Colab TPU runtime (when one is attached) and list its cores.
import os
import tensorflow as tf

use_tpu = True #@param {type:"boolean"}

# Read the environment once: COLAB_TPU_ADDR is the variable this notebook uses
# to detect an attached TPU; it is treated as "no TPU" when absent.
colab_tpu_addr = os.environ.get('COLAB_TPU_ADDR')

if use_tpu:
    assert colab_tpu_addr is not None, 'Missing TPU; did you request a TPU in Notebook Settings?'

TPU_ADDRESS = 'grpc://{}'.format(colab_tpu_addr) if colab_tpu_addr is not None else ''

if TPU_ADDRESS:
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=TPU_ADDRESS)
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
else:
    # No address to connect to: skip TPU initialisation instead of handing an
    # empty address to the resolver.
    resolver = None
    print("No TPU address found; continuing without a TPU.")

print("All devices: ", tf.config.list_logical_devices('TPU'))




All devices:  [LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:0', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:1', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:2', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:3', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:4', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:5', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:6', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:7', device_type='TPU')]


In [54]:
!apt-get install poppler-utils -y
!pip install pdf2image


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.1).
0 upgraded, 0 newly installed, 0 to remove and 9 not upgraded.


In [62]:
from pdf2image import convert_from_path
from PIL import Image
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras.applications import VGG16
from keras.models import Sequential, Model
from keras.layers import GlobalAveragePooling2D, Conv2D, MaxPooling2D, Dropout, Flatten, Dense
from tensorflow.keras.utils import to_categorical

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [56]:
# Target size, in pixels, that every rendered PDF page is resized to.
image_height, image_width = 224, 224

In [57]:
def preprocess_pdf_multipage(pdf_path):
    """Render every page of a PDF and resize each page to the target size.

    Args:
        pdf_path: Path of the PDF file to render.

    Returns:
        A list with one resized image per page, in page order. Each image is
        resized to (image_width, image_height).
    """
    # Pages are rendered at 20 dpi before being resized.
    return [
        page.resize((image_width, image_height))
        for page in convert_from_path(pdf_path, dpi=20)
    ]

def preprocess_pdf_singlepage(pdf_path):
    """Render only the first page of a PDF and resize it to the target size.

    Args:
        pdf_path: Path of the PDF file to render.

    Returns:
        The first page as an image resized to (image_width, image_height).

    Raises:
        IndexError: If the conversion yields no pages.
    """
    # Restrict rendering to page 1 so multi-page PDFs are not fully rasterised
    # just to keep their first page.
    first_page_only = convert_from_path(pdf_path, dpi=20, first_page=1, last_page=1)
    return first_page_only[0].resize((image_width, image_height))

def predict_pdf(pdf_path, model):
    """Run the model on the first page of a PDF.

    Args:
        pdf_path: Path of the PDF file to classify.
        model: Object exposing ``predict(batch)``.

    Returns:
        The raw output of ``model.predict`` for a batch holding the single
        preprocessed page. No argmax is applied.
    """
    page_image = preprocess_pdf_singlepage(pdf_path)
    # Add the leading batch axis expected by model.predict.
    batch = np.expand_dims(np.asarray(page_image), axis=0)
    return model.predict(batch)

def predict_pdf_multipage(pdf_path, model):
    """Predict a class index for every page of a PDF.

    Args:
        pdf_path: Path of the PDF file to classify.
        model: Object exposing ``predict(batch)`` that returns one row of
            scores per input image.

    Returns:
        A list with one predicted class index (argmax of the scores) per page,
        in page order. An empty list when the PDF yields no pages.
    """
    pages = preprocess_pdf_multipage(pdf_path)
    if not pages:
        return []

    # Stack all pages into one batch so the model is invoked once instead of
    # once per page.
    batch = np.stack([np.asarray(page) for page in pages])
    scores = model.predict(batch)
    return list(np.argmax(scores, axis=1))

def predict_image(preprocessed_image, model):
    """Run the model on one already-preprocessed image.

    Args:
        preprocessed_image: A single image (array-like or image object that
            numpy can convert to an array), without a batch axis.
        model: Object exposing ``predict(batch)``.

    Returns:
        The raw output of ``model.predict`` for a batch holding that single
        image. No argmax is applied, and input-shape validation is left to the
        model itself.
    """
    # Add the leading batch axis expected by model.predict.
    batch = np.expand_dims(np.asarray(preprocessed_image), axis=0)
    return model.predict(batch)

In [58]:
def _load_split(xlsx_path):
    """Read the first sheet of a workbook and return (frame, paths, labels)."""
    frame = pd.read_excel(xlsx_path, sheet_name=0)
    return frame, frame["path"].tolist(), np.array(frame["label"].tolist())


# Train/validation PDFs and their labels.
df, pdf_paths, labels = _load_split(r'/content/drive/MyDrive/dataset/extract_dfp/dataset.xlsx')

# Held-out test PDFs and their labels (df ends up bound to the test frame).
df, test_pdf_paths, test_labels = _load_split(r'/content/drive/MyDrive/dataset/extract_dfp/dataset_test.xlsx')

In [59]:
# Build the train/validation image tensor: one first-page image per PDF.

# One sample per PDF file, three colour channels per image.
num_samples = len(pdf_paths)
num_channels = 3

# Pre-allocate the whole tensor, then fill it row by row.
images = np.zeros((num_samples, image_height, image_width, num_channels))

for idx, path in enumerate(pdf_paths):
    images[idx] = np.array(preprocess_pdf_singlepage(path))

# Sanity check: expect (num_samples, image_height, image_width, num_channels).
print(images.shape)


(468, 224, 224, 3)


In [79]:
# One-hot encode the labels with scikit-learn, then split into train/validation.
oh_encoder = OneHotEncoder()
oh_label = oh_encoder.fit_transform(labels.reshape(-1,1))
# .toarray() already yields a dense numpy array; no round trip through a
# TensorFlow tensor is needed.
one_hot_labels = oh_label.toarray()

# 80/20 split with a fixed seed so the partition is reproducible.
train_images, val_images, train_labels, val_labels = train_test_split(images, one_hot_labels, test_size=0.2, random_state=42)

# Verify the shapes of the training and validation sets
print("Train Images Shape:", train_images.shape)
print("Train Labels Shape:", train_labels.shape)
print("Validation Images Shape:", val_images.shape)
print("Validation Labels Shape:", val_labels.shape)

print("label type:", type(one_hot_labels))

Train Images Shape: (374, 224, 224, 3)
Train Labels Shape: (374, 3)
Validation Images Shape: (94, 224, 224, 3)
Validation Labels Shape: (94, 3)
label type: <class 'numpy.ndarray'>


In [80]:
# Build the test image tensor: one first-page image per test PDF.

# One sample per test PDF file, three colour channels per image.
test_num_samples = len(test_pdf_paths)
num_channels = 3

# Pre-allocate the whole tensor, then fill it row by row.
test_images = np.zeros((test_num_samples, image_height, image_width, num_channels))

for idx, path in enumerate(test_pdf_paths):
    test_images[idx] = np.array(preprocess_pdf_singlepage(path))

# Sanity check: expect (test_num_samples, image_height, image_width, num_channels).
print(test_images.shape)


(22, 224, 224, 3)


In [81]:
# One-hot encode the test labels with the encoder already fitted on the
# train/validation labels, so the column order matches.
test_oh_label = oh_encoder.transform(test_labels.reshape(-1,1))
# .toarray() already yields a dense numpy array; no round trip through a
# TensorFlow tensor is needed.
test_oh_label_tf = test_oh_label.toarray()

# Verify the shape of the test label matrix
print("Test Labels Shape:", test_oh_label_tf.shape)

Test Labels Shape: (22, 3)


In [85]:
# Transfer learning: frozen ImageNet VGG16 feature extractor plus a small
# trainable classification head.
# NOTE(review): the model is built outside any tf.distribute strategy scope, so
# the TPU initialised earlier does not appear to be used for training - confirm.
# NOTE(review): images are fed as raw pixel values; no VGG16 preprocess_input
# call is visible in this notebook - confirm whether that is intended.

# Load the pretrained VGG16 model (excluding the top fully-connected layers).
# Keras expects input_shape as (height, width, channels).
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(image_height, image_width, 3))

# Freeze the weights of the base model
base_model.trainable = False

# Add new classification layers on top of the base model
x = base_model.output
x = Conv2D(64, (3, 3), activation='relu')(x)
x = MaxPooling2D(pool_size=(2, 2))(x)
x = Dropout(0.25)(x)

x = MaxPooling2D(pool_size=(2, 2))(x)
x = Dropout(0.25)(x)

x = GlobalAveragePooling2D()(x)

# Output layer: one unit per one-hot label column.
num_classes = train_labels.shape[1]
predictions = Dense(num_classes, activation='softmax')(x)

# Create the final model
model = Model(inputs=base_model.input, outputs=predictions)

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model; keep the History object so the per-epoch metrics stay
# available instead of being echoed as a bare repr.
history = model.fit(train_images, train_labels, epochs=10, batch_size=32, validation_data=(val_images, val_labels))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fa28bf72ef0>

In [88]:
# Score the trained model on the held-out test set (one-hot labels).
test_loss, test_accuracy = model.evaluate(test_images, test_oh_label_tf)
for caption, value in (("Test Loss:", test_loss), ("Test Accuracy:", test_accuracy)):
    print(caption, value)

Test Loss: 2.739954652497545e-05
Test Accuracy: 1.0


In [86]:
# Persist the trained model next to the dataset files.
model_save_path = r'/content/drive/MyDrive/dataset/extract_dfp/model.h5'
model.save(model_save_path)