In [6]:
import os
import numpy as np
from tqdm import tqdm
from pdf2image import convert_from_path
import cv2
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import gc
from random import shuffle
import pickle

In [4]:



def encode_label(label):
    """Return the one-hot encoding of ``label``.

    The hot position is the label's index in the module-level
    ``TARGET_LABELS`` list and the vector length is ``len(TARGET_LABELS)``.
    ``list.index`` raises ``ValueError`` if ``label`` is not in that list.
    """
    class_index = TARGET_LABELS.index(label)
    class_count = len(TARGET_LABELS)
    return to_categorical(class_index, num_classes=class_count)

def extract_label_from_filename(filename):
    """Return the target label embedded in ``filename``, or ``None``.

    A name is accepted only when splitting it on ``-`` yields exactly two
    pieces and the second piece ends with ``.pdf``. The candidate label is
    that second piece with every ``.pdf`` occurrence removed; it is returned
    only if it appears in the module-level ``TARGET_LABELS``.
    """
    pieces = filename.split('-')
    if len(pieces) != 2:
        return None

    tail = pieces[1]
    if not tail.endswith('.pdf'):
        return None

    candidate = tail.replace('.pdf', '')
    return candidate if candidate in TARGET_LABELS else None

def extract_images_from_path(pdf_path):
    """Render the PDF at ``pdf_path`` in grayscale via ``convert_from_path``.

    Whatever ``convert_from_path`` produces is returned unchanged.
    NOTE(review): the original comment assumed one page per PDF, but nothing
    here enforces that; callers iterate over every returned image.
    """
    return convert_from_path(pdf_path=pdf_path, grayscale=True)

def pil_images_to_numpy(images):
    """Convert every item of ``images`` with ``np.array``.

    Returns a plain Python list holding one NumPy array per input item, in
    the same order. An empty input yields an empty list.
    """
    return [np.array(picture) for picture in images]

def preprocess_images(images, size):
    """Resize each image to ``size`` and divide its pixel values by 255.0.

    Args:
        images: iterable of image arrays accepted by ``cv2.resize``.
        size: target size forwarded as the second argument of ``cv2.resize``.

    Returns:
        A single NumPy array built from the list of processed images.
    """
    scaled = [cv2.resize(frame, size) / 255.0 for frame in images]
    return np.array(scaled)

def create_training_data(base_dir, img_size):
    """Build a shuffled list of ``[image, one_hot_label]`` pairs from PDFs.

    Walks ``base_dir`` recursively. For each ``.pdf`` file whose name yields a
    label via ``extract_label_from_filename``, the file is converted to
    images, preprocessed to ``img_size``, and every resulting image is paired
    with the one-hot encoding of that label. Files without a recognised label
    are skipped. Each processed path is printed.

    Returns:
        The list of pairs after an in-place ``shuffle``.
    """
    samples = []
    for dirpath, _, filenames in tqdm(os.walk(base_dir)):
        for name in filenames:
            if not name.endswith('.pdf'):
                continue

            label = extract_label_from_filename(name)
            if label is None:
                continue

            pdf_path = os.path.join(dirpath, name)
            print(pdf_path)

            pages = pil_images_to_numpy(extract_images_from_path(pdf_path))
            prepared = preprocess_images(pages, img_size)
            target = encode_label(label)
            # Every image from the same PDF shares that PDF's label.
            samples.extend([page, target] for page in prepared)

    shuffle(samples)
    return samples



In [10]:
# Model parameters
# IMG_SIZE is forwarded to preprocess_images as the cv2.resize target size.
IMG_SIZE = (224, 317)
BASE_DIR = r"D:\AIL transfer\Anyer\2025\scan now 5"
MODEL_NAME = 'AIL_detect_model.keras'
# Define target labels
TARGET_LABELS = ["PK", "BA", "PDL", "SIP", "SPJBTL", "PRNYT", "FOTO", "TOKEN", "PMHN", "KDBOOK", "KK", "KTPNPWP"]
# On-disk cache of the preprocessed samples (same file the pickle cells use).
DATA_CACHE = "datajan2025.pkl"

# Create (or reload) training data.
# Previously both lines that built `training_data` were commented out, so this
# cell only worked when the variable was left over in the kernel. Loading the
# cache when it exists, and building it otherwise, makes the cell runnable on
# a fresh kernel without repeating the slow PDF conversion every time.
if os.path.exists(DATA_CACHE):
    # pickle can execute arbitrary code on load: only use a cache file that
    # this notebook produced itself.
    with open(DATA_CACHE, "rb") as cache_file:
        training_data = pickle.load(cache_file)
else:
    training_data = create_training_data(BASE_DIR, IMG_SIZE)
    # To add scans from another folder, extend the list with a further
    # create_training_data(<other dir>, IMG_SIZE) call before saving.
    with open(DATA_CACHE, "wb") as cache_file:
        pickle.dump(training_data, cache_file)
print(len(training_data))

7746


In [9]:


# Persist the in-memory samples so a later session can reload them instead of
# re-converting every PDF.
with open("datajan2025.pkl", "wb") as cache_file:
    pickle.dump(training_data, cache_file)

In [5]:
# Reload the cached samples. Requires the import cell to have run first
# (`pickle` must be defined). pickle can execute arbitrary code on load, so
# only open a cache file this notebook wrote itself.
with open("datajan2025.pkl", "rb") as cache_file:
    training_data = pickle.load(cache_file)

NameError: name 'pickle' is not defined

In [10]:
# NOTE(review): in file order this cell comes before the cell that assigns
# `model`, so on a fresh kernel it raises NameError. The training cell below
# already ends with the same save call; run this only after training.
model.save(MODEL_NAME)

In [11]:


class CustomEarlyStopping(tf.keras.callbacks.Callback):
    """Stop training once validation accuracy and loss both meet thresholds.

    Args:
        acc_threshold: stop only if ``val_accuracy`` is at least this value.
        loss_threshold: stop only if ``val_loss`` is at most this value.
    """

    def __init__(self, acc_threshold=0.98, loss_threshold=0.08):
        super().__init__()
        self.acc_threshold = acc_threshold
        self.loss_threshold = loss_threshold

    def on_epoch_end(self, epoch, logs=None):
        """Set ``self.model.stop_training`` when both thresholds are met.

        Does nothing when either ``val_accuracy`` or ``val_loss`` is missing
        from ``logs`` (for example when training without validation data).
        """
        # `logs` defaults to None in the signature; normalise it so the
        # lookups below cannot raise AttributeError.
        logs = logs or {}
        acc = logs.get("val_accuracy")
        loss = logs.get("val_loss")
        if acc is None or loss is None:
            return

        if acc >= self.acc_threshold and loss <= self.loss_threshold:
            print(f"\n🚨 Early stopping at epoch {epoch + 1}: val_accuracy = {acc:.4f}, val_loss = {loss:.4f}")
            self.model.stop_training = True



# Separate features (X) and labels (Y).
# IMG_SIZE is the (width, height) handed to cv2.resize, so each stored image
# is (height, width) = (IMG_SIZE[1], IMG_SIZE[0]); a trailing channel axis of
# size 1 is added for Conv2D.
# Building X directly as float32 halves its memory footprint compared with
# the float64 produced by the `/ 255.0` preprocessing, and float32 is the
# default Keras float type.
X = np.array([item[0] for item in training_data], dtype=np.float32).reshape(-1, IMG_SIZE[1], IMG_SIZE[0], 1)
Y = np.array([item[1] for item in training_data])

# Define the CNN model.
# The input shape is declared with an explicit Input object instead of the
# `input_shape=` argument on the first layer.
model = Sequential([
    tf.keras.Input(shape=(IMG_SIZE[1], IMG_SIZE[0], 1)),
    Conv2D(32, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(len(TARGET_LABELS), activation='softmax')  # Number of classes = length of TARGET_LABELS
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model; CustomEarlyStopping ends the run as soon as both of its
# validation thresholds are met. The samples were already shuffled by
# create_training_data before being split off for validation here.
model.fit(X, Y, epochs=20, validation_split=0.2, batch_size=32, callbacks=[CustomEarlyStopping()])

# Save the model
model.save(MODEL_NAME)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
🚨 Early stopping at epoch 5: val_accuracy = 0.9890, val_loss = 0.0761


In [None]:
# Separate features (X) and labels (Y); same construction as the training
# cell: one (IMG_SIZE[1], IMG_SIZE[0], 1) array per sample.
X = np.array([item[0] for item in training_data]).reshape(-1, IMG_SIZE[1], IMG_SIZE[0], 1)
Y = np.array([item[1] for item in training_data])

# Load the previously saved model from MODEL_NAME (no new model is defined here)
model = load_model(MODEL_NAME)

# Compile the model
# NOTE(review): compiling again after load_model presumably replaces whatever
# optimizer state was restored from the file — confirm this reset is intended.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model for 5 more epochs (no early-stopping callback here)
# NOTE(review): this cell does not save the model after the extra training.
model.fit(X, Y, epochs=5, validation_split= 0.2, batch_size=32 )