In [6]:
import os
import numpy as np
from tqdm import tqdm
from pdf2image import convert_from_path
import cv2
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import gc
from random import shuffle

In [2]:

# Class names recognised by the classifier. The position of a name in this
# list is the index encode_label() turns into a one-hot vector, and "OTHER"
# is the fallback used by extract_label_from_filename().
TARGET_LABELS = [
    "PK",
    "BA",
    "PDL",
    "SIP",
    "SPJBTL",
    "OTHER",
]

# One-hot encode labels
def encode_label(label):
    """Return the one-hot vector for ``label``.

    The hot position is the label's index in ``TARGET_LABELS`` and the vector
    length is ``len(TARGET_LABELS)``. ``list.index`` raises ``ValueError`` when
    ``label`` is not one of the target labels.
    """
    class_index = TARGET_LABELS.index(label)
    class_count = len(TARGET_LABELS)
    return to_categorical(class_index, num_classes=class_count)

# Function to extract label from filename
def extract_label_from_filename(filename):
    """Derive the class label from a ``<prefix>-<LABEL>.pdf`` file name.

    Args:
        filename: Bare file name (no directory part).

    Returns:
        The text between the single hyphen and the trailing ``.pdf`` when that
        text is one of ``TARGET_LABELS``; ``"OTHER"`` for any other text; and
        ``None`` when the name does not contain exactly one hyphen or does not
        end in ``.pdf``.
    """
    suffix = '.pdf'
    parts = filename.split('-')
    if len(parts) != 2 or not parts[1].endswith(suffix):
        return None
    # Strip only the trailing extension. str.replace would also delete any
    # ".pdf" that appears inside the label text, which could turn an unrelated
    # name such as "x-P.pdfK.pdf" into the valid label "PK".
    label = parts[1][:-len(suffix)]
    return label if label in TARGET_LABELS else "OTHER"

# Function to extract images from PDFs
def extract_images_from_path(pdf_path):
    """Render the PDF at ``pdf_path`` via ``pdf2image.convert_from_path``.

    The converter's result is returned unchanged. Nothing here limits it to a
    single page; callers iterate over whatever is returned.
    """
    return convert_from_path(pdf_path)

# Convert PIL image to numpy array
def pil_images_to_numpy(images):
    """Convert each image to RGB mode and return the results as a list of NumPy arrays."""
    return [np.array(picture.convert('RGB')) for picture in images]

# Preprocess images (grayscale and resize)
def preprocess_images(images, size):
    """Grayscale and resize every image, returning them as one NumPy array.

    Args:
        images: Iterable of image arrays; each is converted with
            ``cv2.COLOR_RGB2GRAY``, so RGB channel order is expected.
        size: Target size forwarded to ``cv2.resize`` as its ``dsize``
            argument (OpenCV reads that as ``(width, height)``).

    Returns:
        ``np.ndarray`` built from the list of resized grayscale images.
    """
    def _to_small_gray(rgb):
        # Same two steps as before: drop colour first, then scale.
        return cv2.resize(cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY), size)

    return np.array([_to_small_gray(rgb) for rgb in images])

# Function to create training data
def create_training_data(base_dir, img_size):
    """Build a shuffled list of ``[image, one_hot_label]`` samples from PDFs.

    Walks ``base_dir`` recursively. Every ``.pdf`` file whose name yields a
    label is rendered, converted to NumPy, grayscaled and resized to
    ``img_size``; each resulting page image becomes one sample carrying the
    file's label.

    Args:
        base_dir: Root directory to search.
        img_size: Size forwarded to ``preprocess_images``.

    Returns:
        List of two-element lists ``[processed_image, one_hot_label]``,
        shuffled in place with ``random.shuffle`` before being returned.
    """
    samples = []
    for folder, _, names in tqdm(os.walk(base_dir)):
        for name in names:
            if not name.endswith('.pdf'):
                continue
            label = extract_label_from_filename(name)
            if label is None:
                # File name does not follow the expected pattern; skip it.
                continue
            pdf_path = os.path.join(folder, name)
            print(pdf_path)
            pages = preprocess_images(
                pil_images_to_numpy(extract_images_from_path(pdf_path)),
                img_size,
            )
            one_hot_label = encode_label(label)
            # Every page of a file is paired with that file's label array.
            samples.extend([page, one_hot_label] for page in pages)
    shuffle(samples)
    return samples



In [None]:
# Model parameters
# Target size for every page image; it reaches cv2.resize unchanged, which
# interprets the pair as (width, height).
IMG_SIZE = (218, 218)
# Root folder scanned for labelled PDFs. Set the PDF_TRAINING_DIR environment
# variable to point at the data on another machine; the original hard-coded
# location remains the fallback so existing runs behave the same.
BASE_DIR = os.environ.get("PDF_TRAINING_DIR", r"D:\AIL transfer\Anyer\2025\scan now")
# File the model is loaded from and saved to.
MODEL_NAME = 'pdf_cnn_model.keras'

# Convert every labelled PDF under BASE_DIR into [image, one-hot label]
# samples, then report how many samples were produced.
training_data = create_training_data(base_dir=BASE_DIR, img_size=IMG_SIZE)
print(len(training_data))

In [4]:
import pickle

# Serialise the in-memory training_data list to disk (presumably so the PDF
# conversion does not have to be repeated — confirm).
# NOTE(review): this writes "data2025.pkl", but the loading cell that follows
# reads "data.pkl" — confirm which file name is intended.
with open("data2025.pkl", "wb") as file:
    pickle.dump(training_data, file)

In [None]:
# Replace training_data with the list stored in "data.pkl".
# NOTE(review): the dump cell before this one writes "data2025.pkl", not
# "data.pkl" — confirm this is the file you mean to train on.
# SECURITY: pickle.load can execute arbitrary code embedded in the file; only
# load pickles that you created yourself.
with open("data.pkl", "rb") as file:
    training_data = pickle.load(file)

In [None]:
# Separate features (X) and labels (Y).
# IMG_SIZE is handed to cv2.resize as (width, height), so every stored image
# array has shape (IMG_SIZE[1], IMG_SIZE[0]) = (rows, cols). Reshape in that
# order so a non-square IMG_SIZE is not silently scrambled; for the current
# square size the result is identical to before.
X = np.array([item[0] for item in training_data]).reshape(-1, IMG_SIZE[1], IMG_SIZE[0], 1)
Y = np.array([item[1] for item in training_data])

# Load the previously saved CNN from MODEL_NAME (no new model is defined here)
model = load_model(MODEL_NAME)

# Compile the loaded model with the same optimizer/loss/metric settings used
# when training from scratch
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Continue training on the loaded data, holding out 20% for validation
model.fit(X, Y, epochs=5, validation_split=0.2, batch_size=32)

In [10]:
# Save the current `model` to MODEL_NAME — the same path load_model() reads
# from, so an existing file there is replaced (Keras' default is to overwrite).
model.save(MODEL_NAME)

In [None]:
# import numpy as np

# Custom batch generator
# def batch_generator_from_data(training_data, batch_size):
#     size = len(training_data)
#     indices = np.arange(size)
    
#     while True:
#         np.random.shuffle(indices)  # Shuffle the data
#         for start in range(0, size, batch_size):
#             end = min(start + batch_size, size)
#             batch_indices = indices[start:end]
            
#             batch_x = np.array([extract_images_from_path(training_data[i][0]) for i in batch_indices])  # Images
#             batch_y = np.array([training_data[i][1] for i in batch_indices])  # Labels
            
#             yield batch_x, batch_y

# Function to process files in batches
# def batch_generator_from_files(file_paths, batch_size, img_size):
#     size = len(file_paths)
    
#     # Create batches manually
#     for i in range(0, size, batch_size):
#         print(i)
#         batch_files = file_paths[i:i+batch_size]
#         batch_x = []
#         batch_y = []
        
#         # Process each file in the batch
#         for pdf_file in batch_files:
#             label = os.path.basename(pdf_file)
#             label = extract_label_from_filename(label)
#             if label is None:
#                 continue  # Skip if label is not found
#             label = encode_label(label)
            
#             # Extract and preprocess image on-the-fly
#             images = extract_images_from_path(pdf_file)
#             images = pil_images_to_numpy(images)
#             images = preprocess_images(images, img_size)
            
#             for image in images:
#                 batch_x.append(image)
#                 batch_y.append(label)

#                 # Yield after processing each image to keep memory usage low
#                 if len(batch_x) == batch_size:
#                     yield np.array(batch_x), np.array(batch_y)
#                     batch_x = []
#                     batch_y = []
        
#         # Yield any remaining images in the final partial batch
#         if batch_x:
#             yield np.array(batch_x), np.array(batch_y)
        
        # yield np.array(batch_x), np.array(batch_y)

# Assuming your training_data list exists
# BATCH_SIZE = 32  # Define your batch size
# TRAIN_BATCH_SIZE=32
# # train_generator = batch_generator_from_data(training_data_all, BATCH_SIZE)
# train_files, val_files= train_test_split(training_data_all, test_size=0.2, random_state=42)
# train_generator = batch_generator_from_files(train_files, TRAIN_BATCH_SIZE, IMG_SIZE)
# val_generator = batch_generator_from_files(val_files, TRAIN_BATCH_SIZE, IMG_SIZE)


# Separate features (X) and labels (Y).
# IMG_SIZE is handed to cv2.resize as (width, height), so every stored image
# array has shape (IMG_SIZE[1], IMG_SIZE[0]) = (rows, cols). Both the reshape
# and the model's input_shape use that order so a non-square IMG_SIZE works;
# for the current square size nothing changes.
X = np.array([item[0] for item in training_data]).reshape(-1, IMG_SIZE[1], IMG_SIZE[0], 1)
Y = np.array([item[1] for item in training_data])

# Define the CNN model: three Conv2D + MaxPooling2D stages, then a dense
# classifier head with dropout.
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(IMG_SIZE[1], IMG_SIZE[0], 1)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(len(TARGET_LABELS), activation='softmax')  # Number of classes = length of TARGET_LABELS
])

# Compile the model; categorical_crossentropy matches the one-hot labels in Y
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model, holding out 20% of the samples for validation
model.fit(X, Y, epochs=20, validation_split=0.2, batch_size=32)

# # Train the model
# model.fit(X, Y, epochs=5, validation_split=0.1, batch_size=32)



# Shuffle the file paths (important for randomness during training)
# shuffle(pdf_files)

# Define batch size and image size
# BATCH_SIZE = 100  # Process data in batches of 100
# IMG_SIZE = (218, 218)  # Image size (width, height)
# TRAIN_BATCH_SIZE=20


# # Calculate total number of batches
# total_batches = len(training_data_all) // BATCH_SIZE

# # Train the model for each batch, 5 epochs per batch
# for batch_num in range(total_batches):
#     print(f"Training batch {batch_num + 1}/{total_batches}")
    
#     # Get the batch file paths
#     batch_files = training_data_all[batch_num * BATCH_SIZE:(batch_num + 1) * BATCH_SIZE]
#     print(len(batch_files))

#     train_files, val_files= train_test_split(batch_files, test_size=0.2, random_state=42)
    
#     # Create the batch generator for the current batch
#     train_generator = batch_generator_from_files(train_files, TRAIN_BATCH_SIZE, IMG_SIZE)
#     val_generator = batch_generator_from_files(val_files, TRAIN_BATCH_SIZE, IMG_SIZE)

#     print("data generated")

#     # Train on the current batch for 5 epochs
#     model.fit(train_generator, epochs=5, steps_per_epoch= 1, validation_data= val_generator, validation_steps= 1)
    
#     # Optionally save the model after each batch, if desired
#     # model.save(f"model_after_batch_{batch_num + 1}.keras")

#     # # Clear memory after training on this batch
#     # del batch_files, train_generator
#     # gc.collect()  # Call garbage collection to free memory

#     # # Optionally, you can print memory status to monitor usage:
#     # import psutil
#     # print(f"Memory usage after batch {batch_num + 1}: {psutil.virtual_memory().percent}%")

# Save the model trained in this cell to MODEL_NAME. This is the same path
# the fine-tuning cell loads from, so an existing file there is replaced
# (Keras' default is to overwrite).
model.save(MODEL_NAME)