In [1]:
import os
import cv2
import json
import numpy as np
import threading
import multiprocessing

import tensorflow as tf
from keras.api._v2.keras.preprocessing.image import ImageDataGenerator
from keras.api._v2.keras.preprocessing.text import Tokenizer
from keras.api._v2.keras.preprocessing.sequence import pad_sequences

2024-04-15 12:36:42.206912: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-15 12:36:42.247572: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-15 12:36:42.247600: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-15 12:36:42.248880: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-15 12:36:42.256020: I tensorflow/core/platform/cpu_feature_guar

In [2]:
class load_floodnet():
    """Load resized training images and their question/answer records.

    Images are read from a directory tree, resized in parallel worker
    processes and collected in a manager-backed dict keyed by file name.
    The question JSON is then joined to the images via each record's
    ``"Image_ID"`` field.
    """

    # Default dataset locations; both can be overridden per call.
    DEFAULT_IMAGE_DIR = "/work/floodnet_dataset/Images/Train_Image"
    DEFAULT_QUESTION_PATH = '/work/floodnet_dataset/Questions/Training Question.json'
    # Target size handed to cv2.resize (kept at the value the notebook used).
    IMAGE_SIZE = (244, 244)

    def __init__(self):
        # Manager-backed dict so that writes made in worker processes are
        # visible to the parent process.
        self.resized_images_dict = multiprocessing.Manager().dict()
        # A threading.Lock only guards threads inside one process; the workers
        # here are processes, so use the multiprocessing equivalent.
        self.lock = multiprocessing.Lock()

    def get_all_filenames(self, directory):
        """Return the paths of all files found under ``directory`` (recursive)."""
        return [
            os.path.join(root, file)
            for root, _dirs, files in os.walk(directory)
            for file in files
        ]

    def resize_and_store_images(self, image_paths):
        """Resize every image in ``image_paths`` and store it in the shared dict.

        The dict key is the file's base name. Files that OpenCV cannot decode
        are skipped, so one bad file does not abort the rest of the chunk.
        """
        for path in image_paths:
            image = cv2.imread(path)
            if image is None:
                # cv2.imread signals an unreadable/non-image file by returning
                # None; cv2.resize would raise on it and kill this worker.
                continue
            resized_image = cv2.resize(image, self.IMAGE_SIZE)
            file_name = os.path.basename(path)
            with self.lock:
                self.resized_images_dict[file_name] = resized_image

    def process_images_parallel(self, image_dir=None):
        """Resize all images under ``image_dir`` using one process per chunk.

        Args:
            image_dir: Directory to scan; defaults to ``DEFAULT_IMAGE_DIR``.

        Returns:
            The shared dict mapping file name -> resized image.
        """
        if image_dir is None:
            image_dir = self.DEFAULT_IMAGE_DIR
        image_paths = self.get_all_filenames(image_dir)
        if not image_paths:
            # Nothing to do; also avoids a zero chunk size below.
            return self.resized_images_dict

        # Never use more workers than there are images, and round the chunk
        # size up so exactly `num_workers` (or fewer) chunks are produced.
        num_workers = min(multiprocessing.cpu_count(), len(image_paths))
        chunk_size = -(-len(image_paths) // num_workers)
        chunks = [
            image_paths[start:start + chunk_size]
            for start in range(0, len(image_paths), chunk_size)
        ]

        processes = []
        for i, chunk in enumerate(chunks):
            process = multiprocessing.Process(
                target=self.resize_and_store_images,
                args=(chunk,),
                name=f"Process-{i+1}",
            )
            processes.append(process)
            process.start()

        for process in processes:
            process.join()
        return self.resized_images_dict

    def load_data(self, image_dir=None, question_path=None):
        """Load images, questions and answers as three aligned lists.

        Args:
            image_dir: Image directory; defaults to ``DEFAULT_IMAGE_DIR``.
            question_path: Question JSON file; defaults to
                ``DEFAULT_QUESTION_PATH``.

        Returns:
            ``(images, questions, answers)``. Entry ``i`` of each list belongs
            to the same question record. ``images[i]`` is ``None`` when the
            record's ``"Image_ID"`` has no matching resized image.
        """
        if question_path is None:
            question_path = self.DEFAULT_QUESTION_PATH

        self.resized_images_dict = self.process_images_parallel(image_dir)
        # Take one local snapshot instead of a membership test plus a fetch
        # per question against the shared dict.
        images_by_name = self.resized_images_dict.copy()

        # Read the questions and labels from the JSON file and pair each
        # record with its image.
        with open(question_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        images, questions, answers = [], [], []
        for value in data.values():
            images.append(images_by_name.get(value["Image_ID"]))
            questions.append(value["Question"])
            answers.append(value["Ground_Truth"])

        # Return the three lists: images, questions and answers.
        return images, questions, answers

In [3]:
def preprocess_data(images, questions, answers, num_answers):
    """Turn raw images, questions and answers into a ``tf.data.Dataset``.

    Args:
        images: Sequence of equally shaped image arrays, one per question.
        questions: Sequence of question strings.
        answers: Sequence of ground-truth answers; each is converted with
            ``str()`` before being label-encoded.
        num_answers: Width of the one-hot answer vectors (``num_classes``
            passed to ``to_categorical``).

    Returns:
        A dataset of ``(image, question_tokens, answer_one_hot)`` slices.

    Raises:
        ValueError: If any image is ``None`` or if there are more distinct
            answers than ``num_answers``.
    """
    # --- Image preprocessing ---
    missing = sum(image is None for image in images)
    if missing:
        raise ValueError(
            f"{missing} of {len(images)} samples have no image (None); "
            "remove those samples before preprocessing"
        )
    images = np.stack(images, axis=0)  # (num_images, h, w, channels)
    min_value = np.min(images)
    max_value = np.max(images)
    value_range = max_value - min_value
    if value_range == 0:
        # All pixels are equal: min-max scaling would divide by zero.
        image_data = np.zeros(images.shape, dtype=np.float64)
    else:
        image_data = (images - min_value) / value_range

    # --- Question preprocessing ---
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(questions)
    sequences = tokenizer.texts_to_sequences(questions)
    # Pad to the longest question. The previous maxlen=len(questions) padded
    # every question to the *number of questions*, not to a token length.
    question_data = pad_sequences(sequences)

    # --- Answer preprocessing ---
    # Answers may be bools/numbers; convert everything to strings first.
    answers_str = [str(item) for item in answers]
    # Sort the labels so the label -> index mapping is the same on every run
    # (iteration order of a set of strings is not stable across processes).
    label_map = {label: i for i, label in enumerate(sorted(set(answers_str)))}
    if len(label_map) > num_answers:
        raise ValueError(
            f"found {len(label_map)} distinct answers but num_answers={num_answers}"
        )
    # Encode each string label as its integer index.
    answers_encoded = [label_map[label] for label in answers_str]

    answer_data = tf.keras.utils.to_categorical(answers_encoded, num_classes=num_answers)

    return tf.data.Dataset.from_tensor_slices((image_data, question_data, answer_data))

In [4]:
# Load the images, questions and answers via load_floodnet.load_data().
temp = load_floodnet()
images, questions, answers = temp.load_data()

# Build the tf.data.Dataset; 100 is passed as num_answers, i.e. the one-hot
# width (num_classes) used for the encoded answers.
dataset = preprocess_data(images, questions, answers, 100)

2024-04-15 12:36:59.000116: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 79091 MB memory:  -> device: 0, name: NVIDIA A100 80GB PCIe, pci bus id: 0000:b1:00.0, compute capability: 8.0
