### 1차 데이터 전처리

In [1]:
import os
import json
from PIL import Image
from tqdm import tqdm
import time

In [46]:
def crop_and_resize_and_save_face(json_path, file_dir_path, save_dir, label, cropped_shape = (256,256)):
    #target데이터
    with open(json_path, 'r') as json_file:
        data = json.load(json_file)
    
    data_dict = {item["filename"]: item for item in data}

    #저장 될 폴더가 존재하지 않으면 생성
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    processed_count = 0
    error_count = 0

    for filename in tqdm(os.listdir(file_dir_path),desc=f"processing label {label}"):
        file_path = os.path.join(file_dir_path, filename)

        if filename in data_dict:
            item = data_dict[filename]

            try:
                with Image.open(file_path) as img:
                    face_coords = item["annot_A"]["boxes"]
                    left = face_coords["minX"]
                    right = face_coords["maxX"]
                    top = face_coords["minY"]
                    bottom = face_coords["maxY"]
                    #유효성 검사
                    if left >= 0 and top >= 0 and right <= img.width and bottom <= img.height and right > left and bottom > top:
                        cropped_img = img.crop((left, top, right, bottom))

                        if cropped_img.size != (0, 0):
                            resized_img = cropped_img.resize(cropped_shape)
                            temp_filename = f"{label}_{filename}"
                            resized_img.save(os.path.join(save_dir,temp_filename))
                            processed_count += 1
                            # print(f"{processed_count} - Successfully cropped and saved {temp_filename}")
                        else:
                            # print(f"Empty cropped image for {filename}")
                            error_count += 1
                    else:
                        # print(f"Invalid crop coordinates for {filename}")
                        error_count += 1
            except Exception as e:
                print(f"Error: {e}")
                error_count += 1
    
    return processed_count , error_count

In [47]:
classes = ["분노","슬픔","불안","상처","당황","기쁨","중립"]
processed_list = []
error_list = []
start = time.time()
for class_index,class_ in enumerate(classes):
    json_path = f"Training\[라벨]EMOIMG_{class_}_TRAIN\img_emotion_training_data({class_}).json"
    file_dir_path = f"Training\[원천]EMOIMG_{class_}_TRAIN_01"
    save_dir = "Training/Cropped_Images"
    (processed_count, error_count) = crop_and_resize_and_save_face(json_path, file_dir_path, save_dir,label = class_index,cropped_shape=(256,256))
    print(processed_count, error_count)
    processed_list.append(processed_count)
    error_list.append(error_count)
end = time.time()

print(f"걸린 시간: {round(end-start,2)}s")
for class_,processed_count,error_count in zip(classes,processed_list,error_list):
    print(class_,f"total_count = {processed_count}, error_count = {error_count}")

processing label 0: 100%|██████████| 16234/16234 [12:16<00:00, 22.04it/s] 


16117 117


processing label 1: 100%|██████████| 15874/15874 [12:10<00:00, 21.74it/s] 


15787 87


processing label 2: 100%|██████████| 16133/16133 [12:17<00:00, 21.87it/s] 


16046 87


processing label 3: 100%|██████████| 16023/16023 [12:15<00:00, 21.79it/s] 


15930 93


processing label 4:  23%|██▎       | 3712/16170 [02:57<09:03, 22.93it/s]

Error: broken data stream when reading image file


processing label 4: 100%|██████████| 16170/16170 [12:18<00:00, 21.89it/s] 


16081 89


processing label 5: 100%|██████████| 16072/16072 [12:19<00:00, 21.74it/s] 


15984 88


processing label 6:  54%|█████▎    | 8668/16197 [06:35<04:18, 29.11it/s]  

Error: broken data stream when reading image file


processing label 6:  57%|█████▋    | 9182/16197 [06:56<06:07, 19.11it/s]

Error: broken data stream when reading image file
Error: broken data stream when reading image file


processing label 6: 100%|██████████| 16197/16197 [12:05<00:00, 22.34it/s]


16074 123
걸린 시간: 5151.46s
분노 total_count = 16117, error_count = 117
슬픔 total_count = 15787, error_count = 87
불안 total_count = 16046, error_count = 87
상처 total_count = 15930, error_count = 93
당황 total_count = 16081, error_count = 89
기쁨 total_count = 15984, error_count = 88
중립 total_count = 16074, error_count = 123


In [9]:
import json
json_path = "데이터 전처리\원천 데이터\[라벨]EMOIMG_상처_TRAIN\img_emotion_training_data(상처).json"
with open(json_path, 'r') as json_file:
    data = json.load(json_file)
print(data[1])

{'filename': 'fea9f022fd842f975b69cb66061cd196c2801f54055e68bbed2e3de9868b1c79_남_20_상처_숙박 및 거주공간_20210122135731-010-036.jpeg', 'gender': '남', 'age': 20, 'isProf': '일반인', 'faceExp_uploader': '상처', 'bg_uploader': '숙박 및 거주공간', 'annot_A': {'boxes': {'maxX': 2474.23, 'maxY': 1744.4131951307343, 'minX': 1772.8025999999995, 'minY': 873.0751803107773}, 'faceExp': '중립', 'bg': '숙박 및 거주공간'}, 'annot_B': {'boxes': {'maxX': 2474.2299999999996, 'maxY': 1667.922966471545, 'minX': 1772.8025999999995, 'minY': 842.0446397361302}, 'faceExp': '당황', 'bg': '숙박 및 거주공간'}, 'annot_C': {'boxes': {'maxX': 2474.230000000002, 'maxY': 1766.9074109310811, 'minX': 1772.8026000000011, 'minY': 842.5360180162875}, 'faceExp': '당황', 'bg': '숙박 및 거주공간'}}


In [8]:
print(data[40000])

{'filename': '3df5a5a4ca0b4dd57db2d3ac1284c74c5a7b66e0d47ebc5522d5361de8940ada_여_30_기쁨_상업시설&점포&시장_20210122161328-003-022.jpg', 'gender': '여', 'age': 30, 'isProf': '일반인', 'faceExp_uploader': '기쁨', 'bg_uploader': '상업시설/점포/시장', 'annot_A': {'boxes': {'maxX': 2074.0167999999994, 'maxY': 2018.5946, 'minX': 1275.454, 'minY': 918.8821}, 'faceExp': '기쁨', 'bg': '상업시설/점포/시장'}, 'annot_B': {'boxes': {'maxX': 2074.0168000000003, 'maxY': 2018.5946, 'minX': 1275.4540000000002, 'minY': 918.8821}, 'faceExp': '기쁨', 'bg': '상업시설/점포/시장'}, 'annot_C': {'boxes': {'maxX': 2074.3103785516796, 'maxY': 2018.5589175816149, 'minX': 1312.9604214483209, 'minY': 912.617782418385}, 'faceExp': '기쁨', 'bg': '상업시설/점포/시장'}}


In [14]:
import json
import os
from PIL import Image
from tqdm import tqdm
import time

def crop_and_resize_and_save_face_2(json_path, file_dir_path, save_dir, cropped_shape=(256, 256)):
    with open(json_path, 'r') as json_file:
        data = json.load(json_file)
    
    data_dict = {item["filename"]: item for item in data}

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    processed_count = 0
    error_count = 0

    for filename in tqdm(os.listdir(file_dir_path), desc="Processing"):
        file_path = os.path.join(file_dir_path, filename)

        if filename in data_dict:
            item = data_dict[filename]

            # 업로더와 검수자 간의 일치 여부 확인
            uploader_emotion = item["faceExp_uploader"]
            annotations = [item.get("annot_A", {}).get("faceExp", ""),
                           item.get("annot_B", {}).get("faceExp", ""),
                           item.get("annot_C", {}).get("faceExp", "")]
            
            # 일치하는 감정의 수 확인
            matching_emotions = sum(emotion == uploader_emotion for emotion in annotations)

            if matching_emotions >= 2: #2명 이상의 검수자가 업로더의 표정과 동의하는 경우만 사용
                try:
                    with Image.open(file_path) as img:
                        face_coords = item["annot_A"]["boxes"]
                        left = face_coords["minX"]
                        right = face_coords["maxX"]
                        top = face_coords["minY"]
                        bottom = face_coords["maxY"]

                        # 유효성 검사
                        if 0 <= left < right <= img.width and 0 <= top < bottom <= img.height:
                            cropped_img = img.crop((left, top, right, bottom))

                            if cropped_img.size != (0, 0):
                                resized_img = cropped_img.resize(cropped_shape)
                                temp_filename = f"{filename[:-5]}_cropped.jpg"
                                resized_img.save(os.path.join(save_dir, temp_filename))
                                processed_count += 1
                            else:
                                error_count += 1
                        else:
                            error_count += 1
                except Exception as e:
                    print(f"Error: {e}")
                    error_count += 1
    
    return processed_count, error_count


In [9]:
import os
import json
cropped_1 = "데이터 전처리/2차 검수/2차_검수_종합"
classes = ["분노","슬픔","불안","상처","당황","기쁨","중립"]
classes_dict = []

for class_ in classes:
    json_path =f"데이터 전처리\원천 데이터\[라벨]EMOIMG_{class_}_TRAIN\img_emotion_training_data({class_}).json"

    with open(json_path, 'r') as json_file:
        data = json.load(json_file)
    classes_dict.append({item["filename"].split(".")[0]: item for item in data})
    
# for img in os.listdir(cropped_1):
#     print(img)

In [32]:
from collections import Counter
def find_majority_agreement(annotations):
    """
    검수자들의 감정 중 최소 두 명 이상이 동의하는 감정을 찾는 함수.
    """
    emotion_counts = Counter(annotations)
    for emotion, count in emotion_counts.items():
        if count >= 2:
            return emotion
    return None

In [36]:
from tqdm import tqdm
two_or_more_agree_with_uploader = [0 for _ in range(7)]
two_or_more = [0 for _ in range(7)]
all_agree = [0 for _ in range(7)]

classes = ["분노","슬픔","불안","상처","당황","기쁨","중립"]

for img in tqdm(os.listdir(cropped_1)):
    temp = (img.split(".")[0]).split("_")
    filename = "_".join(temp[1:])
    label = int(temp[0])
    item = classes_dict[label][filename]

    # 업로더의 감정을 검수자 두명 이상이서 동의
    uploader_emotion = item["faceExp_uploader"]
    annotations = [item.get("annot_A", {}).get("faceExp", ""),
                    item.get("annot_B", {}).get("faceExp", ""),
                    item.get("annot_C", {}).get("faceExp", "")]
    majority = find_majority_agreement(annotations)
    if majority and majority != "알수없음":
        if uploader_emotion == majority:
            if annotations.count(majority) ==  3:
                all_agree[label] += 1
            two_or_more_agree_with_uploader[label] += 1
        two_or_more[classes.index(majority)] += 1


print(two_or_more_agree_with_uploader)
print(two_or_more)
print(all_agree)


100%|██████████| 73326/73326 [00:00<00:00, 102398.53it/s]

[7439, 6782, 4124, 2103, 8243, 10326, 10387]
[8834, 9675, 6084, 3544, 10628, 10872, 13420]
[4995, 4246, 1447, 571, 5372, 9699, 7910]





### 2차 데이터 전처리



In [37]:
classes = ["분노","슬픔","불안","상처","당황","기쁨","중립"]
processed_list = []
error_list = []
start = time.time()
for class_index,class_ in enumerate(classes):
    json_path = f"데이터 전처리\원천 데이터\[라벨]EMOIMG_{class_}_TRAIN\img_emotion_training_data({class_}).json"
    file_dir_path = f"데이터 전처리\원천 데이터\[원천]EMOIMG_{class_}_TRAIN_04"
    save_dir = "Training/Cropped_Images2"
    (processed_count, error_count) = crop_and_resize_and_save_face(json_path, file_dir_path, save_dir,label = class_index,cropped_shape=(256,256))
    print(processed_count, error_count)
    processed_list.append(processed_count)
    error_list.append(error_count)
end = time.time()

print(f"걸린 시간: {round(end-start,2)}s")
for class_,processed_count,error_count in zip(classes,processed_list,error_list):
    print(class_,f"total_count = {processed_count}, error_count = {error_count}")

NameError: name 'time' is not defined