In [19]:
import json
import pandas as pd
import os
from tqdm import tqdm

def process_json_files(folder_path, max_files_per_folder=50000):
    """Collect style labels from the JSON files under ``folder_path`` into a DataFrame.

    The directory tree is walked recursively. In every directory at most
    ``max_files_per_folder`` files whose name ends with ``.json`` are kept.
    Directories and file names are visited in sorted order so that the
    capped subset and the resulting row order are reproducible.

    Args:
        folder_path: Root directory to search.
        max_files_per_folder: Upper bound on the number of JSON files taken
            from each individual directory.

    Returns:
        pandas.DataFrame with the columns ``'Image ID'``, ``'Image Filename'``,
        ``'Style'`` and ``'Substyle'``, one row per successfully parsed file.
        Files that raise any ``Exception`` while being read are reported with
        ``print`` and skipped.
    """
    data_list = []
    json_files = []

    # Walk the folder and all of its sub-folders looking for JSON files.
    for root, dirs, files in os.walk(folder_path):
        # os.walk yields names in arbitrary OS order; sorting in place fixes the
        # traversal order, and sorting the files fixes which ones survive the cap.
        dirs.sort()
        folder_files = [os.path.join(root, file) for file in sorted(files) if file.endswith('.json')]
        # Limit the number of files taken from each folder.
        json_files.extend(folder_files[:max_files_per_folder])

    # Show progress with tqdm while the files are parsed one by one.
    for file_path in tqdm(json_files, desc="Processing JSON files"):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Fallback labels, used when no non-empty '스타일' list is present.
            style = 'Unknown'
            substyle = 'None'

            dataset_details = data['데이터셋 정보']['데이터셋 상세설명']
            if '라벨링' in dataset_details:
                labeling_info = dataset_details['라벨링']

                # Only the first entry of the '스타일' list is used.
                if '스타일' in labeling_info and labeling_info['스타일']:
                    style_info = labeling_info['스타일'][0]
                    style = style_info['스타일']
                    # A missing '서브스타일' key is replaced by the string 'None'.
                    substyle = style_info.get('서브스타일', 'None')

            # Image file name and identifier, each defaulting to 'Unknown'.
            image_info = data['이미지 정보']
            image_filename = image_info.get('이미지 파일명', 'Unknown')
            image_id = image_info.get('이미지 식별자', 'Unknown')

            data_list.append([image_id, image_filename, style, substyle])

        except Exception as e:
            # Best effort: report the problematic file and move on to the next one.
            print(f"Error processing file {file_path}: {e}")
            continue

    # Build the DataFrame from the collected rows.
    df = pd.DataFrame(data_list, columns=['Image ID', 'Image Filename', 'Style', 'Substyle'])

    return df

# Input folder with the label JSON files and the CSV file the labels are exported to.
folder_path = '../json/train'
output_csv_path = 'train_style_substyle_labels.csv'

# Parse the JSON files into a DataFrame and write it out without the index column.
df = process_json_files(folder_path)
df.to_csv(output_csv_path, index=False)

print(f"Data saved to {output_csv_path}")


Processing JSON files:   0%|          | 1503/397555 [01:30<6:37:57, 16.59it/s] 


KeyboardInterrupt: 

In [21]:
import json
import pandas as pd
import os
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

def process_single_file(file_path):
    """Read one label JSON file and return its row of label data.

    Args:
        file_path: Path of a UTF-8 encoded JSON file.

    Returns:
        ``[image_id, image_filename, style, substyle]`` on success. ``style``
        is ``'Unknown'`` and ``substyle`` is ``'None'`` when the file has no
        '라벨링' section or no non-empty '스타일' list. The image fields default
        to ``'Unknown'`` when their keys are missing. If any ``Exception`` is
        raised while reading or indexing the file, a message is printed and
        ``None`` is returned.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            record = json.load(handle)

        # Start from the fallback labels and overwrite them only when a style entry exists.
        style, substyle = 'Unknown', 'None'

        details = record['데이터셋 정보']['데이터셋 상세설명']
        if '라벨링' in details:
            labels = details['라벨링']
            if '스타일' in labels and labels['스타일']:
                # Only the first entry of the style list is used.
                first_entry = labels['스타일'][0]
                style = first_entry['스타일']
                substyle = first_entry.get('서브스타일', 'None')

        # Image file name and identifier.
        image_info = record['이미지 정보']
        image_filename = image_info.get('이미지 파일명', 'Unknown')
        image_id = image_info.get('이미지 식별자', 'Unknown')

        return [image_id, image_filename, style, substyle]

    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        return None

def process_json_files(folder_path, max_files_per_folder=50000):
    """Parse the JSON files under ``folder_path`` in a thread pool and return a DataFrame.

    The directory tree is walked recursively. In every directory at most
    ``max_files_per_folder`` files whose name ends with ``.json`` are kept.
    Directories and file names are visited in sorted order, and rows are
    gathered in submission order, so the capped subset and the row order of
    the result do not depend on filesystem listing order or thread timing.

    Args:
        folder_path: Root directory to search.
        max_files_per_folder: Upper bound on the number of JSON files taken
            from each individual directory.

    Returns:
        pandas.DataFrame with the columns ``'Image ID'``, ``'Image Filename'``,
        ``'Style'`` and ``'Substyle'``. Files for which ``process_single_file``
        returns a falsy result (it returns ``None`` on failure) are left out.
    """
    json_files = []

    # Walk the folder and all of its sub-folders looking for JSON files.
    for root, dirs, files in os.walk(folder_path):
        # os.walk yields names in arbitrary OS order; sorting in place fixes the
        # traversal order, and sorting the files fixes which ones survive the cap.
        dirs.sort()
        folder_files = [os.path.join(root, file) for file in sorted(files) if file.endswith('.json')]
        # Limit the number of files taken from each folder.
        json_files.extend(folder_files[:max_files_per_folder])

    # Parse the files concurrently with a ThreadPoolExecutor.
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_single_file, file_path) for file_path in json_files]

        # as_completed only drives the progress bar; results are read afterwards
        # in submission order so the output row order is stable between runs.
        for _ in tqdm(as_completed(futures), total=len(futures), desc="Processing JSON files"):
            pass

    data_list = [result for result in (future.result() for future in futures) if result]

    # Build the DataFrame from the collected rows.
    df = pd.DataFrame(data_list, columns=['Image ID', 'Image Filename', 'Style', 'Substyle'])

    return df

# Input folder with the label JSON files and the CSV file the labels are exported to.
folder_path = '../json/train'
output_csv_path = 'train_style_substyle_labels.csv'

# Parse the JSON files into a DataFrame and write it out without the index column.
df = process_json_files(folder_path)
df.to_csv(output_csv_path, index=False)

print(f"Data saved to {output_csv_path}")


Processing JSON files: 100%|██████████| 397555/397555 [30:18<00:00, 218.63it/s] 


Data saved to train_style_substyle_labels.csv


In [22]:
# Display the label DataFrame held in df.
df

Unnamed: 0,Image ID,Image Filename,Style,Substyle
0,1015168,1107-14428-14428 (2-1)-20200129-1 (652).JPG,로맨틱,
1,1013256,"없음-1539-1539-set166(1)-다다(대행,링링) 4.14-8-IMG_89...",로맨틱,
2,1008958,없음-1490-1490-bl674(3)-중국사입(1030A#) 4.14-1-IMG_...,로맨틱,스트리트
3,1015167,1107-14428-14428 (2-1)-20200129-1 (651).JPG,로맨틱,
4,1013255,"없음-1539-1539-set166(1)-다다(대행,링링) 4.14-8-IMG_89...",로맨틱,
...,...,...,...,...
397550,496463,ns213k05_01.jpg,페미닌,리조트
397551,496471,ns213k05_s-9.jpg,페미닌,리조트
397552,492218,jk2328k06_01.jpg,페미닌,
397553,491465,jk2220k03-600_1.jpg,페미닌,로맨틱


In [2]:
import pandas as pd
# Reload the exported label table from disk and preview its first rows.
df = pd.read_csv("train_style_substyle_labels.csv")
df.head()

Unnamed: 0,Image ID,Image Filename,Style,Substyle
0,1015168,1107-14428-14428 (2-1)-20200129-1 (652).JPG,로맨틱,
1,1013256,"없음-1539-1539-set166(1)-다다(대행,링링) 4.14-8-IMG_89...",로맨틱,
2,1008958,없음-1490-1490-bl674(3)-중국사입(1030A#) 4.14-1-IMG_...,로맨틱,스트리트
3,1015167,1107-14428-14428 (2-1)-20200129-1 (651).JPG,로맨틱,
4,1013255,"없음-1539-1539-set166(1)-다다(대행,링링) 4.14-8-IMG_89...",로맨틱,


In [6]:
import pandas as pd
# Replace df with the contents of "mumumumu.csv" and preview its first rows.
# NOTE(review): the cells visible here do not show how "mumumumu.csv" was produced — confirm its provenance.
df = pd.read_csv("mumumumu.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Image ID,Image Filename,Style,Substyle
0,0,1000,VanillaSpoon_090_01.jpg,페미닌,
1,1,100026,Ave29th_160_02.jpg,로맨틱,
2,2,100027,Ave29th_160_03.jpg,로맨틱,
3,3,100029,Ave29th_160_05.jpg,로맨틱,
4,4,1000316,20180309(대행)13-jk1004(829)IMG_0006.jpg,로맨틱,


In [3]:
# Summarize the columns, non-null counts and dtypes of df.
# NOTE(review): the recorded output lists 397555 rows and this cell's execution count (3) is lower than
# that of the "mumumumu.csv" cell above (6), so the output appears to describe the frame read from
# "train_style_substyle_labels.csv" — re-run the notebook top to bottom to confirm.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397555 entries, 0 to 397554
Data columns (total 4 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   Image ID        397555 non-null  int64 
 1   Image Filename  397555 non-null  object
 2   Style           397555 non-null  object
 3   Substyle        206721 non-null  object
dtypes: int64(1), object(3)
memory usage: 12.1+ MB


In [4]:
# List the distinct values of the Style column.
df['Style'].unique()

array(['페미닌', '로맨틱', '리조트', '모던', '스트리트', '스포티', '톰보이', '매니시', '젠더리스'],
      dtype=object)

In [8]:
import pandas as pd

# Load the label table ('train_style_substyle_labels.csv' was the previously used input).
file_path = 'mumumumu.csv'
df = pd.read_csv(file_path)

# Distinct values of the Style and Substyle columns.
style_unique = df['Style'].unique()
substyle_unique = df['Substyle'].unique()

# Substyle values that never occur as a Style value.
invalid_substyles = [candidate for candidate in substyle_unique if candidate not in style_unique]

# Replace every such Substyle with pd.NA and keep all other values as they are.
df['Substyle'] = df['Substyle'].apply(
    lambda value: value if value not in invalid_substyles else pd.NA
)

# Save the modified DataFrame to a new CSV file.
output_csv_path = 'train_style_substyle_labels_modified.csv'
df.to_csv(output_csv_path, index=False)



In [9]:
# Read 'train_style_substyle_labels_modified.csv' back in and list its distinct Substyle values.
df_new = pd.read_csv('train_style_substyle_labels_modified.csv')
df_new['Substyle'].unique()

array([nan, '스트리트', '모던', '페미닌', '리조트', '로맨틱', '젠더리스', '매니시', '톰보이',
       '스포티'], dtype=object)

In [13]:
# Number of distinct Substyle values; a missing value counts as one entry of the unique array.
df_new['Substyle'].unique().size

10

In [11]:
# Select the row(s) whose Image ID equals 1015168.
df_new.loc[df_new['Image ID'] == 1015168]

Unnamed: 0.1,Unnamed: 0,Image ID,Image Filename,Style,Substyle
1907,1907,1015168,1107-14428-14428 (2-1)-20200129-1 (652).JPG,로맨틱,


In [12]:
# Display the reloaded frame df_new.
df_new

Unnamed: 0.1,Unnamed: 0,Image ID,Image Filename,Style,Substyle
0,0,1000,VanillaSpoon_090_01.jpg,페미닌,
1,1,100026,Ave29th_160_02.jpg,로맨틱,
2,2,100027,Ave29th_160_03.jpg,로맨틱,
3,3,100029,Ave29th_160_05.jpg,로맨틱,
4,4,1000316,20180309(대행)13-jk1004(829)IMG_0006.jpg,로맨틱,
...,...,...,...,...,...
152459,152459,999747,20180309(대행)13-jk1004(260)IMG_0006.jpg,매니시,스트리트
152460,152460,999756,20180309(대행)13-jk1004(269)IMG_0006.jpg,톰보이,
152461,152461,999761,20180309(대행)13-jk1004(274)IMG_0006.jpg,톰보이,
152462,152462,999762,20180309(대행)13-jk1004(275)IMG_0006.jpg,톰보이,


In [14]:
import pandas as pd

# Load the label table written by the substyle-filtering step.
file_path = "train_style_substyle_labels_modified.csv"
data = pd.read_csv(file_path)

# Missing substyles get the literal label 'None'. The result is assigned back to the column:
# calling fillna(inplace=True) on a column selection emits a chained-assignment warning on
# pandas 2.x and leaves `data` unchanged once Copy-on-Write is enabled.
data['Substyle'] = data['Substyle'].fillna('None')

# Get unique styles and substyles
styles = data['Style'].unique()
substyles = data['Substyle'].unique()

# One 0/1 indicator column per distinct label, styles first, followed by the substyles that are
# not also styles. A label used both as a style and as a substyle shares a single column, so a
# row is marked 1 when either its Style or its Substyle equals that label.
# Comparing whole columns replaces the former per-row iterrows()/.at loop with the same result.
# NOTE(review): assumes no label collides with an existing column name such as 'Image ID' — confirm.
label_names = list(dict.fromkeys([*styles, *substyles]))
for label in label_names:
    data[label] = ((data['Style'] == label) | (data['Substyle'] == label)).astype('int64')

# Drop the original 'Style' and 'Substyle' columns
multi_label_df = data.drop(['Style', 'Substyle'], axis=1)


In [7]:
# Display the multi-label frame.
# NOTE(review): this cell's execution count (7) is lower than that of the cell that builds
# multi_label_df (14), and the recorded output shows row labels up to 397553, so the output
# looks stale — re-run the notebook top to bottom to confirm.
multi_label_df

Unnamed: 0,Image ID,Image Filename,로맨틱,리조트,톰보이,매니시,젠더리스,모던,소피스트케이티드,스트리트,스포티,컨트리,클래식,페미닌,None
0,1015168,1107-14428-14428 (2-1)-20200129-1 (652).JPG,1,0,0,0,0,0,0,0,0,0,0,0,1
1,1013256,"없음-1539-1539-set166(1)-다다(대행,링링) 4.14-8-IMG_89...",1,0,0,0,0,0,0,0,0,0,0,0,1
2,1008958,없음-1490-1490-bl674(3)-중국사입(1030A#) 4.14-1-IMG_...,1,0,0,0,0,0,0,1,0,0,0,0,0
3,1015167,1107-14428-14428 (2-1)-20200129-1 (651).JPG,1,0,0,0,0,0,0,0,0,0,0,0,1
4,1013255,"없음-1539-1539-set166(1)-다다(대행,링링) 4.14-8-IMG_89...",1,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
397550,496463,ns213k05_01.jpg,0,1,0,0,0,0,0,0,0,0,0,1,0
397551,496471,ns213k05_s-9.jpg,0,1,0,0,0,0,0,0,0,0,0,1,0
397552,492218,jk2328k06_01.jpg,0,0,0,0,0,0,0,0,0,0,0,1,1
397553,491465,jk2220k03-600_1.jpg,1,0,0,0,0,0,0,0,0,0,0,1,0


In [15]:
# Save the multi-label table without the positional index, as the other to_csv calls in this
# notebook do. Writing the index adds an unnamed first column that read_csv later loads back
# as 'Unnamed: 0'.
multi_label_df.to_csv('multi_label_dataset_cut.csv', index=False)