In [78]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [79]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import cv2
from google.colab.patches import cv2_imshow

from tqdm import tqdm

import time
import shutil
import os
import warnings
warnings.filterwarnings(action='ignore')

In [80]:
csv_path = '/content/drive/MyDrive/project3/data/traindata/read_file'

csv_files = os.listdir(csv_path)

# Filter out the CSV files
csv_files = [file for file in csv_files if file.endswith('.csv')]

# Initialize an empty list to hold the dataframes
temp = []

# Read each CSV file and append to the list
for csv_file in csv_files:
    file_path = os.path.join(csv_path, csv_file)
    df = pd.read_csv(file_path)
    temp.append(df)


In [81]:
# Concatenate all dataframes
df = pd.concat(temp, ignore_index=True)
df = df[ (df['type'] == 'crop') | (df['type'] == 'raw')]
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,file_path,type
0,/content/drive/MyDrive/project3/data/traindata...,raw
1,/content/drive/MyDrive/project3/data/traindata...,raw
2,/content/drive/MyDrive/project3/data/traindata...,raw
3,/content/drive/MyDrive/project3/data/traindata...,raw
4,/content/drive/MyDrive/project3/data/traindata...,raw


In [82]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150283 entries, 0 to 150282
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   file_path  150283 non-null  object
 1   type       150283 non-null  object
dtypes: object(2)
memory usage: 2.3+ MB


In [83]:
# 중분류
df['file_path'][0].replace('/content/drive/MyDrive/project3/data/traindata/raw_image/', '').split('/')[0]

'구이'

In [84]:
# 소분류
df['file_path'][0].replace('/content/drive/MyDrive/project3/data/traindata/raw_image/', '').split('/')[1]

'갈비구이'

In [85]:
df['middle_class'] = df.apply(lambda row: row['file_path'].replace('/content/drive/MyDrive/project3/data/traindata/raw_image/', '').split('/')[0]
                              if row['type'] == 'raw'
                              else row['file_path'].replace('/content/drive/MyDrive/project3/data/traindata/crop_image/', '').split('/')[0], axis=1)
df['small_class'] = df.apply(lambda row: row['file_path'].replace('/content/drive/MyDrive/project3/data/traindata/raw_image/', '').split('/')[1]
                              if row['type'] == 'raw'
                              else row['file_path'].replace('/content/drive/MyDrive/project3/data/traindata/crop_image/', '').split('/')[1], axis=1)
df['food_class'] = df['middle_class'] + '/' + df['small_class']
df['group_number'] = df.groupby('food_class').cumcount() + 1
df.head()

Unnamed: 0,file_path,type,middle_class,small_class,food_class,group_number
0,/content/drive/MyDrive/project3/data/traindata...,raw,구이,갈비구이,구이/갈비구이,1
1,/content/drive/MyDrive/project3/data/traindata...,raw,구이,갈비구이,구이/갈비구이,2
2,/content/drive/MyDrive/project3/data/traindata...,raw,구이,갈비구이,구이/갈비구이,3
3,/content/drive/MyDrive/project3/data/traindata...,raw,구이,갈비구이,구이/갈비구이,4
4,/content/drive/MyDrive/project3/data/traindata...,raw,구이,갈비구이,구이/갈비구이,5


In [86]:
middle_class = df['middle_class'].unique().tolist()
small_class = df['small_class'].unique().tolist()
food_class = df['food_class'].unique().tolist()
print(food_class)
print(len(food_class))

['구이/갈비구이', '구이/고등어구이', '구이/곱창구이', '구이/닭갈비', '구이/더덕구이', '구이/갈치구이', '구이/떡갈비', '구이/불고기', '구이/삼겹살', '구이/장어구이', '구이/조개구이', '구이/조기구이', '구이/황태구이', '구이/훈제오리', '국/계란국', '국/떡국_만두국', '국/무국', '국/미역국', '국/북엇국', '국/시래기국', '국/육개장', '국/콩나물국', '기타/과메기', '기타/양념치킨', '기타/젓갈', '기타/콩자반', '기타/편육', '기타/피자', '기타/후라이드치킨', '김치/갓김치', '김치/깍두기', '김치/나박김치', '김치/무생채', '김치/배추김치', '김치/백김치', '김치/부추김치', '김치/열무김치', '김치/오이소박이', '김치/총각김치', '김치/파김치', '나물/가지볶음', '나물/고사리나물', '나물/미역줄기볶음', '나물/숙주나물', '나물/시금치나물', '나물/애호박볶음', '떡/경단', '떡/꿀떡', '떡/송편', '만두/만두', '면/라면', '면/막국수', '면/물냉면', '면/비빔냉면', '면/수제비', '면/열무국수', '며

In [87]:
## Crop영역 정보를 dictionary로
def crop_area_dict(raw_image_path):
    crop_file_name = 'crop_area.properties'
    crop_file_path = os.path.join(raw_image_path, crop_file_name)

    crop_areas = {}
    with open(crop_file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
        for line in lines:
            key, value = line.strip().split('=')
            try:
                coordinates = list(map(int, value.split(',')))
                crop_areas[key] = coordinates
            # crop coordinate가 오류가 발생하는 경우 coordinate를 None으로
            except ValueError:
                crop_areas[key] = None
    return crop_areas

In [88]:
dir = '/content/drive/MyDrive/project3/data/traindata/raw_image'
food_middle_list = sorted(os.listdir(dir))

food_list = [] # 음식이름
food_path_list = [] # 음식이름 경로

for food_middle in food_middle_list:
    middle_path = os.path.join(dir,food_middle)
    food_name_list = os.listdir(middle_path)

    for food in food_name_list:
        path = os.path.join(middle_path,food)
        food_path_list.append(path)

        name = path.split('/')[-1]
        food_list.append(name)



In [89]:
crop_area = {}
for food_path in food_path_list:
    temp_crop_area = crop_area_dict(food_path)

    if food_path == '/content/drive/MyDrive/project3/data/traindata/raw_image/한과/약식':
        temp_crop_area_updated = {}
        for key, value in temp_crop_area.items():
            if key.startswith('Img_145_'):
                new_key = key.replace('Img_145_', 'Img_144_', 1)
                temp_crop_area_updated[new_key] = value
            else:
                temp_crop_area_updated[key] = value
        temp_crop_area = temp_crop_area_updated

    crop_area.update(temp_crop_area)



In [90]:
print(len(crop_area))

52068


In [91]:
df_crop_area = pd.DataFrame(crop_area.items(), columns=['file_name', 'crop_area'])
df_crop_area.head()

Unnamed: 0,file_name,crop_area
0,Img_003_0008,"[43, 0, 461, 144]"
1,Img_003_0028,"[252, 38, 323, 197]"
2,Img_003_0039,"[69, 8, 453, 211]"
3,Img_003_0066,"[163, 122, 313, 183]"
4,Img_003_0087,"[80, 61, 371, 337]"


In [92]:
temp = df[df['type'] == 'crop']
temp.reset_index(drop=True, inplace=True)
temp.head()

Unnamed: 0,file_path,type,middle_class,small_class,food_class,group_number
0,/content/drive/MyDrive/project3/data/traindata...,crop,구이,갈비구이,구이/갈비구이,75
1,/content/drive/MyDrive/project3/data/traindata...,crop,구이,갈비구이,구이/갈비구이,78
2,/content/drive/MyDrive/project3/data/traindata...,crop,구이,갈비구이,구이/갈비구이,79
3,/content/drive/MyDrive/project3/data/traindata...,crop,구이,갈비구이,구이/갈비구이,80
4,/content/drive/MyDrive/project3/data/traindata...,crop,구이,갈비구이,구이/갈비구이,81


In [93]:
temp['file_path'][0]

'/content/drive/MyDrive/project3/data/traindata/crop_image/구이/갈비구이/Img_000_0581_crop.jpg'

In [94]:
temp['file_path'][0].split('/')[-1]

'Img_000_0581_crop.jpg'

In [95]:
test_key = temp['file_path'][0].split('/')[-1].split('.')[0].replace('_crop', '')
test_key

'Img_000_0581'

In [96]:
crop_area[test_key]

[255, 119, 123, 157]

In [97]:
df['file_name_key'] = df['file_path'].apply(lambda x: x.split('/')[-1].split('.')[0].replace('_crop', ''))


In [98]:
merged_df = df.merge(df_crop_area,
                     how='left',
                     left_on='file_name_key',
                     right_on='file_name')
merged_df.head()

Unnamed: 0,file_path,type,middle_class,small_class,food_class,group_number,file_name_key,file_name,crop_area
0,/content/drive/MyDrive/project3/data/traindata...,raw,구이,갈비구이,구이/갈비구이,1,Img_000_0308,,
1,/content/drive/MyDrive/project3/data/traindata...,raw,구이,갈비구이,구이/갈비구이,2,Img_000_0438,,
2,/content/drive/MyDrive/project3/data/traindata...,raw,구이,갈비구이,구이/갈비구이,3,Img_000_0134,,
3,/content/drive/MyDrive/project3/data/traindata...,raw,구이,갈비구이,구이/갈비구이,4,Img_000_0972,,
4,/content/drive/MyDrive/project3/data/traindata...,raw,구이,갈비구이,구이/갈비구이,5,Img_000_0278,,


In [99]:
merged_df['crop_area'].notna().sum()

51353

In [100]:
(merged_df['type'] == 'crop').sum()


51353

In [101]:
merged_df.drop(columns=['file_name', 'file_name_key'], inplace=True)
merged_df.head()

Unnamed: 0,file_path,type,middle_class,small_class,food_class,group_number,crop_area
0,/content/drive/MyDrive/project3/data/traindata...,raw,구이,갈비구이,구이/갈비구이,1,
1,/content/drive/MyDrive/project3/data/traindata...,raw,구이,갈비구이,구이/갈비구이,2,
2,/content/drive/MyDrive/project3/data/traindata...,raw,구이,갈비구이,구이/갈비구이,3,
3,/content/drive/MyDrive/project3/data/traindata...,raw,구이,갈비구이,구이/갈비구이,4,
4,/content/drive/MyDrive/project3/data/traindata...,raw,구이,갈비구이,구이/갈비구이,5,


In [102]:
merged_df.to_csv('/content/drive/MyDrive/project3/data/traindata/read_image.csv', index=False)
merged_df.to_pickle('/content/drive/MyDrive/project3/data/traindata/read_image.pkl')