# Data EDA(Object Detection관점)

In [None]:
from pycocotools.coco import COCO

import numpy as np
from pandas import DataFrame as df

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="darkgrid")

In [1]:
# Load the COCO-format training annotations into a pycocotools index.
coco = COCO('/opt/ml/input/data/train.json')

# bbox EDA

In [3]:
# All image ids registered in the annotation file.
image_ids = coco.getImgIds()

In [2]:
len(image_ids) # total number of images

In [5]:
# Use the first image id as the sample for the inspection cells below.
image_id = image_ids[0]

In [3]:
# Load the metadata record of a single image
image_infos = coco.loadImgs(image_id)[0]
print(image_infos)

In [4]:
# Same lookup without the [0] indexing, to see the raw return value.
coco.loadImgs(image_id)

In [5]:
# Fetch the annotations of the image whose id is 0
# (the id is hard-coded here rather than taken from `image_id`).
ann_ids = coco.getAnnIds(imgIds =0)
anns = coco.loadAnns(ann_ids)
len(anns)

In [6]:
# Bounding box entry of the first annotation.
print(anns[0]['bbox'])

In [9]:
# Check which Python type holds the segmentation entry.
print(type(anns[0]['segmentation']))

In [10]:
# All category ids defined in the dataset.
cat_ids = coco.getCatIds()
cat_ids

In [23]:
# Category records for those ids.
cats = coco.loadCats(cat_ids)

In [24]:
# Filled with the category names in the next cell.
category_name_list = []

In [25]:
# Collect the name of every category, in the order of `cats`.
category_name_list.extend(cat['name'] for cat in cats)

In [11]:
# Display the collected category names.
category_name_list 

In [27]:
# Walk over every image and record how large each category's boxes are.

# One list of bbox areas per class, indexed by category id.
class_bbox_scale_list = [[] for _ in range(11)]

for image_id in image_ids:

    # Look up the image record and read its id back from it.
    image_infos = coco.loadImgs(image_id)[0]
    image_id = image_infos['id']

    # Load every annotation attached to this image.
    ann_ids = coco.getAnnIds(imgIds=image_id)
    anns = coco.loadAnns(ann_ids)

    # File each box area under the annotation's category.
    for ann in anns:
        box = ann['bbox']
        # Elements 2 and 3 of the bbox are multiplied to get the area
        # (presumably width and height, as in the COCO bbox layout).
        class_bbox_scale_list[ann['category_id']].append(box[2] * box[3])

In [12]:
# Total area of one image in pixels
# (assumes every image is 512 x 512 -- TODO confirm against the dataset).
print(512 * 512)

## class별 bbox개수의 분포도

In [29]:
# Number of bounding boxes collected for each of the 11 classes.
class_bbox_count_list = [
    len(class_bbox_scale_list[class_id]) for class_id in range(11)
]

In [13]:
# Display the per-class bounding-box counts.
class_bbox_count_list

In [14]:
# Two-column table pairing each category name with its bbox count.
count_table = {"category": category_name_list, "count": class_bbox_count_list}
bbox_stat = df(data=count_table, columns=["category", "count"])
bbox_stat.head()

In [15]:
# Bar chart of the number of bounding boxes per category.
# The keyword was misspelled as `fcigsize`; matplotlib rejects unknown figure
# keywords, so the cell failed before drawing anything.
fig, ax1 = plt.subplots(1, 1, figsize=(30, 6))

sns.barplot(x="category", y="count", data=bbox_stat, ax=ax1)

In [33]:
# Class names in category-id order (index == category id), as hand-listed for this dataset.
# The last entry was misspelled 'Cloating' and showed up that way in every plot label.
class_names_by_id = (
    'UNKNOWN',
    'General trash',
    'Paper',
    'Paper pack',
    'Metal',
    'Glass',
    'Plastic',
    'Styrofoam',
    'Plastic bag',
    'Battery',
    'Clothing',
)

# Repeat each class name once per bounding box of that class, so the labels
# line up row-for-row with the per-class value lists once those are flattened
# in class-id order.
category_name_list_all = []
for class_id, class_name in enumerate(class_names_by_id):
    category_name_list_all.extend([class_name] * class_bbox_count_list[class_id])

In [34]:
# Sanity check: there must be exactly one label per counted bounding box.
assert len(category_name_list_all) == sum(class_bbox_count_list)

## class별 box size의 현황 분포 분석

In [35]:
# Flatten the per-class area lists into one list, keeping class-id order.
class_bbox_scale_list_all = []
for per_class_areas in class_bbox_scale_list:
    class_bbox_scale_list_all.extend(per_class_areas)

In [16]:
# Long-format table: one row per bounding box with its category label and area.
size_table = {
    "category": category_name_list_all,
    "bbox_size": class_bbox_scale_list_all,
}
bbox_stat = df(data=size_table, columns=["category", "bbox_size"])

bbox_stat.head()

In [17]:
# Box plot of the bbox size distribution for each category.
fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(20, 10))
sns.boxplot(
    data=bbox_stat,
    x="bbox_size",
    y="category",
    palette="muted",
    ax=ax1,
)

In [42]:
# Pull out the bbox areas of the unknown (index 0) and Battery (index 9)
# classes as numpy arrays, to compute exact statistics on them.
unknown_areas = class_bbox_scale_list[0]
battery_areas = class_bbox_scale_list[9]

unk_area_np_list = np.array(unknown_areas)
battery_area_np_list = np.array(battery_areas)

In [18]:
# Summary statistics of the unknown-class bbox areas.
print("unknown bbox 크기에 대한 정보")

# (label, reducer) pairs, printed in this order.
summary_stats = (
    ("count : ", len),
    ("mean of len : ", np.mean),
    ("std of len : ", np.std),
    ("max of len : ", np.max),
    ("min of len : ", np.min),
)
for label, reducer in summary_stats:
    print(label, reducer(unk_area_np_list))

# 25th / 50th / 75th percentiles of the area distribution.
quartile_templates = (
    ('제 1사분위 크기 : {}', 25),
    ('제 2사분위 크기 : {}', 50),
    ('제 3사분위 크기 : {}', 75),
)
for template, percent in quartile_templates:
    print(template.format(np.percentile(unk_area_np_list, percent)))

In [19]:
# Summary statistics of the Battery-class bbox areas.
print("battery bbox 크기에 대한 정보")

# (label, reducer) pairs, printed in this order.
summary_stats = (
    ("count : ", len),
    ("mean of len : ", np.mean),
    ("std of len : ", np.std),
    ("max of len : ", np.max),
    ("min of len : ", np.min),
)
for label, reducer in summary_stats:
    print(label, reducer(battery_area_np_list))

# 25th / 50th / 75th percentiles of the area distribution.
quartile_templates = (
    ('제 1사분위 크기 : {}', 25),
    ('제 2사분위 크기 : {}', 50),
    ('제 3사분위 크기 : {}', 75),
)
for template, percent in quartile_templates:
    print(template.format(np.percentile(battery_area_np_list, percent)))

## class별 bbox & Segmentation의 비율 계산

In [49]:
# Walk over every image and, per category, record the ratio between each
# annotation's 'area' field (presumably the segmentation area) and the area
# of its bounding box.

# One list of ratios per class, indexed by category id.
class_bbox_segmentation_ratio_list = [[] for _ in range(11)]

for image_id in image_ids:

    # Look up the image record and read its id back from it.
    image_infos = coco.loadImgs(image_id)[0]
    image_id = image_infos['id']

    # Load every annotation attached to this image.
    ann_ids = coco.getAnnIds(imgIds=image_id)
    anns = coco.loadAnns(ann_ids)

    # File each ratio under the annotation's category.
    for ann in anns:
        box = ann['bbox']
        # NOTE(review): if elements 2 and 3 of the bbox multiply to zero this
        # division raises ZeroDivisionError.
        ratio = ann['area'] / (box[2] * box[3])
        class_bbox_segmentation_ratio_list[ann['category_id']].append(ratio)

In [50]:
# Number of ratio entries collected for each of the 11 classes.
class_bbox_count_list = [
    len(class_bbox_segmentation_ratio_list[class_id]) for class_id in range(11)
]

In [20]:
class_bbox_count_list # the counts match the earlier per-class bbox counts

In [52]:
# Flatten the per-class ratio lists into one list, keeping class-id order.
class_bbox_segmentation_ratio_list_all = []
for per_class_ratios in class_bbox_segmentation_ratio_list:
    class_bbox_segmentation_ratio_list_all.extend(per_class_ratios)

In [21]:
# Long-format table: one row per bounding box with its category label and
# its annotation-area / bbox-area ratio.
ratio_table = {
    "category": category_name_list_all,
    "bbox_seg_ratio": class_bbox_segmentation_ratio_list_all,
}
bbox_seg_stat = df(data=ratio_table, columns=["category", "bbox_seg_ratio"])

bbox_seg_stat.head()

In [22]:
# Box plot of the annotation-area / bbox-area ratio distribution for each category.
fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(20, 10))
sns.boxplot(
    data=bbox_seg_stat,
    x="bbox_seg_ratio",
    y="category",
    palette="muted",
    ax=ax1,
)

## coco 라이브러리 연습코드

In [22]:
# Query image ids filtered by id 0; the result is indexed with [0] after
# being passed to loadImgs below.
image_id = coco.getImgIds(imgIds=0)

In [23]:
image_id

In [29]:
# Metadata record of the first image returned for that query.
image_infos = coco.loadImgs(image_id)[0]

In [24]:
image_infos

In [98]:
# Annotation ids attached to the image with id 9 (hard-coded).
ann_ids = coco.getAnnIds(imgIds=9)

In [25]:
ann_ids # annotation ids belonging to the image

In [33]:
# Full annotation records for those ids.
anns = coco.loadAnns(ann_ids)

In [26]:
len(anns)

In [38]:
# anns[1] # inspect the contents of a single annotation

In [41]:
# All category ids defined in the dataset.
cat_ids = coco.getCatIds()

In [27]:
cat_ids

In [43]:
# Category records for those ids.
cats = coco.loadCats(cat_ids)

In [28]:
len(cats)

In [29]:
cats[0]

In [46]:
%matplotlib inline
from pycocotools.coco import COCO
import numpy as np
import skimage.io as io
import matplotlib.pyplot as plt
import pylab


In [30]:
# Rebind `coco` to the validation annotations; cells below query this file.
coco = COCO('/opt/ml/input/data/val.json')

In [31]:
# All category ids defined in the validation file.
cat_ids = coco.getCatIds()
print(cat_ids)

In [50]:
# Category records for those ids.
cats = coco.loadCats(cat_ids)

In [32]:
cats

In [52]:
# Category names, in the order of `cats`.
nms = [cat['name'] for cat in cats]

In [33]:
# Print all category names on one space-separated line.
print('COCO categories: \n{}\n'.format(' '.join(nms)))

In [34]:
# Distinct supercategory names across all categories, printed on one line.
nms = {cat['supercategory'] for cat in cats}
print('COCO supercategories: \n{}'.format(' '.join(nms)))

In [123]:
# Category ids whose name is 'Battery'.
catIds = coco.getCatIds(catNms = ['Battery'])

In [35]:
catIds

In [125]:
# Ids of the images returned when filtering by those category ids.
imgIds = coco.getImgIds(catIds= catIds)

In [36]:
imgIds

In [127]:
# Metadata records of the first two such images
# (raises IndexError if fewer than two images were returned).
image_infos1 = coco.loadImgs(imgIds[0])[0]
image_infos2 = coco.loadImgs(imgIds[1])[0]

In [37]:
image_infos1

In [38]:
image_infos2

In [39]:
# Directory that the 'file_name' entries of the image records are appended to.
root_path = "/opt/ml/input/data/"

# Read and display the first Battery image without axes.
I = io.imread(root_path + image_infos1['file_name'])
plt.axis('off')
plt.imshow(I)
plt.show()

In [40]:
root_path = "/opt/ml/input/data/"

# Read and display the second Battery image without axes.
I = io.imread(root_path + image_infos2['file_name'])
plt.axis('off')
plt.imshow(I)
plt.show()

In [41]:
# Re-run the Battery category / image id queries and print both results.
catIds = coco.getCatIds(catNms=['Battery']);
imgIds = coco.getImgIds(catIds=catIds);
print(catIds)
print(imgIds)

In [42]:
# First Battery image: show the raw picture ...
I = io.imread(root_path + image_infos1['file_name'])
plt.axis('off')
plt.imshow(I)
plt.show()

# ... then show it again with its annotations for `catIds` drawn on top.
plt.imshow(I)
plt.axis('off')
annIds = coco.getAnnIds(imgIds=image_infos1['id'], catIds = catIds,iscrowd=None)
anns = coco.loadAnns(annIds)
coco.showAnns(anns)

In [43]:
# Second Battery image: show the raw picture ...
I = io.imread(root_path + image_infos2['file_name'])
plt.axis('off')
plt.imshow(I)
plt.show()

# ... then show it again with its annotations for `catIds` drawn on top.
plt.imshow(I)
plt.axis('off')
annIds = coco.getAnnIds(imgIds=image_infos2['id'], catIds = catIds,iscrowd=None)
anns = coco.loadAnns(annIds)
coco.showAnns(anns)

In [111]:
# Annotation ids of the image with id 19 (hard-coded), no iscrowd filtering.
annIds = coco.getAnnIds(imgIds=19,iscrowd=None)

In [44]:
annIds

In [113]:
# Annotation ids of image 9 restricted to category id 9 (both hard-coded).
annIds = coco.getAnnIds(imgIds=9,catIds = [9],iscrowd=None)

In [45]:
annIds

In [46]:
# Category ids whose name is 'UNKNOWN'.
catIds = coco.getCatIds(catNms = ['UNKNOWN'])
catIds

In [47]:
# Ids of the images returned when filtering by those category ids.
imgIds = coco.getImgIds(catIds= catIds)
imgIds

In [151]:
# Image records for up to the first ten ids in `imgIds`.
# Slicing replaces indexing 0..9 one by one, which raised IndexError whenever
# fewer than ten images matched. loadImgs is given the list of ids and
# returns the matching list of records.
image_infos_list = coco.loadImgs(imgIds[:10])


In [48]:

# For each collected image: show the raw picture, then show it again with its
# annotations for `catIds` drawn on top.
# Iterating the list directly (instead of indexing with range(10)) keeps the
# loop working for however many records were collected.
for image_info in image_infos_list:
    I = io.imread(root_path + image_info['file_name'])
    plt.axis('off')
    plt.imshow(I)
    plt.show()

    plt.imshow(I)
    plt.axis('off')
    annIds = coco.getAnnIds(imgIds=image_info['id'], catIds=catIds, iscrowd=None)
    anns = coco.loadAnns(annIds)
    coco.showAnns(anns)
    plt.show()

## validation set 구성 연습(사용하지는 않음)

In [49]:
# Load the validation annotations again for the experiments below.
coco = COCO('/opt/ml/input/data/val.json')

In [50]:
# Show the interactive help for the COCO object.
help(coco)

In [51]:
# NOTE(review): pycocotools' COCO.info() appears to print the info section
# and return nothing, in which case `temp` is None -- confirm.
temp = coco.info()

In [173]:
temp

In [52]:
print(temp)

In [175]:
import json
from sklearn.utils import Bunch

In [191]:
# Read the validation annotation file as plain JSON.
with open('/opt/ml/input/data/val.json') as f:
    hparams = json.load(f)

In [204]:
# Copy the parsed JSON into a Bunch so its top-level keys can be read as attributes.
config = Bunch()
config.update(hparams)

In [53]:
config.info

In [206]:
# config

In [54]:
type(config)

In [55]:
type(config.info)

In [209]:
# Start a new nested dict that carries only the 'info' section.
new_json = {}
new_json['root'] = {}
new_json['root']['info'] = config.info

In [56]:
new_json

In [211]:
# dictionary를 json으로 변환

# json_val = json.dumps(new_json)

In [212]:
# type(json_val)

In [219]:
# Write the reduced annotation dict to disk as JSON.
# The original dumped `json_val`, whose only assignment is commented out, so
# the cell raised NameError. Dumping the `new_json` dict directly also avoids
# JSON-encoding an already-serialized string a second time.
with open("/opt/ml/test.json", 'w') as outfile:
    json.dump(new_json, outfile)

In [228]:
# # 방법1) json.dumps
# dic1 = {'ramyeon':{'a' : 1, 'b' : 2},'noodle':['ramyeon','ramen']}
# obj1 = json.dumps(dic1)

# print(type(obj1))""
# obj1

In [229]:
# with open('/opt/ml/test.json','w') as f:
#     json.dump(dic1,f)

In [57]:
# Keep only the 'info', 'licenses' and 'categories' sections of the parsed file.
dict1 = {'info' : config.info, 'licenses': config.licenses, 'categories' : config.categories}
# Serialize to a JSON string to inspect the result.
obj1 = json.dumps(dict1)

print(type(obj1))
obj1

In [247]:
# Write the reduced dict to disk (overwrites any existing /opt/ml/test.json).
with open('/opt/ml/test.json','w') as f:
    json.dump(dict1,f)

In [58]:
# All category ids known to the current `coco` object.
cat_ids = coco.getCatIds()
print(cat_ids)

In [249]:
# Category records for those ids.
cats = coco.loadCats(cat_ids)

In [59]:
cats

In [252]:
# Category names, in the order of `cats`.
nms = [cat['name'] for cat in cats]

In [60]:
nms