In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px


# Make "seaborn" the default template for the plotly express figures built below.
px.defaults.template = "seaborn"

# Root directory of the LogoDet-3K dataset.
# Set the LOGODET3K_PATH environment variable to point the notebook at another
# location; when it is unset the original author's local path is used, so
# existing behaviour is unchanged.
dataset_path = Path(
    os.environ.get('LOGODET3K_PATH', '/Users/gianlucagiudice/Desktop/LogoDet-3K')
)

In [None]:
import os

# Walk the two-level layout <dataset>/<category>/<sub_category> and record
# every sub-category directory as a string path; plain files are ignored
# at both levels.
sub_category_list = []

for category in os.listdir(dataset_path):
    category_dir = dataset_path / category
    if category_dir.is_dir():
        for sub_category in os.listdir(category_dir):
            path = category_dir / sub_category
            if path.is_dir():
                sub_category_list.append(str(path))


In [None]:
# Report how many sub-category directories were collected.
n_sub_categories = len(sub_category_list)
print(f"Number of directory: {n_sub_categories}")

In [None]:
# One Path per '.xml' file found directly inside each sub-category directory.
metadata_list = [
    Path(category, metadata)
    for category in sub_category_list
    for metadata in os.listdir(category)
    if Path(metadata).suffix == '.xml'
]

In [None]:
# Show the first ten annotation paths as a quick sanity check.
metadata_preview = metadata_list[:10]
print(metadata_preview)

In [None]:
from tqdm import tqdm

import xml.etree.ElementTree as ET


# Flat list with one entry per <object> element across all annotation files:
# the text of that object's <name> child.
brands = []
for metadata in tqdm(metadata_list, total=len(metadata_list)):
    annotation_root = ET.parse(metadata).getroot()
    brands.extend(
        [obj.find('name').text for obj in annotation_root.findall('object')]
    )


In [None]:
# Total annotated objects versus the number of distinct brand names.
total_logos = len(brands)
unique_brand_count = len(set(brands))
print(f"Number of cropped logos: {total_logos}")
print(f"Number of unique brands: {unique_brand_count}")


In [None]:
# Single-column frame with one row per annotated object.
brand_columns = ['brand']
df = pd.DataFrame(data=brands, columns=brand_columns)
df.head(n=5)

### Category statistics

In [None]:
# Load the two metadata frames from their pickle files (cropped first, then full).
# NOTE(review): unpickling executes arbitrary code — presumably these files are
# produced by this project; confirm before loading pickles from elsewhere.
df_metadata_cropped, df_metadata_full = (
    pd.read_pickle(pickle_file)
    for pickle_file in (
        'dataset/LogoDet-3K/metadata_cropped_images.pickle',
        'dataset/LogoDet-3K/metadata_full_images.pickle',
    )
)

In [None]:
print("Number of images per category")
# Non-null 'new_path' entries per category.
images_per_category = (
    df_metadata_cropped
    .groupby(by=['category'])['new_path']
    .count()
)
images_per_category

In [None]:
# Number of distinct brands within each category.
(
    df_metadata_cropped
    .groupby(by=['category'])['brand']
    .nunique()
)

In [None]:
# Number of distinct brand values in the cropped-image metadata.
distinct_brands = df_metadata_cropped['brand'].unique()
len(sorted(distinct_brands))

### Dataset statistics

In [None]:
# Histogram of annotated objects per brand, bars ordered by descending sum.
fig = px.histogram(df, x="brand")

# Axis ordering and slanted tick labels are set in one call.
fig.update_xaxes(categoryorder='sum descending', tickangle=45)

fig.update_layout(
    title_text='Number of objects',  # plot title, centred by title_x
    title_x=0.5,
    xaxis_title_text='Brand',
    yaxis_title_text='Count',
)

# Save a JPEG copy before displaying the figure.
fig.write_image("freq.jpeg", scale=3)
fig.show()

In [None]:
import collections
# Occurrences of each brand name across all annotated objects.
freq = collections.Counter(brands)

# The column is called 'brand' but holds the per-brand counts; the x-axis
# title is overridden to 'Count' in the layout below.
brand_counts = pd.DataFrame(freq.values(), columns=['brand'])
fig = px.box(data_frame=brand_counts, x='brand', orientation='h')

fig.update_layout(
    title_text='Brands frequencies boxplot',
    title_x=0.5,
    xaxis={
        'title': 'Count',
        'tickmode': 'linear',
        'tick0': 0,
        'dtick': 15,
        # NOTE(review): fixed axis range — counts above 550 would fall outside
        # the visible area; confirm this matches the data.
        'range': [0, 550],
    },
    yaxis={'title': ''},
)
fig.write_image("box_plot.jpeg", scale=3)
fig.show()

In [None]:
import numpy as np
# Quartiles and maximum (q = 1) of the per-brand object counts.
quantile_levels = [0.25, 0.50, 0.75, 1]
np.quantile(sorted(freq.values()), q=quantile_levels)