# Data prep notebook for LogoDet-3K using Rikai
Paper: https://arxiv.org/pdf/2008.05359.pdf <br/>
Data: https://github.com/Wangjing1551/LogoDet-3K-Dataset

## Load data

In [None]:
# Download the LogoDet-3K archive from the eto-public S3 bucket into the working directory.
!curl https://eto-public.s3.us-west-2.amazonaws.com/LogoDet-3K.zip --output LogoDet-3K.zip

In [None]:
# Extract quietly (-q), overwriting existing files without prompting (-o),
# then rename the LogoDet-3K directory to `data`.
# NOTE(review): if `data` already exists, `mv` presumably moves LogoDet-3K
# inside it instead of replacing it — confirm before re-running this cell.
!unzip -qo LogoDet-3K.zip
!mv LogoDet-3K data

In [None]:
# Check that the directory looks good and total size is about 4GB
# `ls` lists the top-level entries; `du -csh` prints a summarized,
# human-readable size for `data` plus a grand total.
!ls data
!du -csh data

In [None]:
# List the files of one brand directory (data/<category>/<brand>/) as a spot check.
!ls data/Clothes/Gap/

In [None]:
# Print one annotation file to see the XML structure that is parsed later on.
!cat data/Clothes/Gap/1.xml

In [None]:
# Render one sample image inline as the cell output.
# The display class is imported under the alias IPyImage, presumably to keep
# it distinct from rikai.types.Image, which is imported in a later cell.
from IPython.display import Image as IPyImage
IPyImage(filename='data/Clothes/Gap/1.jpg') 

In [None]:
# Verify that we only have jpg and xml files:
# gather every distinct file suffix under data/<category>/<brand>/.
from pathlib import Path

data = Path('data')
ext = {
    entry.suffix
    for cat_dir in data.iterdir()
    for brand_dir in cat_dir.iterdir()
    for entry in brand_dir.iterdir()
}
ext

## Create Rikai dataset

In [None]:
# Install xmltodict, which is used below to parse the XML annotation files.
!pip install xmltodict

In [None]:
from pathlib import Path
import xmltodict
from rikai.types import Image, Box2d

data = Path('data')

def process_dataset(data_dir):
    """Build one row per image found under ``data_dir/<category>/<brand>/``.

    Each row holds the category and brand (taken from the directory names),
    a relative ``link`` to the image and the embedded image itself. When a
    sibling ``<stem>.xml`` annotation file exists, the fields returned by
    ``process_ann`` are merged into the row; otherwise the row carries only
    the four base fields.

    Args:
        data_dir: ``pathlib.Path`` of the dataset root directory.

    Returns:
        A list of dicts, one per ``*.jpg`` file found.
    """
    rows = []
    for cat_dir in data_dir.iterdir():
        # Stray files at this level (e.g. .DS_Store) are not categories and
        # calling iterdir() on them would raise NotADirectoryError.
        if not cat_dir.is_dir():
            continue
        category = cat_dir.name
        for brand_dir in cat_dir.iterdir():
            if not brand_dir.is_dir():
                continue
            brand = brand_dir.name
            for img_file in brand_dir.glob('*.jpg'):
                ann_file = img_file.with_suffix('.xml')
                # Path.suffix already contains the leading dot, so joining
                # stem and suffix with another '.' produced '1..jpg'.
                # Path.name is the file name including its extension.
                link = f'./{category}/{brand}/{img_file.name}'
                row_dd = {
                    'category': category,
                    'brand': brand,
                    'link': link,
                    'image': Image(img_file).to_embedded(),
                }
                if ann_file.exists():
                    with open(ann_file) as xf:
                        ann_dd = xmltodict.parse(xf.read())
                    row_dd.update(process_ann(ann_dd['annotation']))
                rows.append(row_dd)
    return rows

In [None]:
def process_ann(annotations):
    """Flatten a parsed ``<annotation>`` element into row fields.

    Args:
        annotations: the mapping produced by ``xmltodict`` for the
            ``<annotation>`` element. ``object`` may be absent, a single
            mapping (one object) or a list of mappings (several objects).

    Returns:
        A dict with ``verified``, ``folder``, ``width``, ``height``,
        ``depth``, ``segmented`` and ``annotations``. ``annotations`` is a
        list with one entry per object holding ``truncated``, ``bbox`` and
        ``label``.

    Raises:
        KeyError: if ``folder``, ``size`` or ``segmented`` is missing, or an
            object lacks ``truncated``, ``bndbox`` or ``name``.
    """
    # A file without any <object> has no 'object' key; treat it as empty
    # instead of failing the whole dataset build.
    objects = annotations.get('object') or []
    if isinstance(objects, dict):
        # A single <object> is parsed as a mapping rather than a list.
        objects = [objects]
    return {
        'verified': annotations.get('@verified') == 'yes',
        'folder': annotations['folder'],
        'width': int(annotations['size']['width']),
        'height': int(annotations['size']['height']),
        'depth': int(annotations['size']['depth']),
        # Flags are assumed to follow the Pascal VOC convention ('1' = set,
        # '0' = not set); comparing against '0' inverted their meaning.
        'segmented': annotations['segmented'] == '1',
        'annotations': [{
            'truncated': obj['truncated'] == '1',
            # bndbox children are converted to float and passed to Box2d
            # as keyword arguments.
            'bbox': Box2d(**{k: float(v) for k, v in obj['bndbox'].items()}),
            'label': obj['name']
        } for obj in objects]
    }

In [None]:
# Collect all rows from the `data` directory into a pandas DataFrame.
# Each row carries an embedded image, so the whole dataset is held in memory here.
import pandas as pd

pdf = pd.DataFrame(process_dataset(data))

In [None]:
# Display the pandas DataFrame as the cell output.
pdf

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as F
from rikai.spark.utils import get_default_jar_version
from rikai.spark.types import *

# Start (or reuse) a local Spark session using all cores, with the Rikai
# package for the default jar version (snapshots allowed) on the classpath.
version = get_default_jar_version(use_snapshot=True)
rikai_package = f"ai.eto:rikai_2.12:{version}"

builder = SparkSession.builder.appName('rikai-quickstart')
builder = builder.config('spark.jars.packages', rikai_package)
spark = builder.master('local[*]').getOrCreate()

In [None]:
# Struct stored for every annotated object of an image.
annotation_type = StructType([
    StructField('truncated', BooleanType(), True),
    StructField('bbox', Box2dType(), False),
    StructField('label', StringType(), False),
])

# (name, type, nullable) for each column. The fields that come from the XML
# annotation are nullable because an image may have no annotation file.
columns = [
    ('category', StringType(), False),
    ('brand', StringType(), False),
    ('link', StringType(), False),
    ('image', ImageType(), False),
    ('verified', BooleanType(), True),
    ('folder', StringType(), True),
    ('width', IntegerType(), True),
    ('height', IntegerType(), True),
    ('depth', IntegerType(), True),
    ('segmented', BooleanType(), True),
    ('annotations', ArrayType(annotation_type), True),
]
schema = StructType([
    StructField(name, dtype, nullable) for name, dtype, nullable in columns
])

# Convert the pandas DataFrame to a Spark DataFrame and inspect it.
df = spark.createDataFrame(pdf, schema)
df.printSchema()
df.show()

In [None]:
# Create the output directory if needed, then write the DataFrame in the
# 'rikai' format as 20 partitions, overwriting any previous output at that path.
!mkdir -p ./rikai
df.repartition(20).write.format('rikai').mode('overwrite').save('./rikai/logo3k')