In [None]:
import os
import ipysheet

import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from tqdm.auto import tqdm
from functools import partial
from ipywidgets import Label, BoundedIntText, IntProgress, Image, Button, HBox, VBox, Layout
from IPython.display import display, clear_output

tqdm.pandas()

In [None]:
class Batcher:
    """Cursor over a dataset that is paged through in fixed-size batches.

    Attributes are plain integers that callers read (and sometimes set)
    directly:

    * ``skip``    - row offset of the first row of the current batch
    * ``limit``   - maximum number of rows per batch
    * ``count``   - total number of rows being paged over
    * ``current`` - 1-based number of the current batch
    * ``end``     - number of the last batch
    """

    def __init__(self, skip=0, limit=5, count=100):
        self.skip = skip
        self.limit = limit
        self.count = count
        self.current = 1
        # Ceiling division: a trailing partial batch must still be reachable.
        # round(count/limit) rounded *down* for e.g. 12/5 (and 10/4, because
        # of banker's rounding), which hid the last rows. The floor of 1 keeps
        # `current <= end` even for an empty dataset.
        self.end = max(1, -(-count // limit))

    def increment(self):
        """Move to the next batch.

        Returns:
            True if the position changed, False if already on the last batch.
        """
        if self.current >= self.end:
            return False
        self.current += 1
        self.skip += self.limit
        return True

    def decrement(self):
        """Move to the previous batch.

        Returns:
            True if the position changed, False if already on the first batch.
        """
        if self.current <= 1:
            return False
        self.current -= 1
        self.skip -= self.limit
        return True

    def __iter__(self, sentinel=False):
        """Iterate by repeatedly calling ``increment``.

        ``iter(callable, sentinel)`` stops when the callable returns the
        sentinel. Because ``increment`` returns False once the last batch is
        reached, the default sentinel ends the iteration there instead of
        looping forever.
        """
        return iter(self.increment, sentinel)

    
def load_data():
    """Load the review table, building it from the image tree on first use.

    If ``modified.csv`` exists in the working directory it is read and
    returned as-is (its first column is used as the index).

    Otherwise ``data/raw/<split>/<label>/<file>`` is scanned and one row is
    produced per file with the columns ``path``, ``file``, ``split`` and
    ``label``, plus the review columns ``verified`` (initialised to the
    label), ``legible`` (1), ``centered`` (1) and ``difficult`` (0). The
    freshly built table is also written to ``raw.csv``.

    Returns:
        pd.DataFrame: the review table. It is empty (but has all columns)
        when the data tree contains no files.
    """
    if os.path.isfile("modified.csv"):
        return pd.read_csv("modified.csv", index_col=0)

    data_dir = os.path.join('data', 'raw')

    def _visible(directory):
        # Skip macOS Finder metadata; same substring filter at every level.
        return [name for name in os.listdir(directory) if '.DS_Store' not in name]

    # Collect plain records and build the frame once: no quadratic
    # concat-in-a-loop and no reliance on NameError for control flow.
    records = []
    for folder in sorted(_visible(data_dir)):
        for category in _visible(os.path.join(data_dir, folder)):
            category_dir = os.path.join(data_dir, folder, category)
            # A single listing feeds both "file" and "path", so the two
            # columns cannot drift apart.
            for file in _visible(category_dir):
                records.append({
                    "path": os.path.join(category_dir, file),
                    "file": file,
                    "split": folder,
                    "label": category,
                })

    # Explicit columns keep the schema stable even when no files were found.
    df = pd.DataFrame(records, columns=["path", "file", "split", "label"])

    df['verified'] = df['label']
    df['legible'] = 1
    df['centered'] = 1
    df['difficult'] = 0
    df.to_csv("raw.csv")

    return df


def update_dataframe():
    """Write the values shown in the sheet back into ``df`` and save it.

    Reads the module-level ``screen`` and ``batcher``: the sheet is the second
    child of ``screen``, and the rows it corresponds to are
    ``df.iloc[batcher.skip : batcher.skip + batcher.limit]``. Those rows are
    replaced by the sheet's current values, the global ``df`` is rebound to
    the result, and the whole table is written to ``modified.csv``.

    Must be called while ``batcher`` still points at the batch that the sheet
    displays; otherwise the sheet values are paired with the wrong rows.
    """
    global df
    start = batcher.skip
    stop = start + batcher.limit

    sheet = screen.children[1]
    paths = df[['path']].iloc[start:stop]
    # cells[1] is the data range; cells[0] is the preview column
    # (the order in which generate_sheet_with_preview creates them).
    update = pd.DataFrame(sheet.cells[1].value, columns=sheet.column_headers[1:], index=paths.index)
    tmp = pd.concat([paths, update], axis=1)

    # Always splice the edited rows between whatever precedes and follows
    # them. The previous special case for the "last" batch kept only
    # df.iloc[:start] + tmp, which silently dropped every row after the batch
    # whenever batcher.end was reached before the real end of the frame.
    before = df.iloc[:start]
    after = df.iloc[stop:]
    pieces = [tmp]
    if len(before):
        pieces.insert(0, before)
    if len(after):
        pieces.append(after)

    df = pd.concat(pieces, axis=0)
    df.to_csv("modified.csv")


def generate_preview(file: str) -> Image:
    """Build an 80-pixel-wide image widget from the file at ``file``.

    The widget's ``format`` is the text after the last ``.`` in ``file``.

    Args:
        file: Path of the image file to read.

    Returns:
        An ``Image`` widget holding the file's raw bytes.
    """
    # Context manager so the handle is closed promptly; one preview is built
    # per row on every render, and the bare open().read() leaked each handle.
    with open(file, "rb") as handle:
        data = handle.read()
    return Image(value=data, format=file.split(".")[-1], width=80)

def generate_sheet_with_preview(df: pd.DataFrame, skip: int, limit: int, path: str="path") -> ipysheet.sheet:
    """Create a sheet showing one batch of ``df`` with an image preview column.

    Column 0 ("preview") holds an image widget per row, built from the values
    in the ``path`` column. The remaining columns mirror every column of
    ``df`` except the first one.

    Args:
        df: Table to display; its first column is not shown as data.
        skip: Positional index of the first row of the batch.
        limit: Batch size; also the number of rows the sheet is created with.
        path: Name of the column holding the image file paths.

    Returns:
        The newly created sheet.
    """
    data_columns = df.columns[1:]
    headers = ["preview"] + list(data_columns)
    sheet = ipysheet.sheet(rows=limit, columns=len(data_columns)+1, column_headers=headers)

    # Clamp the batch to the end of the frame (the last batch may be short).
    stop = min(skip + limit, len(df))
    batch = df.iloc[skip:stop]

    # Return values are not needed: the cells are read back later through
    # sheet.cells (preview column first, data range second).
    ipysheet.column(0, [generate_preview(image_path) for image_path in batch[path]])
    ipysheet.cell_range(batch[data_columns].to_numpy(), column_start=1)

    return sheet

def prev_callback(counter, w):
    """Click handler for the "Previous" button.

    Saves the visible sheet, steps the global ``batcher`` back one batch and
    redraws the screen.

    Args:
        counter: Object bound via ``functools.partial`` when the handler is
            registered; not used here.
        w: The widget that triggered the click; not used here.
    """
    global screen
    # Persist edits before the sheet is thrown away, as next_callback does;
    # without this, changes made on the current batch were silently lost
    # whenever the user navigated backwards.
    update_dataframe()
    batcher.decrement()
    clear_output()
    screen = render_widgets()
    display(screen)
    
def next_callback(counter, w):
    """Click handler for the "Next" button.

    Persists the visible sheet, advances the global ``batcher`` by one batch
    and replaces the displayed screen with a freshly rendered one.

    Args:
        counter: Object bound via ``functools.partial`` when the handler is
            registered; not used here.
        w: The widget that triggered the click; not used here.
    """
    global screen

    # Save while the batcher still points at the batch being shown.
    update_dataframe()
    batcher.increment()

    # Redraw from scratch for the new position.
    clear_output()
    refreshed = render_widgets()
    screen = refreshed
    display(refreshed)

def on_value_change(change):
    """Observer for the batch-number box: save, jump to the batch, redraw.

    Args:
        change: Change notification from the widget; ``change['new']`` is the
            1-based batch number to jump to.
    """
    global screen

    # Save BEFORE moving. update_dataframe pairs the sheet's values with the
    # rows at batcher.skip, so the batcher must still point at the batch that
    # is on screen. Moving first wrote the old sheet's values onto the rows
    # of the destination batch.
    update_dataframe()

    batcher.current = change['new']
    batcher.skip = (batcher.current-1)*batcher.limit

    clear_output()
    screen = render_widgets()
    display(screen)

def render_widgets():
    """Build the review screen for the batch the global ``batcher`` points at.

    The screen is a ``VBox`` with two children:

    0. a navigation bar (progress bar, editable batch number, Previous/Next
       buttons), and
    1. the spreadsheet for the current batch of the global ``df``.

    Returns:
        The assembled ``VBox``. It is not displayed here.
    """
    info1 = Label(value='Batch ')
    batch = BoundedIntText(value=batcher.current, min=1, max=batcher.end, description='', layout=Layout(max_width='50px'))
    # Typing a batch number jumps straight to that batch.
    batch.observe(on_value_change, names='value')

    info2 = Label(value=' of '+str(batcher.end))
    counter = IntProgress(value=batcher.current, min=1, max=batcher.end, description='',
                          bar_style='success', orientation='horizontal')
    progress = HBox([counter, info1, batch, info2])

    prev_button = Button(
        description='Previous',
        disabled=False,
        button_style='', # 'success', 'info', 'warning', 'danger' or ''
        tooltip='Previous',
    )
    prev_button.on_click(partial(prev_callback, batcher))

    next_button = Button(
        description='Next',
        disabled=False,
        button_style='success', # 'success', 'info', 'warning', 'danger' or ''
        tooltip='Next',
    )
    next_button.on_click(partial(next_callback, batcher))

    buttons = HBox([prev_button, next_button])
    # `flex_flow` is the Layout trait for direction/wrapping. The previous
    # `flex_row='flex'` is not a Layout trait, so it was ignored (with a
    # traitlets deprecation warning about unrecognized arguments).
    nav_bar = HBox([progress, buttons], layout=Layout(display='flex', flex_flow='row', justify_content='space-between'))
    spreadsheet = generate_sheet_with_preview(df, skip=batcher.skip, limit=batcher.limit)

    return VBox([nav_bar, spreadsheet])

In [None]:
# Load the review table (resumes from modified.csv when it exists) and show
# its size plus the first few rows.
df = load_data()
print("Count:", df.shape[0])
df.head()

In [None]:
# Start a review session over every row of `df` and show the first batch.
# `batcher` and `screen` are module-level state shared with the callbacks.
batcher = Batcher(count=df.shape[0])
screen = render_widgets()
display(screen)

In [None]:
# Show floats with two decimals. Use the fully qualified option name: the
# bare 'precision' pattern is matched against all option keys and is
# ambiguous (OptionError) on pandas versions that also define styler
# precision options.
pd.set_option('display.precision', 2)

# Number of files per verified label, most frequent first, with each label's
# share of the total as a percentage.
df_lc = df.groupby(['verified']).count()[['file']].sort_values(by=['file'], ascending=False)
df_lc['%'] = 100 * df_lc['file'] / df_lc['file'].sum()
df_lc

In [None]:
# Rows whose verified label differs from the original label, counted per
# (original label, verified label) pair.
df_ml = (
    df.loc[df['label'] != df['verified']]
    .groupby(['label', 'verified'])[['file']]
    .count()
)
df_ml

In [None]:
# File counts for every combination of the three quality flags, largest
# group first, with each group's percentage share and the running total.
df_cat = (
    df.groupby(['legible', 'difficult', 'centered'])[['file']]
    .count()
    .sort_values(by='file', ascending=False)
)
df_cat['%'] = 100 * df_cat['file'] / df_cat['file'].sum()
df_cat['cum-%'] = df_cat['%'].cumsum()
df_cat