###  5 - Performance results

In [104]:
import pandas as pd
import json
import os
import io
from shutil import copyfile
import re
import itertools
from google.cloud import vision
from datetime import datetime
import numpy as np
from google.protobuf.json_format import MessageToDict
pd.set_option('display.max_rows', 10)

### Compare results of algorithm with ground truth (entered manually)

In [105]:
def found(x):
    '''Return 1 if x holds a usable value, 0 otherwise.

    x counts as missing (0) when it is falsy (None, '', 0, False) or NaN.
    Any other value, including non-empty strings, counts as found (1).
    '''
    if not x:
        return 0
    try:
        is_nan = bool(np.isnan(x))
    except (TypeError, ValueError):
        # np.isnan rejects non-numeric input (e.g. strings) with TypeError, and an
        # array-valued result cannot be reduced to one bool (ValueError): in both
        # cases x is a real, non-NaN value.
        return 1
    return 0 if is_nan else 1
    
def to_perc(x):
    '''Format a ratio as a whole-number percentage string, e.g. 0.63 -> '63%'.

    Values that cannot be rounded to an integer (NaN, infinity, strings, None)
    are returned unchanged.
    '''
    try:
        return str(round(x * 100)) + '%'
    except (TypeError, ValueError, OverflowError):
        # TypeError: non-numeric input; ValueError: NaN; OverflowError: infinity
        return x
    
def compare_algo_true(df_mill_algo, df_mill_true, columns_comp):
    '''Summarise how well the algorithm output matches the manually entered truth.

    df_mill_algo: dataframe built using the image recognition algorithm
    df_mill_true: reference dataframe entered manually; it is compared
        element-wise with df_mill_algo, so both must carry the same row labels
    columns_comp: columns to compare, which have to be in both dataframes

    Returns a dataframe with one row per compared column plus a final
    'Average accuracy' row (column-wise mean), formatted with to_perc:
      - 'Percentage of found entries': share of rows for which found() is 1
      - 'Percentage of true entries': share of rows equal to the reference
      - 'True entries among found entries': ratio of the two columns above
    The index is named 'n=<len(df_mill_true)> receipts'.
    '''
    nb_rows = len(df_mill_algo)
    algo = df_mill_algo[columns_comp]

    # Column-wise Series.map rather than DataFrame.applymap, which recent pandas deprecates
    share_found = algo.apply(lambda col: col.map(found)).sum() / nb_rows
    # NaN never equals NaN, so an entry missing on both sides is not counted as true
    share_true = (algo == df_mill_true[columns_comp]).sum() / nb_rows

    df_res = pd.DataFrame({
        'Percentage of found entries': share_found,
        'Percentage of true entries': share_true,
    })
    # 0/0 (nothing found in a column) yields NaN, which to_perc leaves as is
    df_res['True entries among found entries'] = (
        df_res['Percentage of true entries'] / df_res['Percentage of found entries']
    )

    # The column labels stay in the index, so mean() only ever sees numeric data
    df_res.loc['Average accuracy'] = df_res.mean()
    df_res = df_res.apply(lambda col: col.map(to_perc))
    df_res.index.name = 'n='+str(len(df_mill_true))+' receipts'
    return df_res

### Save the comparison as CSV

In [106]:
def rm_sec(x):
    '''Remove the seconds from a time string by dropping its last three characters
    (e.g. '15:09:30' -> '15:09').

    Values that cannot be sliced (e.g. NaN for a missing time) are returned unchanged.
    '''
    try:
        return x[:-3]
    except TypeError:
        # floats/None are not subscriptable: leave missing values untouched
        return x
    
def compare_algo_true_mill_orig(mill):
    '''Compare the manual reference of one mill with the raw detection output,
    i.e. before weights are removed for receipts with missing weights and
    before the weight checks.

    Reads 'Reference <mill>.csv' and 'Algo <mill>.csv' from
    ../Images/weight_receipt/ and returns the table built by compare_algo_true.
    '''
    reference = pd.read_csv('../Images/weight_receipt/Reference '+mill+'.csv')
    detected = pd.read_csv('../Images/weight_receipt/Algo '+mill+'.csv', index_col=0)

    # Restrict the algorithm output to the receipts that have a reference entry
    ids = reference['ID'].tolist()
    detected = detected.loc[detected['ID'].isin(ids)]

    # Only the entry time is compared, at minute precision
    detected = (
        detected
        .drop(columns=['Exit time'])
        .rename(columns={'Entry time': 'Time'})
    )
    detected['Time'] = detected['Time'].map(rm_sec)

    if mill != 'srjnad':
        reference = (
            reference
            .drop(columns=['Exit time'])
            .rename(columns={'Entry time': 'Time'})
        )
        reference['Time'] = reference['Time'].map(rm_sec)
    else:
        # For 'srjnad' the time is not compared: drop it from the algorithm side
        detected = detected.drop(columns=['Time'])

    # Parse and re-format the reference dates so they are zero-padded
    reference['Date'] = reference['Date'].map(
        lambda raw: datetime.strptime(raw, '%m/%d/%Y').strftime('%m/%d/%Y')
    )

    reference = reference.set_index('ID')
    # Re-order the algorithm rows to follow the reference IDs
    detected = detected.set_index('ID').reindex(ids)

    # The first column after 'ID' is not part of the comparison
    columns_comp = reference.columns[1:].tolist()
    return compare_algo_true(detected, reference, columns_comp)

In [109]:
# Build the comparison table (before weight checks) for every mill
# and write each one to its own CSV file.
mills = ['arvena', 'bss', 'nhr','skip', 'srjnad']
for mill in mills:
    df_compare = compare_algo_true_mill_orig(mill)
    df_compare.to_csv('../Images/weight_receipt/Comparison '+mill+' before checks.csv')

In [110]:
# Show the comparison before weight checks for the 'arvena' mill
mill = 'arvena'
compare_algo_true_mill_orig(mill)

Unnamed: 0_level_0,Percentage of found entries,Percentage of true entries,True entries among found entries
n=30 receipts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Date,63%,63%,100%
Time,53%,33%,62%
First weight,60%,17%,28%
Second weight,47%,20%,43%
Net 1,77%,20%,26%
Potongan,63%,57%,89%
Net 2,27%,20%,75%
Average accuracy,56%,33%,61%


In [111]:
# Show the comparison before weight checks for the 'bss' mill
mill = 'bss'
compare_algo_true_mill_orig(mill)

Unnamed: 0_level_0,Percentage of found entries,Percentage of true entries,True entries among found entries
n=18 receipts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Date,56%,50%,90%
Time,33%,11%,33%
First weight,100%,72%,72%
Second weight,61%,50%,82%
Net 1,89%,72%,81%
Potongan,100%,72%,72%
Net 2,89%,72%,81%
Average accuracy,75%,57%,73%


In [19]:
# Show the comparison before weight checks for the 'nhr' mill
mill = 'nhr'
compare_algo_true_mill_orig(mill)

Unnamed: 0_level_0,Percentage of found entries,Percentage of true entries,True entries among found entries
n=30 receipts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Date,80%,77%,96%
Time,80%,73%,92%
First weight,97%,97%,100%
Second weight,97%,90%,93%
Net 1,63%,63%,100%
Potongan,57%,53%,94%
Net 2,57%,53%,94%
Average accuracy,76%,72%,96%


In [20]:
# Show the comparison before weight checks for the 'skip' mill
mill = 'skip'
compare_algo_true_mill_orig(mill)

Unnamed: 0_level_0,Percentage of found entries,Percentage of true entries,True entries among found entries
n=30 receipts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Date,93%,93%,100%
Time,80%,57%,71%
First weight,93%,83%,89%
Second weight,90%,67%,74%
Net 1,87%,70%,81%
Potongan,83%,70%,84%
Net 2,60%,53%,89%
Average accuracy,84%,70%,84%


In [21]:
# Show the comparison before weight checks for the 'srjnad' mill
mill = 'srjnad'
compare_algo_true_mill_orig(mill)

Unnamed: 0_level_0,Percentage of found entries,Percentage of true entries,True entries among found entries
n=30 receipts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Date,47%,43%,93%
First weight,97%,83%,86%
Second weight,97%,97%,100%
Net 1,97%,93%,97%
Potongan,70%,70%,100%
Net 2,97%,83%,86%
Average accuracy,84%,78%,94%


### Compare results of the algorithm with ground truth (after removing the weights of receipts where not all weights were found, and after checking the 5 equations for the weights)

In [97]:
def compare_algo_true_mill_post_checks(mill, df_valid):
    '''Compare the dataframe built using the image recognition algorithm after
    weight checks with the true dataframe entered manually, for one mill.

    mill: mill name, used to filter df_valid['Mill'] and to locate
        '../Images/weight_receipt/Reference <mill>.csv'
    df_valid: algorithm output for all mills; the columns 'Mill', 'ID' and
        'First weight' are read here ('Time' as well when mill is 'srjnad')

    Only receipts that have a first weight in df_valid and an entry in the
    reference file are compared. Returns the table built by compare_algo_true.
    '''
    df_mill_algo = df_valid[df_valid['Mill']==mill].dropna(subset=['First weight'])
    df_mill_true = pd.read_csv('../Images/weight_receipt/Reference '+mill+'.csv')
    df_mill_true = df_mill_true.drop_duplicates()

    # Keep only the receipts present on both sides. isin() does hashed lookups,
    # unlike testing every ID against a Python list.
    df_mill_algo = df_mill_algo[df_mill_algo['ID'].isin(df_mill_true['ID'])]
    # .copy() so that the column assignments below write to an independent frame
    # rather than to a filtered view of the reference data
    df_mill_true = df_mill_true[df_mill_true['ID'].isin(df_mill_algo['ID'])].copy()

    if mill=='srjnad':
        # For 'srjnad' the time is not compared: drop it from the algorithm side
        df_mill_algo = df_mill_algo.drop(columns=['Time'])
    else:
        # Only the entry time is compared, at minute precision
        df_mill_true = df_mill_true.drop(columns=['Exit time']).rename(columns={'Entry time':'Time'})
        df_mill_true['Time'] = df_mill_true['Time'].apply(rm_sec)
    # Parse and re-format the reference dates so they are zero-padded
    df_mill_true['Date'] = df_mill_true['Date'].apply(
        lambda x: datetime.strptime(x, '%m/%d/%Y').strftime('%m/%d/%Y'))

    # Sort both frames by ID so their rows line up for the element-wise comparison
    df_mill_true = df_mill_true.set_index('ID').sort_index()
    df_mill_algo = df_mill_algo.set_index('ID').sort_index()

    # The first two columns after 'ID' are not part of the comparison
    columns_comp = list(df_mill_true.columns)[2:]
    return compare_algo_true(df_mill_algo, df_mill_true, columns_comp)

In [69]:
# Load the algorithm output for all mills (valid images, after weight checks)
df_valid = pd.read_csv('../Images/weight_receipt/Algo all mills - valid images with weight checks.csv')

In [23]:
# Preview the loaded dataframe
df_valid

Unnamed: 0,Mill,ID,User (middle man),Date,Time,First weight,Second weight,Net 1,Potongan,Net 2,Date created,Time created
0,nhr,9,dwisuyanto,,,,,,,,12/05/2018,15:37
1,srjnad,14,madyani,12/04/2018,,11820.0,3910.0,7910.0,280.0,7630.0,12/05/2018,19:56
2,srjnad,16,madyani,12/05/2018,,,,,,,12/05/2018,20:00
3,nhr,20,dwisuyanto,12/06/2018,15:09,11430.0,4320.0,7110.0,356.0,6754.0,12/06/2018,15:35
4,skip,23,aguswibowo,,,,,,,,12/07/2018,15:27
...,...,...,...,...,...,...,...,...,...,...,...,...
2519,arvena,5719,muhammadfahrirambe,09/07/2019,10:48,,,,,,09/07/2019,12:56
2520,srjnad,5721,sudiwarnopandiangan,,,,,,,,09/07/2019,15:23
2521,skip,5723,sumaji,09/07/2019,19:09,10290.0,3660.0,6630.0,332.0,6298.0,09/07/2019,15:58
2522,skip,5728,madyani,09/07/2019,16:29,12200.0,3910.0,8290.0,415.0,7875.0,09/07/2019,19:09


In [83]:
# Show the comparison after weight checks for the 'arvena' mill
mill = 'arvena'
compare_algo_true_mill_post_checks(mill, df_valid)

Unnamed: 0_level_0,Percentage of found entries,Percentage of true entries,True entries among found entries
n=7 receipts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Time,100%,86%,86%
First weight,100%,100%,100%
Second weight,100%,100%,100%
Net 1,100%,100%,100%
Potongan,100%,100%,100%
Net 2,100%,100%,100%
Average accuracy,100%,98%,98%


In [112]:
# Show the comparison after weight checks for the 'bss' mill
mill = 'bss'
compare_algo_true_mill_post_checks(mill, df_valid)

Unnamed: 0_level_0,Percentage of found entries,Percentage of true entries,True entries among found entries
n=8 receipts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Time,38%,12%,33%
First weight,100%,100%,100%
Second weight,100%,100%,100%
Net 1,100%,100%,100%
Potongan,100%,100%,100%
Net 2,100%,100%,100%
Average accuracy,90%,85%,89%


In [85]:
# Show the comparison after weight checks for the 'nhr' mill
mill = 'nhr'
compare_algo_true_mill_post_checks(mill, df_valid)

Unnamed: 0_level_0,Percentage of found entries,Percentage of true entries,True entries among found entries
n=14 receipts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Time,100%,100%,100%
First weight,100%,100%,100%
Second weight,100%,100%,100%
Net 1,100%,100%,100%
Potongan,100%,100%,100%
Net 2,100%,100%,100%
Average accuracy,100%,100%,100%


In [86]:
# Show the comparison after weight checks for the 'skip' mill
mill = 'skip'
compare_algo_true_mill_post_checks(mill, df_valid)

Unnamed: 0_level_0,Percentage of found entries,Percentage of true entries,True entries among found entries
n=13 receipts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Time,100%,85%,85%
First weight,100%,100%,100%
Second weight,100%,100%,100%
Net 1,100%,100%,100%
Potongan,100%,100%,100%
Net 2,100%,100%,100%
Average accuracy,100%,97%,97%


In [102]:
# Show the comparison after weight checks for the 'srjnad' mill
mill = 'srjnad'
compare_algo_true_mill_post_checks(mill, df_valid)

Unnamed: 0_level_0,Percentage of found entries,Percentage of true entries,True entries among found entries
n=18 receipts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
First weight,100%,100%,100%
Second weight,100%,100%,100%
Net 1,100%,100%,100%
Potongan,100%,100%,100%
Net 2,100%,100%,100%
Average accuracy,100%,100%,100%
