###  4 - Weights checks

In [62]:
import pandas as pd
import json
import os
import io
from shutil import copyfile
import re
import itertools
from google.cloud import vision
from datetime import datetime
import numpy as np
from google.protobuf.json_format import MessageToDict
pd.set_option('display.max_rows', 10)

### Replace all weights by NA's when at least one weight is missing

In [1]:
df_all_mills = pd.read_csv('../Images/weight_receipt/Algo all mills.csv')
weight_col_names = ['First weight', 'Second weight', 'Net 1', 'Potongan', 'Net 2']
other_col_names = ['Mill', 'ID', 'User (middle man)', 'Date', 'Time', 'Date created', 'Time created']

df_valid = df_all_mills.copy() # dataframe where all weights are replaced by NA's if all 5 weights are not found

weights_found_list = [0]*6

def convert_weights(row):
    '''replaces all weights by NAs if all five weights are not found'''
    print('\nID', row['ID'])
    weights = [row['First weight'], row['Second weight'], row['Net 1'], row['Potongan'], row['Net 2']]
    print(weights)
    weights_not_found = sum([1 if np.isnan(w) else 0 for w in weights])
    weights_found = 5 - weights_not_found
    weights_found_list[weights_found]+=1
    if weights_not_found>0:
        print(round(weights_not_found), 'weight(s) not found, all weights replaced by NAs.')
        return [np.nan]*5+[False]
    else:
        print('All weights were found.')
        return weights+[True]

df_valid[weight_col_names+['Five weights found']] = df_valid.apply(lambda row: pd.Series(convert_weights(row)), axis=1)

In [167]:
df_valid

Unnamed: 0,Mill,ID,User (middle man),Date,Time,First weight,Second weight,Net 1,Potongan,Net 2,Date created,Time created,Five weights found
0,nhr,9,dwisuyanto,,,,,,,,12/05/2018,15:37,False
1,srjnad,14,madyani,12/04/2018,,11820.0,3910.0,7910.0,280.0,7630.0,12/05/2018,19:56,True
2,srjnad,16,madyani,12/05/2018,,,,,,,12/05/2018,20:00,False
3,nhr,20,dwisuyanto,12/06/2018,15:09,11430.0,4320.0,7110.0,356.0,6754.0,12/06/2018,15:35,True
4,skip,23,aguswibowo,,,,,,,,12/07/2018,15:27,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2519,arvena,5719,muhammadfahrirambe,09/07/2019,10:48,6070.0,2900.0,8970.0,520.0,0.0,09/07/2019,12:56,True
2520,srjnad,5721,sudiwarnopandiangan,,,,,,,,09/07/2019,15:23,False
2521,skip,5723,sumaji,09/07/2019,19:09,10290.0,3660.0,6630.0,332.0,6298.0,09/07/2019,15:58,True
2522,skip,5728,madyani,09/07/2019,16:29,12200.0,3910.0,8290.0,415.0,7875.0,09/07/2019,19:09,True


#### Summary of number of weights found

In [213]:
df_found_summary = pd.DataFrame(weights_found_list)
df_found_summary.columns=['Number of receipts']
df_found_summary.index = ['No weight found', 'One weight found', 'Two weights found',
                  'Three weights found', 'Four weights found', 'Five weights found']

tot_num_receipts = df_found_summary['Number of receipts'].sum()
df_found_summary.loc[len(df_found_summary)] = tot_num_receipts
df_found_summary.index = list(df_found_summary.index)[:-1] + ['Total']
df_found_summary['Percentage receipts'] = df_found_summary['Number of receipts']/tot_num_receipts
df_found_summary['Percentage receipts'] = df_found_summary['Percentage receipts'].apply(lambda x: "{0:.0f}%".format(x*100))
df_found_summary

Unnamed: 0,Number of receipts,Percentage receipts
No weight found,358,14%
One weight found,79,3%
Two weights found,268,11%
Three weights found,94,4%
Four weights found,294,12%
Five weights found,1432,57%
Total,2525,100%


### Check weights and try to reorder them, replace all weights by NA's if weights couldn't be checked
Check 1: 1st weight – 2nd weight = Net 1  
Check 2: Net 1 – Potongan = Net 2  
Check 3: Net 1 > Net 2  
Check 4: Potongan < .12 * Net 1

In [144]:
def all_perm_weights(weights):
    '''returns all permutations of a list'''
    all_perm = []
    for perm in itertools.permutations(weights):
        all_perm.append(list(perm))
    return all_perm

In [2]:
def reordered_weights(row):
    '''check if four equations are satisfied by trying all permutations of the found weights'''
    valid_reorder = False
    weights = list(row[weight_col_names].values)
    five_weights_found = row['Five weights found']
    if not np.isnan(weights[0]):
        ID = row['ID']
        print('\nID', ID)
        print('Checking permutations for', weights)
        weights_perms = all_perm_weights(weights)
        for i, weights_perm in enumerate(weights_perms):
            w1 = weights_perm[0]
            w2 = weights_perm[1]
            w3 = weights_perm[2]
            w4 = weights_perm[3]
            w5 = weights_perm[4]
            check_1 = w1 - w2 == w3
            check_2 = w3 - w4 == w5
            check_3 = w3 > w5
            check_4 = w4 < .12 * w3
            if check_1 and check_2 and check_3 and check_4:
                print('All checks satisfied at permutation', i)
                if i != 0:
                    valid_perm_non_zero_index.append(i)
                valid_reorder = True
                return weights_perm + [five_weights_found]
        if valid_reorder is False:
            print('Couldn\'t find a valid weight permutation. All weights will be replaced by NAs.')
            return [np.nan]*5 + [row['Five weights found']]
    return [np.nan]*5 + [row['Five weights found']]
valid_perm_non_zero_index = []
df_valid[weight_col_names+['Five weights found']] = df_valid.apply(lambda x: pd.Series(reordered_weights(x)), axis=1);

In [180]:
df_valid

Unnamed: 0,Mill,ID,User (middle man),Date,Time,First weight,Second weight,Net 1,Potongan,Net 2,Date created,Time created,Five weights found
0,nhr,9,dwisuyanto,,,,,,,,12/05/2018,15:37,False
1,srjnad,14,madyani,12/04/2018,,11820.0,3910.0,7910.0,280.0,7630.0,12/05/2018,19:56,True
2,srjnad,16,madyani,12/05/2018,,,,,,,12/05/2018,20:00,False
3,nhr,20,dwisuyanto,12/06/2018,15:09,11430.0,4320.0,7110.0,356.0,6754.0,12/06/2018,15:35,True
4,skip,23,aguswibowo,,,,,,,,12/07/2018,15:27,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2519,arvena,5719,muhammadfahrirambe,09/07/2019,10:48,,,,,,09/07/2019,12:56,True
2520,srjnad,5721,sudiwarnopandiangan,,,,,,,,09/07/2019,15:23,False
2521,skip,5723,sumaji,09/07/2019,19:09,10290.0,3660.0,6630.0,332.0,6298.0,09/07/2019,15:58,True
2522,skip,5728,madyani,09/07/2019,16:29,12200.0,3910.0,8290.0,415.0,7875.0,09/07/2019,19:09,True


#### Number of receipts with valid weights only after reodering
Including 19 skip receipts  
Total improvement: close to 3%

In [215]:
len(valid_perm_non_zero_index)

35

In [187]:
df_valid

Unnamed: 0,Mill,ID,User (middle man),Date,Time,First weight,Second weight,Net 1,Potongan,Net 2,Date created,Time created,Five weights found
0,nhr,9,dwisuyanto,,,,,,,,12/05/2018,15:37,False
1,srjnad,14,madyani,12/04/2018,,11820.0,3910.0,7910.0,280.0,7630.0,12/05/2018,19:56,True
2,srjnad,16,madyani,12/05/2018,,,,,,,12/05/2018,20:00,False
3,nhr,20,dwisuyanto,12/06/2018,15:09,11430.0,4320.0,7110.0,356.0,6754.0,12/06/2018,15:35,True
4,skip,23,aguswibowo,,,,,,,,12/07/2018,15:27,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2519,arvena,5719,muhammadfahrirambe,09/07/2019,10:48,,,,,,09/07/2019,12:56,True
2520,srjnad,5721,sudiwarnopandiangan,,,,,,,,09/07/2019,15:23,False
2521,skip,5723,sumaji,09/07/2019,19:09,10290.0,3660.0,6630.0,332.0,6298.0,09/07/2019,15:58,True
2522,skip,5728,madyani,09/07/2019,16:29,12200.0,3910.0,8290.0,415.0,7875.0,09/07/2019,19:09,True


#### Count number of receipts found and valid

In [206]:
df_count_receipts = pd.DataFrame()
df_count_receipts['All receipts'] = df_all_mills.groupby('Mill')['ID'].agg('count')
df_count_receipts['All weights found'] = df_valid[df_valid['Five weights found']==True].groupby('Mill')['ID'].agg('count')
df_count_receipts['All weights valid'] = df_valid[['Mill']+weight_col_names].dropna().groupby('Mill')['First weight'].agg('count')
df_count_receipts.loc[len(df_count_receipts)] = list(df_count_receipts.sum().values)
df_count_receipts.index = list(df_count_receipts.index[:-1]) + ['Total']
df_count_receipts
    
df_count_receipts['Percentage all weights found'] = df_count_receipts['All weights found']/df_count_receipts['All receipts']
df_count_receipts['Percentage all weights valid'] = df_count_receipts['All weights valid']/df_count_receipts['All receipts']
df_count_receipts['Percentage all weights valid among found'] = df_count_receipts['Percentage all weights valid']/df_count_receipts['Percentage all weights found']
df_count_receipts[['Percentage all weights found', 'Percentage all weights valid', 'Percentage all weights valid among found']] = df_count_receipts[['Percentage all weights found', 'Percentage all weights valid', 'Percentage all weights valid among found']].applymap(lambda x: "{0:.0f}%".format(x*100))

In [207]:
df_count_receipts

Unnamed: 0,All receipts,All weights found,All weights valid,Percentage all weights found,Percentage all weights valid,Percentage all weights valid among found
arvena,163,94,35,58%,21%,37%
bss,24,14,10,58%,42%,71%
nhr,383,81,76,21%,20%,94%
skip,1188,809,617,68%,52%,76%
srjnad,766,434,352,57%,46%,81%
Total,2524,1432,1090,57%,43%,76%


In [208]:
df_valid = df_valid.drop(columns=['Five weights found'])
df_valid.to_csv('../Images/weight_receipt/Algo all mills - valid images with weight checks.csv', index=False)

In [223]:
df_valid.sample(frac=1)

Unnamed: 0,Mill,ID,User (middle man),Date,Time,First weight,Second weight,Net 1,Potongan,Net 2,Date created,Time created
2154,skip,4757,tumiyem,08/16/2019,9:18,11740.0,3510.0,8230.0,412.0,7818.0,08/16/2019,11:30
2488,srjnad,5623,rolincsinaga,09/05/2019,,13180.0,4630.0,8550.0,470.0,8080.0,09/05/2019,15:15
140,nhr,281,rusdiono,,,,,,,,01/06/2019,20:36
876,arvena,1784,yusup,05/11/2019,13:5,11010.0,4090.0,6920.0,520.0,6400.0,05/11/2019,16:21
1573,skip,3393,aguswibowo,07/14/2019,,,,,,,07/14/2019,17:45
...,...,...,...,...,...,...,...,...,...,...,...,...
960,skip,1962,rusdiono,,14:30,10650.0,4100.0,6550.0,295.0,6255.0,05/20/2019,20:27
1147,skip,2366,madyani,06/17/2019,12:01,10930.0,3810.0,7120.0,320.0,6800.0,06/17/2019,18:17
853,arvena,1732,yusup,,,,,,,,05/08/2019,18:19
649,skip,1310,siswanto,04/11/2019,14:46,10010.0,4120.0,5890.0,265.0,5625.0,04/11/2019,18:21


### User perpective: check image validity by checking what percentage (tau) of values were found
If more than a percentage tau of the entries are missing, the user is asked to retake the picture

In [3]:
tau = .8
df_all_mills = pd.read_csv('../Images/weight_receipt/Algo all mills.csv')
col_names = ['Mill', 'ID', 'User (middle man)', 'Date', 'Time',
                 'First weight', 'Second weight', 'Net 1', 'Potongan', 'Net 2']
def image_is_valid_user(ID, tau=.7):
    df = df_all_mills[df_all_mills['ID']==ID][col_names]
    if len(df) == 0:
        print('Index '+str(ID)+' not found')
        return False
    if len(df) > 1:
        print('Warning:', len(df), 'duplicate rows with same ID', ID,'Duplicates were removed')
        df = pd.DataFrame(df.iloc[0]).T
    perc_found_entries = 1-df.isnull().sum().sum()/len(col_names)
    print(round(perc_found_entries*100), '% of entries found')
    if perc_found_entries >= tau:
        return True
    else:
        print('The image is blurry, cut or invalid, please retake the picture.')
        return False

for ID in df_all_mills['ID']:
    print('\nID', ID)
    df_all_mills.loc[df_all_mills['ID']==ID, 'Valid image'] = image_is_valid_user(ID, tau=tau)

#### Summary valid receipts user perspective

In [211]:
df_count_valid_user = pd.DataFrame()
df_count_valid_user['Total count'] = df_all_mills.groupby('Mill')['Valid image'].agg('count')
df_count_valid_user['Valid count'] = df_all_mills[df_all_mills['Valid image']==True].groupby('Mill')['Valid image'].agg('count')
df_count_valid_user.loc[len(df_count_valid_user)] = list(df_count_valid_user.sum().values)
df_count_valid_user.index = list(df_count_valid_user.index[:-1]) + ['Total']

df_count_valid_user['Percentage valid receipts'] = df_count_valid_user['Valid count']/df_count_valid_user['Total count']
df_count_valid_user['Percentage valid receipts'] = df_count_valid_user['Percentage valid receipts'].apply(lambda x: "{0:.0f}%".format(x*100))

In [212]:
df_count_valid_user

Unnamed: 0,Total count,Valid count,Percentage valid receipts
arvena,163,125,77%
bss,24,17,71%
nhr,383,96,25%
skip,1188,969,82%
srjnad,766,510,67%
Total,2524,1717,68%
