In [26]:
import boto3 
import botocore 
from sagemaker import get_execution_role 

import PIL
from PIL import Image
import s3fs

import pandas as pd
import numpy as np

from io import BytesIO

import re 
import string

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


import chars2vec

In [2]:
role = get_execution_role() 
data_location = 's3://{}/{}'.format('sagemaker-060720', 'true_safety.csv') 
true_safety = pd.read_csv(data_location)

In [3]:
true_safety['allergens'] = "na"

In [4]:
isaac = 'milk, cheese, soy, cream, eggs'
jj = 'peanuts, whey, tree nuts'
chelsea = 'Shrimp, Prawns, Lobster, Crab'
emma = 'wheat, barely, rye triticale'
silvia = 'garlic, avocado, celery'

In [5]:
true_safety.loc[0:10, 'allergens'] = emma
true_safety.loc[10:21, 'allergens'] = silvia
true_safety.loc[21:45, 'allergens'] = isaac
true_safety.loc[45:55, 'allergens'] = jj
true_safety.loc[55:65, 'allergens'] = chelsea

In [6]:
true_safety.head()

Unnamed: 0,Item,Name,Label,allergens
0,0,emma-1.jpg,0,"wheat, barely, rye triticale"
1,1,emma-2.jpg,0,"wheat, barely, rye triticale"
2,2,emma-3.jpg,0,"wheat, barely, rye triticale"
3,3,emma-4.jpg,0,"wheat, barely, rye triticale"
4,4,emma-5.jpg,1,"wheat, barely, rye triticale"


In [7]:
# clean words 
def clean_word (word):
    
    c_word = word.lower().strip() # lowercase and remove white space
    c_word = re.sub('[^a-zA-Z]+', '', c_word) # remove anything that's not a letter
    if len(c_word) < 2: # remove words that are less than 2 characters
        c_word = "" 
    
    return c_word

# clean string of words
def clean_text (text, split=True):
    
    if split == False: # for ocr output
        c_text = [clean_word(w) for w in text] # already split and clean words
        
    else: 
        c_text = re.sub('[0-9]', ' ', text) # replace numbers with space 
        c_text = re.sub('['+string.punctuation+']', ' ', c_text) # replace punctuation with space
        c_text = [clean_word(w) for w in c_text.split()] # split on spaces and clean words
      
    c_text = sorted(list(filter(None, set(c_text)))) # remove empty words and get unique values and sort
    
    return c_text

# safety of product
def safety (allergens, ingredients):
    vectorizer = CountVectorizer(analyzer='char')
    if len(ingredients) == 0 or ingredients == ['']:
        return { "ocr": ingredients,
                 "target": allergens,
                 "matchy": "Nothing detected.",
                 "result": "Nothing detected. Please retake photo." }
    else:
        words = allergens + ingredients # create list of allergens and ingredients 
        word_embeddings = vectorizer.fit_transform(words) # embeddings for allergens and ingredients 
        cos_sims = cosine_similarity(word_embeddings[:len(allergens)], word_embeddings[len(allergens):]) # cos sim of allergens and ingredients 
        counts = [sum(all_sims) for all_sims in cos_sims > 0.85] # counts of ingredients with a 0.75 greater cos sim to each allergen 
        b = [c!=0 for c in counts] # boolean of counts
        matches = np.array(allergens)[b] # dangerous ingredients
        if sum(counts) > 0:
            return { "ocr": ingredients,
                     "target": allergens,
                     "matchy": list(matches),
                     "result": "unsafe" }
        else: return { "ocr": ingredients,
                       "target": allergens,
                       "matchy": "None of the ingredients match your allergens.",
                       "result": "safe" }

In [32]:
def safety_chars (allergens, ingredients):
    if len(ingredients) == 0 or ingredients == ['']:
        return "Nothing detected. Please retake photo."
    else:
        c2v_model = chars2vec.load_model('eng_50')
        words = allergens + ingredients # create list of allergens and ingredients 
        word_embeddings = c2v_model.vectorize_words(words) # embeddings for allergens and ingredients 
        cos_sims = cosine_similarity(word_embeddings[:len(allergens)], word_embeddings[len(allergens):]) # cos sim of allergens and ingredients 
        counts = [sum(all_sims) for all_sims in cos_sims > 0.75] # counts of ingredients with a 'sim' greater cos sim to each allergen 
        if sum(counts) > 0:
            return "unsafe"
        else: return "safe"
        
def safety_count (allergens, ingredients):
    if len(ingredients) == 0 or ingredients == ['']:
        return "Nothing detected. Please retake photo." 
    else:
        vectorizer = CountVectorizer(analyzer='char')
        words = allergens + ingredients # create list of allergens and ingredients 
        word_embeddings = vectorizer.fit_transform(words) # embeddings for allergens and ingredients 
        cos_sims = cosine_similarity(word_embeddings[:len(allergens)], word_embeddings[len(allergens):]) # cos sim of allergens and ingredients 
        counts = [sum(all_sims) for all_sims in cos_sims > 0.75] # counts of ingredients with a 'sim' greater cos sim to each allergen 
        if sum(counts) > 0:
            return "unsafe" 
        else: return "safe" 

In [34]:
results = []

tp = 0
fp = 0
tn = 0
fn = 0

fs = s3fs.S3FileSystem()
for img_path in fs.ls('s3://sagemaker-060720/Images/'):
    
    file_name = img_path[24:]
    true_vals = true_safety.loc[true_safety.Name == file_name,]
    label = true_vals.Label.values[0]
    allergens = true_vals.allergens.values[0]
    
    with fs.open(f's3://'+img_path) as f:
        
        pil_img = Image.open(f)
        buff = BytesIO()
        pil_img.save(buff, format="JPEG")
        img_bytes = buff.getvalue()
        
        tex_client = boto3.client('textract')
        tex_dect = tex_client.detect_document_text(Document={"Bytes":img_bytes})
        
        tex_text = [text['Text'] if text['BlockType']=='WORD' else "" for text in tex_dect['Blocks']]
        
        tex_ingredients = clean_text(tex_text, split=False)
        
        clean_allergens = clean_text(allergens)
        
#         tex_results = safety(clean_allergens, tex_ingredients)

#         results.append(tex_results)
        
#         tex_safety = tex_results['result']

        tex_safety = safety_count(clean_allergens, tex_ingredients)
        
        if (tex_safety=="safe") & (label==1):
            tp += 1
        if (tex_safety=="safe") & (label==0):
            fp += 1
        if (tex_safety=="unsafe") & (label==0):
            tn += 1
        if (tex_safety=="unsafe") & (label==1):
            fn += 1

print(tp, fp, tn, fn)

12 1 25 26


In [35]:
p = tp/(tp+fp)
r = tp/(tp+fn)
if p==0 and r==0:
    f1score = 0
else: f1score = (2*p*r)/(p+r)

In [36]:
f1score

0.47058823529411764

In [37]:
r

0.3157894736842105

In [38]:
fp/(fp+tn)

0.038461538461538464

In [None]:
28 2 24 10 | 30 2 24 8

0.8235294117647058 | 0.8571428571428572

0.7368421052631579 | 0.7894736842105263

0.07692307692307693 | 0.07692307692307693


12 1 25 26 | 