# OCR and NLP Model Evaluation 
In this notebook, we will evaluate six models for the OCR and NLP components of Gredient. We will use data from the public dataset [Open Food Facts](https://world.openfoodfacts.org/data), which contains images and annotations of product ingredients.

**TODO:** Annotate the notebook and check it for consistency; confirm that the code makes sense; verify that the scores and graphs are accurate; and write code to reduce redundancy.

### Import some libraries...

In [1]:
# import general libraries
import pandas as pd
import numpy as np
from io import BytesIO
import string
import time
import re
import random
import base64
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Polygon

In [2]:
# import libraries for connecting to S3
import boto3 
import botocore 
from sagemaker import get_execution_role 

In [4]:
# import libraries for preprocessing
import urllib
import cv2

ModuleNotFoundError: No module named 'cv2'

In [None]:
# import libraries for OCR 
import pytesseract
from PIL import Image
from pytesseract import Output

In [None]:
# import libraries for postprocessing
import chars2vec
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Connect to S3 and read data...

In [None]:
# connect to OFF data in S3
role = get_execution_role()  # execution role from the sagemaker SDK; not referenced again in this notebook
bucket = 'sagemaker-060720'  # S3 bucket that holds the evaluation data
data_key = 'evalOFFdata.csv'  # object key of the Open Food Facts evaluation CSV
data_location = 's3://{}/{}'.format(bucket, data_key)  # full s3:// URI handed to pandas

In [None]:
# load the OFF evaluation CSV from data_location into a DataFrame
eval_data = pd.read_csv(data_location)
print(eval_data.shape)  # (rows, columns) as a quick sanity check

### Preprocess for OCR...

In [None]:
# draw a reproducible random sample of n products for evaluation
n=1000  # sample size; later cells reuse n when averaging scores and timings
data = eval_data.sample(n, random_state=210).reset_index()  # fixed seed; reset_index() keeps the old row label as a column
print(data.shape)
data.head()

In [None]:
# function to convert url to images
def url_to_image(url):
    """Download the image at `url` and decode it into an RGB numpy array.

    Parameters
    ----------
    url : str
        Address of an encoded image file.

    Returns
    -------
    numpy.ndarray
        Decoded 3-channel image in RGB channel order.

    Raises
    ------
    ValueError
        If the downloaded bytes cannot be decoded as an image.
    """
    # a bare `import urllib` does not guarantee the `request` submodule is loaded
    import urllib.request

    # close the HTTP response as soon as the payload has been read
    with urllib.request.urlopen(url) as resp:
        raw = np.asarray(bytearray(resp.read()), dtype='uint8')

    image = cv2.imdecode(raw, cv2.IMREAD_COLOR)
    if image is None:  # imdecode reports failure by returning None
        raise ValueError("could not decode image from %s" % url)

    # OpenCV decodes to BGR; the PIL-based OCR wrappers in this notebook expect RGB
    return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

In [None]:
# download every ingredient image and decode it into a numpy array (one request per sampled row)
imgs = [url_to_image(image_url) for image_url in data.image_ingredients_url]
len(imgs)

Note: In previous iterations, we implemented and evaluated five image preprocessing methods that included combinations of grayscaling, denoising, thresholding, dilating, eroding, opening, deskewing, and canny edge detection. We then evaluated the performance of the OCR models with these preprocessed images. The performance decreased significantly, so we decided that the preprocessing methods built into the models were sufficient.

# OCR Evaluation

In this section, we will evaluate Pytesseract, Amazon Rekognition, and Amazon Textract on their ability to correctly detect ingredients from an image. We use the OFF data to first run each model on the whole sample of data, which includes both high and low quality images. At that point, performance is fairly low over all models. So to proxy performance on high quality images, we get the top 210 scoring detections from each model and run all three models on each set of top 210 images. We average the 630 F1 scores for each model to get our final accuracy metric used for evaluation. Additionally, we record the time in seconds for each model to run and include speed in our evaluation as well. Since the performance of detections is highly dependent on image quality, we encourage our users through the interface to take better quality photos by providing them a cropping mechanism and messages indicating what a good and bad quality image looks like. 


### Helpful Functions

In [None]:
# clean word tokens
def clean_word(word):
    """Normalise one token: lowercase, trim, keep letters only.

    Returns "" when fewer than 2 letters remain.
    """
    letters_only = re.sub('[^a-zA-Z]+', '', word.lower().strip())
    # single letters (and empty results) are rejected
    return letters_only if len(letters_only) >= 2 else ""

# clean list of strings 
def clean_text(text, split=True):
    """Return the sorted, de-duplicated cleaned words found in `text`.

    text  : a raw string when `split` is left on, or an iterable of already
            tokenised strings (OCR output) when `split` is False.
    split : unless False, digits and punctuation are replaced by spaces and
            the string is split on whitespace before each word is cleaned.
    """
    if split == False:  # OCR output arrives pre-tokenised
        tokens = text
    else:
        without_digits = re.sub('[0-9]', ' ', text)
        without_punct = re.sub('['+string.punctuation+']', ' ', without_digits)
        tokens = without_punct.split()

    unique_words = {clean_word(token) for token in tokens}
    unique_words.discard("")  # clean_word returns "" for rejected tokens
    return sorted(unique_words)

In [None]:
# precision: #(detected words that are in ingredients) / #(all detected words)
def precision(ing_lst, i, ingredients):
    """Share of the words detected for sample `i` that are actual ingredients.

    ing_lst[i] holds the detected words, ingredients[i] the reference words.
    Returns 0 when nothing was detected.
    """
    detected = ing_lst[i]
    reference = ingredients[i]
    if len(detected) == 0:
        return 0
    hits = 0
    for word in detected:
        if word in reference:
            hits += 1
    return hits/len(detected)  # tp / (tp + fp)
    
    
# recall: #(detected words that are in ingredients) / #(all words in ingredients)
def recall(ing_lst, i, ingredients):
    """Share of the actual ingredient words of sample `i` that were detected.

    ing_lst[i] holds the detected words, ingredients[i] the reference words.
    Returns 0 when nothing was detected or when the reference list is empty
    (the latter used to raise ZeroDivisionError).
    """
    detected_words = ing_lst[i]
    actual_ingredients = ingredients[i]
    if len(detected_words) == 0 or len(actual_ingredients) == 0:
        return 0
    tp = sum(dw in actual_ingredients for dw in detected_words)
    return tp/len(actual_ingredients)  # tp / (tp + fn)
    

# f1 score: 2*precision*recall / precision+recall
def F1score(ing_lst, i, ingredients):
    """Harmonic mean of precision and recall for sample `i` (0 when both are 0)."""
    prec = precision(ing_lst, i, ingredients)
    rec = recall(ing_lst, i, ingredients)
    both_zero = (prec == 0 and rec == 0)
    return 0 if both_zero else (2*prec*rec)/(prec+rec)

In [None]:
# get top images, ingredients, and F1 score 
def top210(scores, k=210):
    """Pick the `k` best-scoring samples and return their images and actual ingredients.

    Reads the notebook-level `imgs` and `ingredients` lists, which must be
    aligned with `scores`.

    scores : one F1 score per sampled image.
    k      : how many top detections to keep (default 210, as before).

    Returns [images, actual ingredient lists] for the selected samples, best first.
    """
    s = np.array([scores])
    inds = (-s).argsort()[0][:k] # indices of the k highest scores

    imgs210 = [imgs[i] for i in list(inds)] # images for the top detections
    print(len(imgs210))

    a_ings = [ingredients[i] for i in list(inds)] # actual ingredients for the top detections
    print(len(a_ings))

    # divide by the number actually selected, which is below k when fewer scores exist
    top_total = sum([scores[i] for i in list(inds)])
    print("Average top F1-score:", top_total/len(inds) if len(inds) else 0)

    return [imgs210, a_ings]

### Cleaned List of Actual Ingredients

In [None]:
# ground truth: sorted unique cleaned words of each product's annotated ingredient text
ingredients = [clean_text(ing) for ing in data.ingredients_text] # words in ingredients 
len(ingredients)

## Pytesseract

In [None]:
# run pytesseract on an image
def pytess(img):
    """Run pytesseract on one image and return the 'text' entry of its dict output."""
    tess_config = r'--dpi 300 --psm 6'
    ocr_result = pytesseract.image_to_data(
        img,
        lang='eng',
        config=tess_config,
        output_type=Output.DICT,
    )
    return ocr_result['text']

**--skip if already run**

In [None]:
# apply pytesseract to images and report the average time per image
start_time = time.time()
pyt_texts = [pytess(img) for img in imgs]
# divide the un-truncated elapsed time; int() first would discard fractional seconds
print("--- %s seconds ---" % ((time.time() - start_time)/n)) # 1.014

In [None]:
# clean the pytesseract tokens of every image into a sorted list of unique words
pyt_ingredients = [clean_text(text, split=False) for text in pyt_texts]
len(pyt_ingredients)

In [None]:
# cache the pytesseract detections (one row per image) so the OCR run can be skipped later
pyt_output = pd.DataFrame({'detected':pyt_ingredients})
pyt_output.to_csv('pyt_output.csv', index=False)

**--**

In [None]:
# load saved data in csv
pyt_data = pd.read_csv('pyt_output.csv')

# get detected ingredients: split each stored row on commas and strip every non-letter character
pyt_ingredients = [[re.sub('[^a-zA-Z]+', '', e) for e in l.split(",")] for l in pyt_data.detected]

# peek at detected ingredients
print(pyt_ingredients[:5])
print(len(pyt_ingredients))

# get F1 scores for pytesseract detections (one per sampled image, against the actual ingredients)
pyt_scores = [F1score(pyt_ingredients, i, ingredients) for i in range(n)]

print("Average F1-score:", sum(pyt_scores)/n)

In [None]:
# images and actual ingredients of the 210 samples pytesseract scored best on
pyt_imgs, a_pyt_ings = top210(pyt_scores)

## Rekognition

In [None]:
# instantiate the boto3 Rekognition client used by rekogn()
rek_client=boto3.client('rekognition')

In [None]:
# run rekognition on an image
def rekogn(img):
    """Submit one image to Rekognition text detection and return the raw response.

    `img` is converted with PIL's Image.fromarray, JPEG-encoded in memory and
    sent through the notebook-level `rek_client`.
    """
    jpeg_buffer = BytesIO()
    Image.fromarray(img).save(jpeg_buffer, format="JPEG")
    return rek_client.detect_text(Image={"Bytes":jpeg_buffer.getvalue()})

**--skip if already run**

In [None]:
# apply rekognition to images and report the average time per image
start_time = time.time()
rek_texts = [rekogn(img) for img in imgs]
# divide the un-truncated elapsed time; int() first would discard fractional seconds
print("--- %s seconds ---" % ((time.time() - start_time)/n)) # 4.67

In [None]:
# detected words: keep the text of WORD-type detections; other detection types become "" placeholders
rek_words = [[text['DetectedText'] if text['Type']=='WORD' else "" for text in texts['TextDetections']] for texts in rek_texts]

# clean_text drops the "" placeholders and returns sorted unique words per image
rek_ingredients = [clean_text(text, split=False) for text in rek_words] # detected words

len(rek_ingredients)

In [None]:
# cache the rekognition detections (one row per image) so the paid API run can be skipped later
rek_output = pd.DataFrame({'detected':rek_ingredients})
rek_output.to_csv('rek_output.csv', index=False)

**--**

In [None]:
# load saved data in csv
rek_data = pd.read_csv('rek_output.csv')

# get detected ingredients: split each stored row on commas and strip every non-letter character
rek_ingredients = [[re.sub('[^a-zA-Z]+', '', e) for e in l.split(",")] for l in rek_data.detected]

# peek at detected ingredients
print(rek_ingredients[:5])
print(len(rek_ingredients))

# get F1 scores for rekognition detections (one per sampled image, against the actual ingredients)
rek_scores = [F1score(rek_ingredients, i, ingredients) for i in range(n)]

print("Average F1-score:", sum(rek_scores)/n)

In [None]:
# images and actual ingredients of the 210 samples rekognition scored best on
rek_imgs, a_rek_ings = top210(rek_scores)

## Textract

In [None]:
# instantiate the boto3 Textract client used by textract()
tex_client = boto3.client('textract')

In [None]:
# run textract on an image
def textract(img):
    """Submit one image to Textract document-text detection and return the raw response.

    `img` is converted with PIL's Image.fromarray, JPEG-encoded in memory and
    sent through the notebook-level `tex_client`.
    """
    jpeg_buffer = BytesIO()
    Image.fromarray(img).save(jpeg_buffer, format="JPEG")
    return tex_client.detect_document_text(Document={"Bytes":jpeg_buffer.getvalue()})

**--skip if already run**

In [None]:
# apply textract to images and report the average time per image
start_time = time.time()
tex_texts = [textract(img) for img in imgs]
# divide the un-truncated elapsed time; int() first would discard fractional seconds
print("--- %s seconds ---" % ((time.time() - start_time)/n)) # 1.354

In [None]:
# detected words: keep the text of WORD blocks; other block types become "" placeholders
tex_words = [[text['Text'] if text['BlockType']=='WORD' else "" for text in texts['Blocks']] for texts in tex_texts]

# clean_text drops the "" placeholders and returns sorted unique words per image
tex_ingredients = [clean_text(text, split=False) for text in tex_words] # detected words

len(tex_ingredients)

In [None]:
# cache the textract detections (one row per image) so the paid API run can be skipped later
tex_output = pd.DataFrame({'detected':tex_ingredients})
tex_output.to_csv('tex_output.csv', index=False)

**--**

In [None]:
# load saved data in csv
tex_data = pd.read_csv('tex_output.csv')

# get detected ingredients: split each stored row on commas and strip every non-letter character
tex_ingredients = [[re.sub('[^a-zA-Z]+', '', e) for e in l.split(",")] for l in tex_data.detected]

# peek at detected ingredients
print(tex_ingredients[:5])
print(len(tex_ingredients))

# get F1 scores for textract detections (one per sampled image, against the actual ingredients)
tex_scores = [F1score(tex_ingredients, i, ingredients) for i in range(n)]

print("Average F1-score:", sum(tex_scores)/n)

In [None]:
# images and actual ingredients of the 210 samples textract scored best on
tex_imgs, a_tex_ings = top210(tex_scores)

## High Quality Images

### Helpful Functions

In [None]:
# run each model on top images
def run_all(imgs, n):
    """Run pytesseract, Rekognition and Textract on `imgs` and return their cleaned words.

    imgs : images to feed to every OCR model.
    n    : divisor for the reported average seconds per image (callers pass
           the number of images).

    Prints the mean time per image and the number of results for each model.
    Returns [pytesseract words, rekognition words, textract words]; each entry
    is a list holding one cleaned word list per image.
    """
    # apply pytesseract to images and report time (fractional seconds kept, no int() truncation)
    start_time = time.time()
    p_texts = [pytess(img) for img in imgs]
    print("--- pyt %s seconds ---" % ((time.time() - start_time)/n))
    p_ingredients = [clean_text(text, split=False) for text in p_texts]
    print(len(p_ingredients))

    # apply rekognition to images and report time
    start_time = time.time()
    r_texts = [rekogn(img) for img in imgs]
    print("--- rek %s seconds ---" % ((time.time() - start_time)/n))
    r_words = [[text['DetectedText'] if text['Type']=='WORD' else "" for text in texts['TextDetections']] for texts in r_texts]
    r_ingredients = [clean_text(text, split=False) for text in r_words] # detected words
    print(len(r_ingredients))

    # apply textract to images and report time
    start_time = time.time()
    t_texts = [textract(img) for img in imgs]
    print("--- tex %s seconds ---" % ((time.time() - start_time)/n))
    t_words = [[text['Text'] if text['BlockType']=='WORD' else "" for text in texts['Blocks']] for texts in t_texts]
    t_ingredients = [clean_text(text, split=False) for text in t_words] # detected words
    print(len(t_ingredients))

    return [p_ingredients, r_ingredients, t_ingredients]

In [None]:
# get top scores for each model on a set of top detections
def top_scores(top_detections, top_ingredients):
    """Score every model's detections on one set of top images.

    top_detections  : [pytesseract, rekognition, textract] detections as
                      returned by run_all, one word list per image each.
    top_ingredients : actual ingredient word lists for the same images.

    Prints the mean F1 score per model and returns the per-image F1 scores,
    one list per model in the same order.
    """
    # number of images comes from the data instead of a hard-coded 210
    n_images = len(top_ingredients)
    top_s = [[F1score(detections, i, top_ingredients) for i in range(n_images)]
             for detections in top_detections]

    for label, model_scores in zip(("pyt", "rek", "tex"), top_s):
        print("Average F1-scores (%s):" % label, sum(model_scores)/n_images if n_images else 0)

    return top_s

### Scores for Top Pytesseract Images

In [None]:
# re-run all three OCR models on pytesseract's top images; result is [pyt, rek, tex] word lists
top_pyt_ingredients = run_all(pyt_imgs, 210)

In [None]:
# per-image F1 of each model on pytesseract's top images, in [pyt, rek, tex] order
top_pyt_scores = top_scores(top_pyt_ingredients,a_pyt_ings)

### Scores for Top Rekognition Images

In [None]:
# re-run all three OCR models on rekognition's top images; result is [pyt, rek, tex] word lists
top_rek_ingredients = run_all(rek_imgs, 210)

In [None]:
# per-image F1 of each model on rekognition's top images, in [pyt, rek, tex] order
top_rek_scores = top_scores(top_rek_ingredients,a_rek_ings)

### Scores for Top Textract Images

In [None]:
# re-run all three OCR models on textract's top images; result is [pyt, rek, tex] word lists
top_tex_ingredients = run_all(tex_imgs, 210)

In [None]:
# per-image F1 of each model on textract's top images, in [pyt, rek, tex] order
top_tex_scores = top_scores(top_tex_ingredients,a_tex_ings)

### Average Scores for All Top Images

In [None]:
# model names in the same order as the score lists returned by top_scores
model = ['Pytesseract', 'Rekognition', 'Textract']

# final metric per model: mean F1 over the three top-image sets (3 sets x 210 images)
for i in range(3):
    print(model[i], (sum(top_pyt_scores[i]) + sum(top_rek_scores[i]) + sum(top_tex_scores[i])) / (3*210))

## Results
- Pytesseract has an average F1 score of 0.728 and speed of 1.01 seconds/image
    - 0.84881 for top 210 
    - 0.31855 for full sample
- Rekognition has an average F1 score of 0.878 and speed of 4.67 seconds/image  
    - 0.92443 for top 210 
    - 0.50114 for full sample 
- Textract has an average F1 score of 0.868 and speed of 1.35 seconds/image  
    - 0.91978 for top 210 
    - 0.49600 for full sample

# NLP Evaluation 

Since Textract does well on time and accuracy, we will move forward with evaluating NLP models on both the complete sample of Textract ingredient detections and the top scoring Textract detections. We have 5 sets of allergens that will be used to set the safety of each of the products. For each set, we will evaluate three models to see which one performs the best with respect to various metrics. Specifically, a true positive is when a model correctly signals a product to be safe, since there are no ingredient and allergen matches. On the other hand, a false positive is when a model signals a product to be safe when it is actually unsafe.   

The three models are (1) a perfect matching mechanism, (2) cosine similarity with CountVectorizer character embeddings, and (3) cosine similarity with Chars2Vec character embeddings. For the latter two, we tune the threshold of the cosine similarity scores to observe which setting would result in an acceptable balance between the true positive rate and the false positive rate. When the threshold is higher, the model makes fewer matches between allergens and ingredients, and thus has an increased false positive rate. Basic match represents the accuracy metrics at the highest threshold, where the cosine similarity score equals 1. We aim to find the model and the threshold that would strike the best balance.

### Helpful Functions

In [None]:
# detect the safety of ingredients for a given similarity score 
def dect_safety(safety_func, sim):
    """Classify every product for every person with `safety_func` at threshold `sim`.

    Reads the notebook-level `allergens`, `top_tex_ings` and `tex_ingredients`.

    safety_func : callable(allergen list, ingredient list, sim) returning a label.
    sim         : cosine-similarity threshold forwarded to `safety_func`.

    Returns [top_safety, all_safety, t_time]: labels indexed [person][product]
    for the top and the complete detections, and the mean seconds per call.
    """
    start_time = time.time()

    top_safety = [[safety_func(person, ing, sim) for ing in top_tex_ings] for person in allergens] # detected safety for top ingredients
    all_safety = [[safety_func(person, ing, sim) for ing in tex_ingredients] for person in allergens] # detected safety for all ingredients

    # average over the calls actually made (previously hard-coded as 1210*5)
    n_calls = (len(top_tex_ings) + len(tex_ingredients)) * len(allergens)
    t_time = (time.time() - start_time)/n_calls if n_calls else 0.0
    print("--- %s seconds ---" % t_time)

    return [top_safety, all_safety, t_time]

In [None]:
# positive: safe ; negative: unsafe
# true positives: #(classified safe products that are actually safe) 
# false positives: #(classified safe products that are actually unsafe) <- NOT GOOD 
# true negatives: #(classified unsafe products that are actually unsafe)
# false negatives: #(classified unsafe products that are actually safe)

def pred_stats(pred_safety, true_safety):
    """Confusion counts and precision/recall/F1 with "safe" as the positive class.

    pred_safety : predicted labels, one per product.
    true_safety : true labels ("safe"/"unsafe"), aligned with `pred_safety`.

    Predictions that are neither "safe" nor "unsafe" (e.g. the "Nothing
    detected" message) are not counted in any cell.

    Returns (tp, fp, tn, fn, precision, recall, f1score). Precision and recall
    are 0 when their denominator is 0 instead of raising ZeroDivisionError.
    """
    tp = fp = tn = fn = 0
    for pred, true in zip(pred_safety, true_safety):
        if pred == "safe" and true == "safe":
            tp += 1
        elif pred == "safe" and true == "unsafe":
            fp += 1
        elif pred == "unsafe" and true == "unsafe":
            tn += 1
        elif pred == "unsafe" and true == "safe":
            fn += 1

    p = tp/(tp+fp) if (tp+fp) else 0  # nothing was predicted safe
    r = tp/(tp+fn) if (tp+fn) else 0  # nothing was actually safe
    f1score = (2*p*r)/(p+r) if (p+r) else 0

    return(tp,fp,tn,fn,p,r,f1score)

### Allergens and True Safety Rating

In [None]:
# allergen lists are lowercase because every ingredient token is lowercased during cleaning;
# capitalised entries could never match
person_A = ['milk', 'cheese', 'soy', 'cream', 'eggs']
# NOTE(review): 'tree nuts' is two words and cannot equal a single-word token — confirm intent
person_B = ['peanuts', 'whey', 'tree nuts']
person_C = ['shrimp', 'prawns', 'lobster', 'crab']
person_D = ['wheat', 'barley', 'rye', 'triticale']  # fixed 'barely' typo and the missing comma in 'rye triticale'
person_E = ['garlic', 'avocado', 'celery']

allergens = [person_A, person_B, person_C, person_D, person_E]

In [None]:
# boolean of presence of allergens, indexed [person][product][allergen]:
# True when the allergen string is an element of the product's actual ingredient word list (exact match)
top_bool_safety = [[[a in ing for a in person] for ing in a_tex_ings] for person in allergens] # for top ingredients 
bool_safety = [[[a in ing for a in person] for ing in ingredients] for person in allergens] # for all ingredients 

# true safety of each product: "unsafe" as soon as one allergen of the person is present
top_true_safety = [["unsafe" if sum(s)>0 else "safe" for s in person] for person in top_bool_safety]
true_safety = [["unsafe" if sum(s)>0 else "safe" for s in person] for person in bool_safety] 

# number of products labelled for the first person in each set
print(len(top_true_safety[0]))
print(len(true_safety[0]))

In [None]:
# actual safety for persons 
# per-person counts of truly safe / unsafe products, for the top set and the full sample
print("top safe", [sum([s == "safe" for s in person]) for person in top_true_safety], "top unsafe", [sum([s == "unsafe" for s in person]) for person in top_true_safety])
print("all safe", [sum([s == "safe" for s in person]) for person in true_safety], "all unsafe", [sum([s == "unsafe" for s in person]) for person in true_safety])


In [None]:
# index 2 is the Textract output of run_all, here on Textract's own top images
top_tex_ings = top_tex_ingredients[2] # top detected ingredients
#tex_ingredients <- all detected ingredients 

## Basic Matching

In [None]:
# apply perfect matching
def safety_basic (allergens, ingredients):
    """Exact-match safety check.

    Returns the retake message when `ingredients` is empty or [''], "unsafe"
    when any allergen is an element of `ingredients`, and "safe" otherwise.
    """
    nothing_detected = (len(ingredients) == 0 or ingredients == [''])
    if nothing_detected:
        return "Nothing detected. Please retake photo."
    for allergen in allergens:
        if allergen in ingredients:
            return "unsafe"
    return "safe"

In [None]:
start_time = time.time()

top_basic_safety = [[safety_basic(person, ing) for ing in top_tex_ings] for person in allergens] # detected safety for top ingredients for each person
basic_safety = [[safety_basic(person, ing) for ing in tex_ingredients] for person in allergens] # detected safety for all ingredients for each person

# mean seconds per safety_basic call; the call count is derived from the data instead of the hard-coded 1210*5
print("--- %s seconds ---" % ((time.time() - start_time)/((len(top_tex_ings) + len(tex_ingredients))*len(allergens)))) # 0.000003

In [None]:
# sanity check: number of products classified for the first person (top set, then full sample)
print(len(top_basic_safety[0]))
print(len(basic_safety[0]))

In [None]:
# one (tp, fp, tn, fn, precision, recall, f1score) tuple per person; range(5) walks the five allergen lists
top_basic_scores = [pred_stats(top_basic_safety[i], top_true_safety[i]) for i in range(5)] # scores for top ingredients for each person 
basic_scores = [pred_stats(basic_safety[i], true_safety[i]) for i in range(5)] # scores for all ingredients for each person 

# sanity check: each score tuple has 7 entries
print(len(top_basic_scores[0]))
print(len(basic_scores[0]))

## CountVectorizer

In [None]:
def safety_count (allergens, ingredients, sim):
    """Safety check based on cosine similarity of character-count vectors.

    Returns the retake message when `ingredients` is empty or [''], "unsafe"
    when any allergen/ingredient pair has cosine similarity above `sim`, and
    "safe" otherwise.
    """
    if len(ingredients) == 0 or ingredients == ['']:
        return "Nothing detected. Please retake photo."

    n_allergens = len(allergens)
    # vectorise allergens and ingredients together so both share one character vocabulary
    embeddings = CountVectorizer(analyzer='char').fit_transform(allergens + ingredients)
    cos_sims = cosine_similarity(embeddings[:n_allergens], embeddings[n_allergens:])
    # a single pair above the threshold is enough to flag the product
    if (cos_sims > sim).any():
        return "unsafe"
    return "safe"

In [None]:
# cosine-similarity thresholds to sweep for the CountVectorizer model
count_sims = [0.75, 0.80, 0.85, 0.90, 0.95]
# one [top_safety, all_safety, seconds-per-call] triple per threshold
count_safes = [dect_safety(safety_count, sim) for sim in count_sims]
top_count_safety = [count_s[0] for count_s in count_safes] # detected safety for top ingredients for every cosine threshold
count_safety = [count_s[1] for count_s in count_safes] # detected safety for all ingredients for every cosine threshold
# mean seconds per call across the 5 thresholds
print(sum([count_s[2] for count_s in count_safes])/5) # 0.00230

In [None]:
# sanity check: products classified for the first threshold and first person (top set, then full sample)
print(len(top_count_safety[0][0]))
print(len(count_safety[0][0]))

In [None]:
# indexed [threshold][person]; both ranges are 5 because there are 5 thresholds and 5 persons
top_count_scores = [[pred_stats(top_count_safety[s][i], top_true_safety[i]) for i in range(5)] for s in range(5)] # scores for top ingredients for each cosine and each person
count_scores = [[pred_stats(count_safety[s][i], true_safety[i]) for i in range(5)] for s in range(5)] # scores for all ingredients for each cosine and each person

# sanity check: each score tuple has 7 entries
print(len(top_count_scores[0][0]))
print(len(count_scores[0][0]))

## Chars2Vec

In [None]:

_C2V_MODEL_CACHE = {}


def _get_c2v_model(name='eng_50'):
    """Load a chars2vec model the first time it is requested and reuse it afterwards."""
    if name not in _C2V_MODEL_CACHE:
        _C2V_MODEL_CACHE[name] = chars2vec.load_model(name)
    return _C2V_MODEL_CACHE[name]


def safety_chars (allergens, ingredients, sim):
    """Safety check based on cosine similarity of chars2vec word embeddings.

    Returns the retake message when `ingredients` is empty or [''], "unsafe"
    when any allergen/ingredient pair has cosine similarity above `sim`, and
    "safe" otherwise.
    """
    if len(ingredients) == 0 or ingredients == ['']:
        return "Nothing detected. Please retake photo."
    else:
        # the model is loaded once and cached; reloading it on every call dominated the runtime
        c2v_model = _get_c2v_model('eng_50')
        words = allergens + ingredients # create list of allergens and ingredients
        word_embeddings = c2v_model.vectorize_words(words) # embeddings for allergens and ingredients
        cos_sims = cosine_similarity(word_embeddings[:len(allergens)], word_embeddings[len(allergens):]) # cos sim of allergens and ingredients
        counts = [sum(all_sims) for all_sims in cos_sims > sim] # per allergen: number of ingredients whose cos sim exceeds 'sim'
        if sum(counts) > 0:
            return "unsafe"
        else: return "safe"

In [None]:
# smoke test on one product: the function must be called, not subscripted, and takes a single ingredient list
safety_chars(person_A, top_tex_ings[0], 0.85)

In [None]:
# cosine-similarity thresholds to sweep for the Chars2Vec model
chars_sims = [0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95]
# one [top_safety, all_safety, seconds-per-call] triple per threshold
chars_safes = [dect_safety(safety_chars, sim) for sim in chars_sims]
top_chars_safety = [chars_s[0] for chars_s in chars_safes] # detected safety for top ingredients for every cosine threshold
chars_safety = [chars_s[1] for chars_s in chars_safes] # detected safety for all ingredients for every cosine threshold
# mean seconds per call across all thresholds (there are 7 here, so dividing by 5 overstated it)
print(sum([chars_s[2] for chars_s in chars_safes])/len(chars_safes)) # 0.000956

In [None]:
# sanity check: products classified for the first threshold and first person (top set, then full sample)
print(len(top_chars_safety[0][0]))
print(len(chars_safety[0][0]))

In [None]:
# indexed [threshold][person]; iterate over every threshold in chars_sims
# (range(5) silently dropped the 0.90 and 0.95 thresholds) and every person
top_chars_scores = [[pred_stats(top_chars_safety[s][i], top_true_safety[i]) for i in range(len(allergens))] for s in range(len(chars_sims))] # scores for top ingredients for each cosine and each person
chars_scores = [[pred_stats(chars_safety[s][i], true_safety[i]) for i in range(len(allergens))] for s in range(len(chars_sims))] # scores for all ingredients for each cosine and each person

# sanity check: each score tuple has 7 entries
print(len(top_chars_scores[0][0]))
print(len(chars_scores[0][0]))

## ROC and AUC 

For different thresholds of the cosine similarity score, we can draw an ROC curve to determine the best threshold for the tradeoff we are willing to accept. We can also decide to use CountVectorizer or Chars2Vec based on which one has a greater AUC.

In [None]:
# sensitivity = recall =  tp/(tp+fn) : true safe detections out of all safe
def tpr(tex_scores):
    """Mean true-positive rate per threshold.

    tex_scores[t][p] is the (tp, fp, tn, fn, precision, recall, f1score) tuple
    for threshold t and person p. Returns one value per threshold: the recall
    (index 5) averaged over that threshold's persons. Works for any number of
    thresholds and persons rather than exactly 5 of each.
    """
    return [sum(scores[5] for scores in per_person)/len(per_person) for per_person in tex_scores]
  
# 1-specificity = false positive rate = fp/(fp+tn) : false safe detections out of all unsafe
def fpr(tex_scores):
    """Mean false-positive rate fp/(fp+tn) per threshold.

    tex_scores[t][p] is the (tp, fp, tn, fn, precision, recall, f1score) tuple
    for threshold t and person p. Persons with fp+tn == 0 add nothing to the
    sum but still count in the divisor, as before. Works for any number of
    thresholds and persons rather than exactly 5 of each.
    """
    return [
        sum(scores[1]/(scores[1]+scores[2]) for scores in per_person if (scores[1]+scores[2])!=0)/len(per_person)
        for per_person in tex_scores
    ]
    
# (0:tp, 1:fp, 2:tn, 3:fn, 4:p, 5:r, 6:f1score) 

In [None]:
# notebook-wide plot styling: fivethirtyeight theme with a sans-serif font
plt.style.use('fivethirtyeight')
matplotlib.rcParams['font.family'] = "sans-serif"

def plotROC (tex_scores1, tex_scores2, tex_scores3):
    """Plot ROC curves for CountVectorizer and Chars2Vec plus the basic-match point.

    tex_scores1 : CountVectorizer scores, indexed [threshold][person].
    tex_scores2 : Chars2Vec scores, indexed [threshold][person].
    tex_scores3 : basic-matching scores, one tuple per person.
    Every tuple is (tp, fp, tn, fn, precision, recall, f1score).

    Returns the pyplot module so the caller can call .show() on it.
    """
    x1 = fpr(tex_scores1)
    y1 = tpr(tex_scores1)

    x2 = fpr(tex_scores2)
    y2 = tpr(tex_scores2)

    fig, ax = plt.subplots(figsize=(8,5))

    # basic matching has no threshold, so it is a single point averaged over persons
    n_persons = len(tex_scores3)
    x3 = sum([scores[1]/(scores[1]+scores[2]) for scores in tex_scores3 if (scores[1]+scores[2])!=0])/n_persons
    y3 = sum([scores[5] for scores in tex_scores3])/n_persons

    plt.ylabel("True Positive Rate", color="#447b72", weight="bold")
    plt.yticks(fontsize=12, color="#234943")
    plt.xlabel("False Positive Rate", color="#447b72", weight="bold")
    plt.xticks(fontsize=12, color="#234943")

    plt.plot(x3, y3, 'Db', label='Basic Match', alpha=0.75)  # plotted first so it leads the legend
    plt.plot(x1, y1, 'c', label='CountVectorizer')
    plt.plot(x2, y2, 'y', label='Chars2Vec')
    plt.plot(x3, y3, 'Db')  # drawn again, unlabeled, so the marker sits on top of the curves

    legend = ax.legend(loc='upper left', edgecolor='#447b72', shadow=True, prop={'weight':'bold', 'size':10})
    frame = legend.get_frame()
    frame.set_facecolor("white")

    for text in legend.get_texts():
        text.set_color("#447b72")

    # shade the area under each curve; close the polygon at the last threshold
    # (index -1) so any number of thresholds works, not only 5.
    # `color` sets both face and edge colour, so the facecolor/edgecolor it overrode are dropped.
    verts1 = [(x1[0], 0), *zip(x1, y1), (x1[-1], 0)]
    poly1 = Polygon(verts1, color="c", alpha=0.1)
    ax.add_patch(poly1)

    verts2 = [(x2[0], 0), *zip(x2, y2), (x2[-1], 0)]
    poly2 = Polygon(verts2, color="y", alpha=0.1)
    ax.add_patch(poly2)

    fig.set_facecolor('white')
    ax.set_facecolor("white")

    return plt

In [None]:
# ROC for the complete sample; plotROC returns the pyplot module, so `plt` still refers to pyplot afterwards
plt = plotROC(count_scores, chars_scores, basic_scores) # all ingredients 

plt.show()

In [None]:
# ROC for the top Textract detections; plotROC returns the pyplot module, so `plt` still refers to pyplot afterwards
plt = plotROC(top_count_scores, top_chars_scores, top_basic_scores) # top ingredients 

plt.show()