In [60]:
from config import *
import cv2
import pytesseract
import re
from nltk.corpus import stopwords
import requests
import pandas as pd


"""
Steps:
1. user uploads photo of ingredients ✓        
2. convert photo to text ✓
3. convert text to list ✓
4. call Cosmily API for EWG analysis ✓
5. return ingredients report to user

Error Handling:
- user input is a jpg, jpeg, or png file.
- Blurry Image
- Ingredients are not found in Cosmily database
- Image is not of ingredients or does not have any text in it

If extra time:
- scrape a website for a product's ingredients
- add dictionary instead of hardcoded phrasing
"""

"\nSteps:\n1. user uploads photo of ingredients ✓        \n2. convert photo to text ✓\n3. convert text to list ✓\n4. call Cosmily API for EWG analysis ✓\n5. return ingredients report to user\n\nError Handling:\n- user input is a jpg, jpeg, or png file.\n- Blurry Image\n- Ingredients are not found in Cosmily database\n- Image is not of ingredients or does not have any text in it\n\nIf extra time:\n- scrape a website for a product's ingredients\n- add dictionary instead of hardcoded phrasing\n"

In [61]:
# Relative path of the image that is handed to convert_image_to_text below.
image_path = './images/sensodyne.png'

In [62]:
def convert_image_to_text(path):
    """Run OCR on the image at ``path`` and split the recognised text into fragments.

    Args:
        path: Location of the image file, passed straight to ``cv2.imread``.

    Returns:
        list[str]: Every maximal run of characters in the OCR output that
        contains none of ``.``, ``,``, ``:``, ``/`` or a newline. Fragments are
        not stripped, so they may carry leading or trailing spaces.

    Raises:
        ValueError: If ``cv2.imread`` returns ``None`` for ``path``.
    """
    # cv2.imread reports an unreadable file by returning None rather than
    # raising, so check explicitly instead of failing later with an opaque error.
    img = cv2.imread(path)
    if img is None:
        raise ValueError(
            f"Could not read an image from {path!r}: "
            "the file is missing or is not a supported image format"
        )

    # reorder the channels from BGR to RGB before handing the image to OCR
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # convert image to string
    # custom_config = r'--oem 3 --psm 6' --> add config param if words are too blurry
    ocr_text = pytesseract.image_to_string(img_rgb)
    return re.findall(r'[^.,:/\n]+', ocr_text)

# OCR the sample image; `text` is the list of raw fragments returned by convert_image_to_text.
text = convert_image_to_text(image_path)

In [63]:
def preprocess(text):
    """Filter OCR fragments down to ingredient names and join them.

    Args:
        text: Iterable of string fragments (e.g. the list produced by
            ``convert_image_to_text``).

    Returns:
        str: The surviving fragments joined with ``', '``. A fragment is
        dropped when, after surrounding whitespace is removed, it is empty,
        contains the word "ingredients" (any casing), or is an English stop
        word (any casing). An empty input yields ``''``.
    """
    stop_words = set(stopwords.words('english'))
    ingredients_list = []

    for item in text:
        # strip both ends so trailing OCR whitespace does not leak into the output
        item = item.strip()
        lowered = item.lower()

        # whitespace-only fragments would otherwise show up as empty entries
        if not item:
            continue
        # the stop-word list is lowercase, so compare case-insensitively
        if "ingredients" in lowered or lowered in stop_words:
            continue

        ingredients_list.append(item)

    return ', '.join(ingredients_list)

# `ingredients` is the single comma-separated string returned by preprocess.
ingredients = preprocess(text)

In [64]:
def analyze(ingredients, timeout=30):
    """Request an ingredient-list analysis from the Cosmily API.

    Args:
        ingredients: Ingredient list sent as the ``ingredients`` request
            parameter (the notebook passes the comma-separated string from
            ``preprocess``).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely.

    Returns:
        The value stored under the ``'analysis'`` key of the JSON response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        KeyError: If the JSON response has no ``'analysis'`` key.
    """
    # get analysis of ingredient list from cosmily database api
    api_url = 'https://api.cosmily.com/api/v1/analyze/ingredient_list'
    headers = {"Content-Type":"application/json", "Authorization": AUTH_TOKEN}
    params = {"ingredients": ingredients}
    response = requests.post(api_url, params=params, headers=headers, timeout=timeout)

    # fail loudly on HTTP errors instead of with a confusing KeyError/JSON error below
    response.raise_for_status()
    return response.json()['analysis']

# `analysis` is the 'analysis' entry of the API's JSON response (see analyze).
analysis = analyze(ingredients)

In [65]:
def report(analysis):
    """Print the given analysis object unchanged.

    Placeholder for the user-facing ingredients report: whatever is passed in
    is written to stdout with ``print``. Returns ``None``.
    """
    # Planned report layout (none of this is implemented yet):
    #   - description = f'This product contains {analysis["total_ingredients"]} ingredients.'
    #     followed by analysis['description']
    #   - TODO: EWG stats (analysis["ewg"])
    #   - "DETRIMENTS" -- TODO: loop through and list negatives and harmful
    #   - "BENEFITS" -- TODO: loop through and list positives and notables
    #   - TODO: loop through each ingredient in ingredients_table and provide breakdown;
    #     ask "Would you like to view the full ingredients data breakdown? (Y/N)" and,
    #     on 'y', show pd.DataFrame(analysis["ingredients_table"])
    #   - TODO: End report
    print(analysis)

# report() currently just prints its argument, so passing the keys view lists the top-level fields of `analysis`.
report(analysis.keys())

dict_keys(['total_ingredients', 'ingredients_table', 'harmful', 'positive', 'notable', 'ewg', 'natural', 'text', 'description', 'share_token', 'ingredient_group', 'owner_user', 'permissions'])


<b>Description of main findings</b>


In [66]:
# Print the text stored under the 'description' key of the analysis.
print(analysis['description'])

The product contains 14 ingredients. INCI: Aqua, Hydrated Silica, Sorbitol, and more. We have found polyethylene glycol (PEG), synthetic fragrances.


<b>Ingredient breakdown</b>

In [67]:
def _ewg_decision(ingredient):
    """Return ``ingredient['ewg']['decision']``, or ``'N/A'`` when it is unavailable."""
    try:
        return ingredient['ewg']['decision']
    except (KeyError, TypeError):
        # KeyError: 'ewg' or 'decision' is missing.
        # TypeError: 'ewg' is present but cannot be indexed by key (e.g. None).
        return 'N/A'


# One row per entry of the analysis' ingredients table: name, alias and EWG decision.
ingredients_dictlist = list(analysis['ingredients_table'])

ings = [ing['title'] for ing in ingredients_dictlist]
ali = [ing['alias'] for ing in ingredients_dictlist]
decis = [_ewg_decision(ing) for ing in ingredients_dictlist]

# Build the frame in one step rather than filling an empty frame column by column.
ingdf = pd.DataFrame({'Name': ings, 'Alias': ali, 'Decision': decis})


ingdf

Unnamed: 0,Name,Decision
0,Aqua,Safe
1,Hydrated Silica,Safe
2,Sorbitol,Safe
3,Potassium Nitrate,Safe
4,Glycerin,Safe
5,PEG-6,Safe - Moderate hazard
6,Sodium Lauryl Sulphate,
7,Aroma,High hazard
8,Titanium Dioxide,Safe - Moderate hazard
9,Xanthan Gum,Safe


<b>Beneficial aspects</b>

In [68]:
# One row per positive effect: its title, its description and every ingredient
# listed for it (comma-separated).
pos = []
desc = []
ings = []
for positive in analysis['positive']:
    effect = analysis['positive'][positive]
    titles = [member['title'] for member in effect['list']]

    # An effect without ingredients has nothing to show; skipping it also avoids
    # the IndexError that indexing [0] on an empty list would raise.
    if not titles:
        continue

    pos.append(effect['title'])
    desc.append(effect['description'])
    ings.append(', '.join(titles))

df = pd.DataFrame({'Positive': pos, 'Description': desc, 'Ingredients': ings})


df

Unnamed: 0,Positive,Description,Ingredients
0,Moisturising,Hydrates the skin,Sorbitol
1,Softening,Improves texture,Sorbitol
2,Soothing,Reduces irritation,Sorbitol
3,Antiseptic,Kills bacteria,Xanthan Gum
4,UV Protection,Blocks sun damage,Titanium Dioxide


<b>Harmful ingredients</b>

In [78]:
# Flatten the harmful groups into one row per (group, ingredient) pair.
harmful_groups = analysis['harmful']

name, desc, ings = [], [], []
for group_key in harmful_groups:
    group = harmful_groups[group_key]
    # a group with an empty 'list' contributes no rows
    for member in group['list']:
        name.append(group['title'])
        desc.append(group['description'])
        ings.append(member['title'])

harmdf = pd.DataFrame(columns=['Name','Harmful Ingredient','Description'])
harmdf['Name'] = name
harmdf['Harmful Ingredient'] = ings
harmdf['Description'] = desc

harmdf

Unnamed: 0,Name,Harmful Ingredient,Description
0,PEG,PEG-6,Polyethylene glycol
1,Fragrance,Aroma,Complex chemicals
