In [1]:
'''
import libraries and set print options
'''
import pandas as pd
import numpy as np
import json
import re
import traceback
pd.set_option('display.max_rows', 1000)

# Exploratory Data Analysis

In [2]:
data = pd.read_csv('../data/products.csv') ##read csv as a dataframe
data.head() ## show the top 5

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,NDB_Number,long_name,data_source,gtin_upc,manufacturer,date_modified,date_available,ingredients_english
0,45001524,MOCHI ICE CREAM BONBONS,LI,19022128593,"G. T. Japan, Inc.",2017-11-15 19:19:38,2017-11-15 19:19:38,"ICE CREAM INGREDIENTS: MILK, CREAM, SUGAR, STR..."
1,45001528,CHIPOTLE BARBECUE SAUCE,LI,5051379043735,FRESH & EASY,2018-04-26 17:23:31,2018-04-26 17:23:31,"WATER, SUGAR, TOMATO PASTE, MOLASSES, DISTILLE..."
2,45001529,HOT & SPICY BARBECUE SAUCE,LI,5051379009434,FRESH & EASY,2018-04-26 18:17:37,2018-04-26 18:17:37,"SUGAR, WATER, DISTILLED VINEGAR, TOMATO PASTE,..."
3,45001530,BARBECUE SAUCE,LI,5051379019969,FRESH & EASY,2018-04-26 17:24:00,2018-04-26 17:24:00,"TOMATO PUREE (WATER, TOMATO PASTE), SUGAR, DIS..."
4,45001531,BARBECUE SAUCE,LI,5051379009526,FRESH & EASY,2018-04-26 17:47:41,2018-04-26 17:47:41,"SUGAR, DISTILLED VINEGAR, WATER, TOMATO PASTE,..."


In [3]:
titles = data.long_name.tolist() ## get the long_name column into a list
len(titles) ## print the length

239089

In [4]:
possible_brands = [x.split(",")[0] for x in titles if len(x.split(",")) > 1] ## what is the 0th index for items with commas
possible_brands[:10]

['FRESH & EASY',
 'FRESH & EASY',
 'STATER BROS.',
 'STATER BROS.',
 'STATER BROS.',
 'STATER BROS.',
 'GREAT MIDWEST',
 'GREAT MIDWEST',
 'ICE CREAM',
 'FRESH & EASY']

In [5]:
possible_brands = [x for x in titles if len(x.split(",")) == 4]
print (len(possible_brands))
possible_brands[:10]

3213


['KROGER, CHIPMATES, COOKIES, CHOCOLATE CHIPS',
 "NEUMAN'S, CARROT PINEAPPLE BREAD, CARROT, PINEAPPLE",
 'SPECIALLY SELECTED, STONE BAKED PIZZA, MOZZARELLA,CHERRY TOMATO AND ARGULA',
 "ALBERTSON'S, ICE CREAM, ROCKY ROAD, CHOCOLATE ICE CREAM WITH MARSHMALLOW RIBBON AND MIXED NUTS",
 'NUMI, DECAFFEINATED SAVORY, GREEN TEA BAGS, FENNEL SPICE',
 'LANCE, TOAST CHEE, CRACKERS, PEANUT BUTTER',
 'WHITE GOLD, SUGAR, PURE CANE, EXTRA FINE GRANULATED',
 'HARRIS TEETER, FRESH FOODS MARKET, ARTISAN HUMMUS, CARAMELIZED ONIONS',
 'HARRIS TEETER, FRESH FOODS MARKET, KOSHER DILL SPEARS, HOT & SPICY',
 "SAM'S CHOICE, WAFFLE COOKIES, CINNAMON, HONEY"]

## RULES:
### - The zeroth index seems to be brand for the most part (few exceptions)
### - When there are 3 items, the first index is the main product with the 2nd index being a descriptor/flavor/type
### - When there are 4 items (only ~4k examples), the 2nd index is the product, the 0th index is still brand, the 3rd index is the descriptor

# Data Cleaning - Process Is Commented

In [6]:
brands = {x.split(",")[0] for x in titles if len(x.split(",")) > 2 and len(x.split(",")[0]) > 1}
with open('../data/brands.json', 'w') as outfile:
    json.dump(list(brands), outfile)

In [7]:
DROP_COLS = [
    'NDB_Number',
    'data_source',
    'gtin_upc',
    'manufacturer',
    'date_modified',
    'date_available',
    'ingredients_english'
]
data = data.drop(DROP_COLS, axis=1) ## drop unnecessary columns for now
data.head()

Unnamed: 0,long_name
0,MOCHI ICE CREAM BONBONS
1,CHIPOTLE BARBECUE SAUCE
2,HOT & SPICY BARBECUE SAUCE
3,BARBECUE SAUCE
4,BARBECUE SAUCE


In [8]:
'''
Find the number of commas in each title and only get those with less than 5
'''
data['num_commas'] = data.apply(lambda x: len(x.long_name.split(","))-1, axis=1)
data = data[data['num_commas'] < 4]

In [9]:
len(data)

238333

In [10]:
'''
This will keep brands and separate it from the rest of the titles with a ",". EX: "FRESH & EASY, CANOLA OIL"
'''
def remove_descriptor(x):
    x_ = x.split(",")
    if len(x_) == 2 or len(x_) == 3:
        if 'fl oz' in x_[1]:
            return x_[0].strip()
        else:
            return x_[0].strip() + ", " + x_[1].strip()
    elif len(x_) == 4:
        return x_[0].strip() + ", " + x_[2].strip()
    else:
        return x_[0].strip()

In [11]:
'''
This will keep brands and separate it from the rest of the titles with a ",". EX: "FRESH & EASY, CANOLA OIL"
'''
def remove_brand_descriptor(x):
    x_ = x.split(",")
    
    if len(x_) > 1:
        if 'fl oz' in x_[1]:
            return x_[0].strip()
        else:
            return x_[1].strip()
    elif len(x_) == 4:
        return x_[2].strip()
    else:
        return x_[0].strip()

In [12]:
'''
Apply the 2 methods to obtain two different columns with the cleaning process applied
'''
data['no_brand_descriptor_title'] = data.apply(lambda x: remove_brand_descriptor(x.long_name), axis=1)
data['no_descriptor_title'] = data.apply(lambda x: remove_descriptor(x.long_name), axis=1)

In [13]:
data.head()

Unnamed: 0,long_name,num_commas,no_brand_descriptor_title,no_descriptor_title
0,MOCHI ICE CREAM BONBONS,0,MOCHI ICE CREAM BONBONS,MOCHI ICE CREAM BONBONS
1,CHIPOTLE BARBECUE SAUCE,0,CHIPOTLE BARBECUE SAUCE,CHIPOTLE BARBECUE SAUCE
2,HOT & SPICY BARBECUE SAUCE,0,HOT & SPICY BARBECUE SAUCE,HOT & SPICY BARBECUE SAUCE
3,BARBECUE SAUCE,0,BARBECUE SAUCE,BARBECUE SAUCE
4,BARBECUE SAUCE,0,BARBECUE SAUCE,BARBECUE SAUCE


In [14]:
#remove leading symbols
edited_list = data.no_descriptor_title.tolist()
output = [re.sub('[![\]@#$*+\\\\/\'"}{)(]', '', x) for x in edited_list]
output2 = ["0"+x if x[0] == "." else x for x in output]
output3 = [x[1:] if x[0] == " " else x for x in output2]


print (len(output2))
data['no_descriptor_title'] = output3

238333


In [15]:
'''
Turn the dataframe's no brand and no descriptor titles into a list and clean out bad case
'''
no_brand_descrip = [x for x in list(set(data.no_brand_descriptor_title)) if x != ""]

In [16]:
'''
Turn the dataframe's no descriptor titles into a list and clean out bad case
'''
no_descrip = [x for x in list(set(data.no_descriptor_title)) if x != ""]

In [17]:
no_brand_descrip[:10]

['OPTIMUM SLIM CEREAL',
 'GUMMY BUNNY',
 'GERMAN STYLE PICKLES',
 'GIANT EASTER BUNNY',
 'NABULSI SEMI-SOFT CHEESE',
 "Kellogg's Special K Cereal Bars Dark Chocolate Salted Nut 5.5oz",
 'PINE NUTS (PIGNOLAS)',
 'HOT LINK',
 'FRUIT CUPS',
 'MERRY MINT MIX']

In [18]:
ADDITIONAL_DESCRIPTORS = [
    'ORGANIC',
    'NATURAL',
    'NATURALLY',
    'PREMIUM',
    'PURE',
    '100%',
    'FRESH SELECTIONS',
    'HOMESTYLE',
    '®',
    'IMPORTED',
    'QUALITY',
    'ALL',
    'HOME MADE',
    'HOME STYLE',
    'RICH',
    'ORIGINAL',
    'ENRICHED',
    'KOSHER',
]

with open('../data/descriptors.json', 'w') as outfile:
    json.dump(ADDITIONAL_DESCRIPTORS, outfile)

##remove everything with oz? and ct? and lb? and ounce? take out all gatordade entries?
UNITS = [
    'oz',
    'ounce',
    'ct',
    'lb',
]

In [19]:
def extra_descrip_cleaning(data):
    
    ADDITIONAL_DESCRIPTORS = [
        'ORGANIC',
        'NATURAL',
        'NATURALLY',
        'PREMIUM',
        'PURE',
        '100%',
        'FRESH SELECTIONS',
        'HOMESTYLE',
        '®',
        'IMPORTED',
        'QUALITY',
        'ALL',
        'HOME MADE',
        'HOME STYLE',
        'RICH',
        'ORIGINAL',
        'ENRICHED',
        'KOSHER',
    ]
    for new_descrip in ADDITIONAL_DESCRIPTORS:
        for idx, prod in enumerate(data):
            prod_split = prod.upper().split(" ")
            if new_descrip in prod_split:
                prod_split.remove(new_descrip)
                data[idx] = " ".join(prod_split)
    return data

In [20]:
def clean_units(data):
    ##remove everything with oz? and ct? and lb? and ounce? take out all gatordade entries?
    UNITS = [
        'OZ',
        'OUNCE',
        'CT',
        'LB',
    ]
    for unit in UNITS:
        for idx, prod in enumerate(data):
            split_prod = prod.upper().split(" ")
            if unit in split_prod:
                try:
                    for i, word in enumerate(split_prod):
                        if unit in split_prod[i]:
                            del split_prod[i]
                            del split_prod[i-1]
                            data[idx] = " ".join(split_prod)
                except Exception:
                    print (traceback.print_exc())
                    print ("failed on: ", split_prod)
    return data

In [21]:
def remove_gatorade(data):
    return [x for x in data if "GATORADE" not in x.upper()]

In [22]:
cleaned_no_brand_descrip = set(extra_descrip_cleaning(no_brand_descrip))
print ("Cleaned out %i product titles" %(len(no_brand_descrip) - len(cleaned_no_brand_descrip)))

cleaned_no_brand_descrip_2 = set(clean_units(list(cleaned_no_brand_descrip)))
print ("Cleaned out %i product titles" %(len(cleaned_no_brand_descrip) - len(cleaned_no_brand_descrip_2)))

cleaned_no_brand_descrip_3 = list(set(remove_gatorade(list(cleaned_no_brand_descrip_2))))
print ("Cleaned out %i product titles" %(len(cleaned_no_brand_descrip_2) - len(cleaned_no_brand_descrip_3)))

Cleaned out 6204 product titles
Cleaned out 541 product titles
Cleaned out 189 product titles


In [23]:
cleaned_no_brand_descrip_3 = [x for x in cleaned_no_brand_descrip_3 if x != ""]
print (len(cleaned_no_brand_descrip_3))

105730


In [25]:
'''
Save the lists as a json file
'''

with open('../data/cleaned_branded_data.json', 'w') as outfile:
    json.dump([x.replace(',', '') for x in no_descrip], outfile)
    
with open('../data/cleaned_data.json', 'w') as outfile:
    json.dump(cleaned_no_brand_descrip_3, outfile)