In [1]:
'''
import libraries and set print options
'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
pd.set_option('display.max_rows', 1000)

# Exploratory Data Analysis - DISREGARD

In [2]:
## read the manually labelled receipt csv once and keep an untouched copy for the export cells
full_data = pd.read_csv('../data/receipt_data_manual.csv')
data = full_data.copy() ## working frame that the cleaning cells below modify
data.head() ## show the top 5

Unnamed: 0,receipt_name,full_product_title,general_product_title,store,database,no_brand_descriptor_title,no_descriptor_title
0,ORGANIC GALA APPLE,organic gala apples,apples,publix,Organic gala apples,Organic gala apples,Organic gala apples
1,ACT STRWBY/BLUEBRY,activia probiotic yogurt,yogurt,publix,DANNON ACTIVIA FIBER PROBIOTIC LOWFAT YOGURT S...,DANNON ACTIVIA FIBER PROBIOTIC LOWFAT YOGURT S...,DANNON ACTIVIA FIBER PROBIOTIC LOWFAT YOGURT S...
2,OREO THINS PISTACH,oreo thins pistachio,oreos,publix,OREO THINS COOKIES PISTACHIO 1X10.1 OZ,OREO THINS COOKIES PISTACHIO 1X10.1 OZ,OREO THINS COOKIES PISTACHIO 1X10.1 OZ
3,OG GW BABY SPINACH,organic baby spinach,spinach,publix,ORGANIC BABY SPINACH,ORGANIC BABY SPINACH,ORGANIC BABY SPINACH
4,PAPAYA MARADOL,papaya,papaya,publix,PAPAYA CHUNKS,PAPAYA CHUNKS,PAPAYA CHUNKS


In [3]:
## Normalise the `database` column in one vectorised pass:
## missing values become '' and every title loses its leading/trailing whitespace.
## Assigning the whole column avoids per-element chained assignment
## (`data.database[i] = ...`), which raises SettingWithCopyWarning and is not
## guaranteed to write through to `data` under pandas copy-on-write.
data['database'] = data['database'].fillna('').str.strip()

In [4]:
titles = data['database'].tolist() ## get the database column into a list
len(titles) ## print the length

448

In [5]:
## for every title that contains at least one comma, keep the text before the first comma
possible_brands = [title.partition(",")[0] for title in titles if "," in title]
possible_brands[:10]

['LINDT',
 "GT'S",
 'LACROIX',
 'LACROIX',
 'Coca-Cola Zero Sugar Bottle',
 'Sprite Bottle',
 'Coca-Cola Cherry Bottles',
 "CARROLL SHELBY'S",
 'RED BARRON',
 'TERANA']

In [6]:
## titles made of exactly 4 comma-separated parts, i.e. exactly 3 commas
possible_brands = [title for title in titles if title.count(",") == 3]
print(len(possible_brands))
possible_brands

2


['RED BARRON, PIZZA, SAUSAGE, SUPREME', 'BARCEL, TAKIS, TORTILLA CHIPS, HOT']

## RULES (each title is split on commas into parts, indexed from 0):
### - Index 0 seems to be the brand for the most part (with a few exceptions)
### - When there are 3 parts, index 1 is the main product and index 2 is a descriptor/flavor/type
### - When there are 4 parts (only ~4k examples), index 0 is still the brand, index 2 is the product, and index 3 is the descriptor

# Data Cleaning - Process Is Commented

In [7]:
## receipt-side columns that the title cleaning below does not need
DROP_COLS = ['receipt_name', 'full_product_title', 'general_product_title', 'store']
## the input file may already carry the two derived title columns; drop them as well,
## they are recomputed further down
if 'no_brand_descriptor_title' in data.columns:
    DROP_COLS += ['no_brand_descriptor_title', 'no_descriptor_title']
data = data.drop(columns=DROP_COLS) ## drop unnecessary columns for now
data.head()

Unnamed: 0,database
0,Organic gala apples
1,DANNON ACTIVIA FIBER PROBIOTIC LOWFAT YOGURT S...
2,OREO THINS COOKIES PISTACHIO 1X10.1 OZ
3,ORGANIC BABY SPINACH
4,PAPAYA CHUNKS


In [8]:
'''
Count the commas in each title and blank out every title that has 4 or more commas
(5 or more comma-separated parts). Rows are kept, only the title is emptied,
so the frame stays aligned row-for-row with full_data.
'''
data['num_commas'] = data['database'].str.count(",")

## .loc with a boolean mask writes straight into `data`; the previous per-row
## `data.database[index] = ''` was chained assignment.
## num_commas is intentionally left as the original count for the blanked rows.
data.loc[data['num_commas'] >= 4, 'database'] = ''

In [9]:
len(data) ## row count; titles with too many commas were blanked above, not dropped

448

In [10]:
def remove_descriptor(x):
    '''
    Drop the descriptor from a comma-separated title but keep the brand,
    joined to the product with ", ". EX: "FRESH & EASY, CANOLA OIL"

    The result depends on how many comma-separated parts the title has:
      - 2 or 3 parts: "part0, part1", or just part0 when part1 contains 'fl oz'
      - 4 parts:      "part0, part2"
      - otherwise:    part0 only
    Every returned part is stripped of surrounding whitespace.
    '''
    parts = [piece.strip() for piece in x.split(",")]
    brand = parts[0]
    n_parts = len(parts)

    if n_parts == 4:
        return brand + ", " + parts[2]
    if n_parts in (2, 3) and 'fl oz' not in parts[1]:
        return brand + ", " + parts[1]
    return brand

In [11]:
def remove_brand_descriptor(x):
    '''
    Reduce a comma-separated title to its product part, dropping both the
    brand and the descriptor. EX: "FRESH & EASY, CANOLA OIL" -> "CANOLA OIL"

    The title is split on "," and the result depends on the number of parts:
      - 1 part:  the whole title
      - 4 parts: part 2 (the same part remove_descriptor keeps for 4-part titles)
      - any other count above 1: part 1, unless part 1 contains 'fl oz'
        (presumably a size rather than a product name), in which case part 0
    The returned text is stripped of surrounding whitespace.
    '''
    x_ = x.split(",")

    # The 4-part case must be tested before the generic "more than one part"
    # case; in the opposite order it can never be reached and 4-part titles
    # would return part 1 instead of part 2.
    if len(x_) == 4:
        return x_[2].strip()
    if len(x_) > 1:
        if 'fl oz' in x_[1]:
            return x_[0].strip()
        return x_[1].strip()
    return x_[0].strip()

In [12]:
'''
Apply the 2 methods to the database titles to obtain two new columns with the cleaning process applied
'''
## Series.map calls each helper once per title; unlike a row-wise
## DataFrame.apply(axis=1) it builds no per-row Series and also works on an empty frame
data['no_brand_descriptor_title'] = data['database'].map(remove_brand_descriptor)
data['no_descriptor_title'] = data['database'].map(remove_descriptor)

In [13]:
data.tail() ## show the last rows, including num_commas and the two derived title columns

Unnamed: 0,database,num_commas,no_brand_descriptor_title,no_descriptor_title
443,SEA SCALLOPS,0,SEA SCALLOPS,SEA SCALLOPS
444,PHILADELPHIA CREAM CHEESE-SOFT LIGHT,0,PHILADELPHIA CREAM CHEESE-SOFT LIGHT,PHILADELPHIA CREAM CHEESE-SOFT LIGHT
445,,0,,
446,"NESTLE, GRAND ASSORTED CHOCOLATE",1,GRAND ASSORTED CHOCOLATE,"NESTLE, GRAND ASSORTED CHOCOLATE"
447,PHILADELPHIA CREAM CHEESE-SOFT LIGHT,0,PHILADELPHIA CREAM CHEESE-SOFT LIGHT,PHILADELPHIA CREAM CHEESE-SOFT LIGHT


In [14]:
'''
Get rid of leading and ending spaces and make each receipt-side column one case
(receipt_name upper case, the other three lower case); missing values become ''
'''
## Whole-column string methods replace the per-element chained assignment
## (`full_data.col[i] = ...`), which raises SettingWithCopyWarning and is not
## guaranteed to write through to `full_data` under pandas copy-on-write.
full_data['receipt_name'] = full_data['receipt_name'].fillna('').str.upper().str.strip()
for column in ('full_product_title', 'general_product_title', 'store'):
    full_data[column] = full_data[column].fillna('').str.lower().str.strip()

In [15]:
## list every receipt row whose database title is empty (no product match)
for i in range(len(full_data)):
    if data.database[i] != '':
        continue
    unmatched_row = [
        full_data.receipt_name[i],
        full_data.full_product_title[i],
        full_data.general_product_title[i],
        full_data.store[i],
        data.database[i],
        data.no_brand_descriptor_title[i],
        data.no_descriptor_title[i],
    ]
    print(unmatched_row)
   

['PF XTRA CHEDDAR GF', 'pepperidge farms xtra cheddar goldfish', 'goldfish', 'publix', '', '', '']
['PF CHEESE GOLDFISH', 'pepperidge farms cheese goldfish', 'goldfish', 'publix', '', '', '']
['REESES PEAN', 'reeses peanut butter cups', 'peanut butter cups', 'walmart', '', '', '']
['ATHENOS TR/CR FETA', 'athenos traditional crumbled feta', 'publix', 'feta crumbled', '', '', '']
['L/L DBL CHOC CHIP', 'lenny & larrys double chocolate chip', 'double chocolate chip', 'publix', '', '', '']
['AIDL SAUSAGE', 'aidell sausage', 'sausage', 'kroger', '', '', '']
['PLANTAINS', 'plantains', 'plantains', 'kroger', '', '', '']
['PFRM GOLDFISH', 'pepperidge farm goldfish', 'goldfish', 'kroger', '', '', '']
['PEPPERS BELL RED', 'red bell peppers', 'red bell peppers', 'kroger', '', '', '']
['PEACHES WHITE', 'white peaches', 'white peaches', 'kroger', '', '', '']
['BLUEBERRIES', 'blueberries', 'blueberries', 'kroger', '', '', '']
['LEMONS', 'lemons', 'lemons', 'kroger', '', '', '']
['QUAKER OATMEAL', 'qu

In [18]:
'''
Build the rows for the two output files: matched products (with a header row) go to csvRows,
products with an empty database title go to no_match_row
'''
csvRows = [['receipt_name', 'full_product_title', 'general_product_title', 'store', 'database', 'no_brand_descriptor_title', 'no_descriptor_title']]
no_match_row = []
for i in range(len(full_data)):
    ## the four receipt-side fields are shared by both kinds of row
    receipt_fields = [
        full_data.receipt_name[i],
        full_data.full_product_title[i],
        full_data.general_product_title[i],
        full_data.store[i],
    ]
    if data.database[i] == '':
        no_match_row.append(receipt_fields)
        continue
    csvRows.append(receipt_fields + [data.database[i], data.no_brand_descriptor_title[i], data.no_descriptor_title[i]])
    

In [19]:
'''
Create new, cleaned csv of manual data
'''
csvfile = '../data/cleaned_receipt_data_manual.csv'
## newline='' lets csv.writer control the line endings itself; without it the
## '\r\n' of the excel dialect becomes '\r\r\n' on Windows (a blank line after every row)
with open(csvfile, "w", newline='') as fp:
    wr = csv.writer(fp, dialect='excel')
    wr.writerows(csvRows)

In [20]:
'''
Create csv of manual data with no matches (rows only, no header row)
'''
csvfile = '../data/no_matches_manual_data.csv'
## newline='' lets csv.writer control the line endings itself; without it the
## '\r\n' of the excel dialect becomes '\r\r\n' on Windows (a blank line after every row)
with open(csvfile, "w", newline='') as fp:
    wr = csv.writer(fp, dialect='excel')
    wr.writerows(no_match_row)