# Merge Datasets: AMZ and OFF

In [1]:
import os
import re
import numpy as np
import pandas as pd

# English stop-word list from NLTK, stored as a set;
# pname_tokenize() subtracts these words from every product name/title.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be available locally — confirm.
from nltk.corpus import stopwords
stopWords = set(stopwords.words('english'))

# Turn off pandas' chained-assignment check (SettingWithCopyWarning) for the whole notebook.
# NOTE(review): this is a global setting, so it also hides the warning for any
# unintended write to a copy in later cells.
pd.options.mode.chained_assignment = None # default='warn'

## Open Food Facts dataset


In [2]:
# Read `code` as a string so values starting with 0 keep their leading zeros;
# the other listed columns are pinned to object to avoid pandas' DtypeWarning.
# Raw string literal: the Windows path contains backslashes (\D, \p, \z) that
# are invalid escape sequences in an ordinary string literal.
data_path = r'D:\DATA\practice-dataset\zipped'
off = pd.read_csv(
    os.path.join(data_path, 'en.openfoodfacts.org.products.csv.zip'),
    dtype={'code': 'object',
           'emb_codes': 'object', 'emb_codes_tags': 'object',
           'first_packaging_code_geo': 'object',
           'cities_tags': 'object', 'additives': 'object',
           'ingredients_from_palm_oil_tags': 'object'},
    compression='zip', sep='\t')
off.shape

(949695, 175)

### Handle OFF: unnecessary attributes
+ Remove non-needed attributes

In [3]:
# Attributes that are not related to the review analyses and will be dropped.
# 'categories' is deliberately left out of this list, so it stays in `off`.
dropped_cols = [
    'creator', 'created_t', 'created_datetime',
    'last_modified_t', 'last_modified_datetime',
    'generic_name', 'packaging', 'packaging_tags',
    'origins', 'origins_tags',
    'manufacturing_places', 'manufacturing_places_tags',
    'labels', 'emb_codes', 'emb_codes_tags',
    'first_packaging_code_geo', 'cities', 'cities_tags',
    'purchase_places', 'stores', 'countries',
    'ingredients_text', 'traces',
]

In [4]:
# Remove the columns listed in `dropped_cols`; they are not used for product reviews.
off = off.drop(columns=dropped_cols)

In [5]:
# Keep only the columns whose name does not contain the substring 'url'.
off = off.loc[:, ~off.columns.str.contains('url', regex=False)]

In [6]:
# Discard products that have no product_name, then renumber the rows from 0.
off = off.dropna(subset=['product_name']).reset_index(drop=True)

In [7]:
# rows/columns left after dropping the unused columns, the url columns and the unnamed products
off.shape

(872540, 145)

### Open Food Facts samples

#### Take product of "Cheetos" for example

In [8]:
# Sample: Open Food Facts products whose name starts with "Cheetos"
# (three fixed spellings) followed by whitespace, without the `categories` column.
# Names are left in their original case here; pname_tokenize lower-cases them later.
cheetos = (
    off[off.product_name.str.match(r'^(Cheetos|CHEETOS|cheetos)\s.*')]
    .drop(columns='categories')
    .reset_index(drop=True)
)
cheetos.shape

(78, 144)

## Amazon Reviews: Grocery dataset
+ https://registry.opendata.aws/amazon-reviews/
+ https://s3.amazonaws.com/amazon-reviews-pds/readme.html
+ http://jmcauley.ucsd.edu/data/amazon/

In [9]:
# Raw string literal: the Windows path contains backslashes (\D, \p, \g) that
# are invalid escape sequences in an ordinary string literal.
data_path = r'D:\DATA\practice-dataset\gzipped'
# Identifier-like columns and star_rating are read as strings;
# pd.Int64Dtype() is used for the vote counts because it allows NaN.
# Malformed lines are skipped silently.
# NOTE(review): error_bad_lines / warn_bad_lines were deprecated in pandas 1.3 and
# removed in 2.0; on newer pandas the equivalent is on_bad_lines='skip'.
amz = pd.read_csv(
    os.path.join(data_path, 'amazon_reviews_us_Grocery_v1_00.tsv.gz'),
    dtype={'customer_id': 'object', 'product_parent': 'object',
           'star_rating': 'object',
           'helpful_votes': pd.Int64Dtype(), 'total_votes': pd.Int64Dtype()},
    compression='gzip', sep='\t',
    error_bad_lines=False, warn_bad_lines=False)
# columns dropped before the analysis
amz = amz.drop(columns=['marketplace', 'product_category', 'product_id'])
# row 1841896 contains a date as star_rating
# NOTE(review): the row label is hard-coded, so it is only valid for this exact
# file and this set of skipped lines — confirm if the input changes.
amz = amz.drop(index=1841896)
amz.shape

(2393378, 12)

In [10]:
# Example of a single product_parent code that carries several different product titles
# (author's note: most other codes have only one title)
amz[amz.product_parent == '795563511'].product_title.unique()

array(['Grove Square Cappuccino, Single Serve Cup for Keurig K-Cup Brewers',
       'Grove Square Cappuccino Single Serve Cappuccino Cups, Hazelnut, K-Cups for Keurig Brewers, 24 ct',
       'Grove Square Cappuccino, Caramel, 24-Count Single Serve Cup for Keurig K-Cups',
       'Grove Square Cappuccino Single Serve Cappuccino Cups, Hazelnut, Single serve cups for Keurig Brewers, 24 ct',
       'Grove Square Cappuccino, French Vanilla, 24-Count for Keurig K-cup Brewers'],
      dtype=object)

### Amazon Review sample

#### Take product of "cheetos" for example

In [11]:
# Sample: Amazon reviews whose product title starts with "Cheetos"
# (three fixed spellings) followed by whitespace.
# Titles are left in their original case here; pname_tokenize lower-cases them later.
che_rev = (
    amz[amz.product_title.str.match(r'^(Cheetos|cheetos|CHEETOS)\s.*')]
    .reset_index(drop=True)
)
che_rev.shape

(496, 12)

In [12]:
# first 10 distinct product titles in the Cheetos review sample
che_rev.product_title.unique()[:10]

array(['Cheetos Crunchy - 50/1 oz. bags',
       'Cheetos Flavored Snacks, Crunchy Cheese, 1.13 Ounce (Pack of 12)',
       'Cheetos Crunchy Cheddar Jalapeno Cheese Flavored Snacks',
       'Cheetos Crunchy Cheese Flavored Snacks',
       "Cheetos Flamin' Hot and Doritos Dinamita Chile Limon 8.0 Oz [3 Pk]",
       "Cheetos Flamin' Hot - 50/1 oz",
       'Cheetos Sweetos Cinnamon Sugar Puffs Flavored Snacks, 7 oz (Set of 2)',
       "Cheetos Cheese Flavored Snacks, Crunchy Flamin' Hot, 2.38 Ounce (Pack of 12)",
       'Cheetos Cheese Flavored Snacks, Jumbo Puffs, 9.5 Ounce (Pack of 4)',
       'Cheetos Natural White Cheddar Puffs Cheese Flavored Snacks, 8oz Bags (Pack of 12)'],
      dtype=object)

### Handle AMZ: multiple product_name in a product_parent 
Get mapping from `product_parent` code
+ why? need a unique identifier for merging, but `product_parent` has some errors
  + from each `product_parent`, get one product title as the only title
  + i.e. get the highest count title from each `product_parent` code
+ key: product_parent
+ value: product title/name
+ how?
  + group by product_parent and product_title, count the occurrences of another column
    + this gives a multi-index of (product_parent, product_title) with a single count column
  + `reset_index` on the multi-index dataframe, get regular data frame
  + method1:
    + sort by count values, from large to small; drop duplicates on product_parent
    + get the unique product_parent code for each product_title
  + method2:
    + get index by 
      + group by product_parent and transform each row to the group's max value
      + compare with group max value, the boolean array is the index
    + get the unique pair by boolean slicing on array

In [13]:
# Count the rows for every (product_parent, product_title) pair, using the
# non-null customer_id values as the counter; one code can have several titles.
tmp = (
    amz[['product_parent', 'product_title', 'customer_id']]
    .groupby(['product_parent', 'product_title'])
    .count()
    .reset_index()
)
tmp.shape

(275498, 3)

In [14]:
# method 1: keep, for every product_parent, the title with the highest count.
# A stable sort (mergesort) is used so that titles tied on the count keep the
# order they have in `tmp`; with the default (non-stable) quicksort the title
# that survives drop_duplicates is arbitrary for ties.
mapping = (
    tmp.sort_values('customer_id', ascending=False, kind='mergesort')
       .drop_duplicates('product_parent')   # first row per code = highest count
       .sort_values('product_parent')
       .drop(columns='customer_id')
       .reset_index(drop=True)
)
mapping.head()

Unnamed: 0,product_parent,product_title
0,100000634,"Wild Caught Icelandic Cod, Frozen Cello Pak5 l..."
1,100007845,Pamelas Cookie Fgg&Jmms Bluebry&Fig Ko
2,100011767,Hidden Valley Fat Free Ranch Portion Pack Dres...
3,100013042,Prize Winning La Tourangelle Artisinal Gourmet...
4,100016462,Sharwood's Plain Large Puppodums (8 per pack -...


method 2 (not used: incorrect, because titles tied for the max count are all kept, so some product_parent codes are still duplicated)
```python
idx = tmp.groupby('product_parent')['customer_id'].transform(max) == tmp['customer_id']
mapping = tmp[idx]
mapping.shape
```

export the mapping
```python
data_path = 'D:\DATA\OurFoods'
mapping.to_csv(os.path.join(data_path, 'mapping.csv'), index=False)
```

## Tokenize Product Name/Title
+ Regex for processing names/titles
  + lowercasing
  + remove non-word characters except whitespace, because names contain special symbols
  + remove the first digit and everything after it, because the text after a digit is usually the packaging size
  + remove space, i.e. empty string, in list
  + remove stopwords, e.g. 'by', 'the'...etc
+ **Problem with Regex**
  + many product names/titles starting with digit
    + causing too many empty tuples

In [15]:
def pname_tokenize(string, stop_words=None):
    """
    Given product name/title string, processes and outputs tuple of tokens

    Steps: lower-case the text, delete every character that is neither a
    word character nor whitespace, cut the text at the first digit, split on
    any whitespace, drop stop words and return the unique tokens sorted.

    Parameters
    ----------
    string : str
        Product name/title. Any non-string value (e.g. NaN or None) yields
        an empty tuple instead of raising.
    stop_words : iterable of str, optional
        Words to exclude. Defaults to the notebook-level `stopWords` set.

    Returns
    -------
    tuple of str
        Sorted unique tokens. Empty when nothing is left, which is always
        the case for names that start with a digit.
    """
    # missing names (NaN/None) have no tokens
    if not isinstance(string, str):
        return tuple()
    if stop_words is None:
        stop_words = stopWords
    # lower and remove non-word except spaces
    r = re.sub(r'[^\w\s]', '', string.lower())
    # remove digits and any string after it
    r = re.sub(r'\d.*$', '', r)
    # split() without arguments splits on any whitespace run (spaces, tabs,
    # newlines) and never produces empty strings
    return tuple(sorted(set(r.split()).difference(stop_words)))

In [16]:
# Tokenize the names/titles of both Cheetos samples with pname_tokenize.
cheetos['tokens'] = cheetos['product_name'].map(pname_tokenize)
che_rev['tokens'] = che_rev['product_title'].map(pname_tokenize)
cheetos.shape, che_rev.shape

((78, 145), (496, 13))

In [17]:
# Attach the token tuple of each representative title to the mapping frame.
mapping = mapping.assign(tokens=mapping['product_title'].map(pname_tokenize))
mapping.head()

Unnamed: 0,product_parent,product_title,tokens
0,100000634,"Wild Caught Icelandic Cod, Frozen Cello Pak5 l...","(caught, cello, cod, frozen, icelandic, pak, w..."
1,100007845,Pamelas Cookie Fgg&Jmms Bluebry&Fig Ko,"(bluebryfig, cookie, fggjmms, ko, pamelas)"
2,100011767,Hidden Valley Fat Free Ranch Portion Pack Dres...,"(dressing, fat, free, hidden, pack, portion, r..."
3,100013042,Prize Winning La Tourangelle Artisinal Gourmet...,"(artisinal, gourmet, la, oil, prize, tourangel..."
4,100016462,Sharwood's Plain Large Puppodums (8 per pack -...,"(large, plain, puppodums, sharwoods)"


## Dataset Merge 
+ keep category and main nutrients
+ AMZ dataset
  + merge with mapping on unique id, to add tokens to amz
+ OFF dataset
  + tokenize the product name
  + use mapping to find unique id for the token
+ Merge
  + merge both on unique id (product parent)

### Prepare `amz`
+ using mapping dataset
+ merge `amz` with mapping to get `tokens` attribute
+ some `tokens` are empty, drop by empty tuple

In [18]:
# Merge amz with mapping to give every review the tokens of its product_parent.
# - Any `tokens` column left from a previous run of this cell is dropped first,
#   so re-running does not create tokens_x / tokens_y.
# - validate='many_to_one' makes pandas raise if product_parent is not unique
#   in `mapping`, which would otherwise silently duplicate review rows.
amz = (
    amz.drop(columns='tokens', errors='ignore')
       .merge(mapping[['product_parent', 'tokens']], how='left',
              on='product_parent', validate='many_to_one')
)
amz.shape

(2393378, 13)

In [19]:
# For now, discard the reviews whose tokens are the empty tuple
# (each value is compared with () element by element).
amz = amz[amz['tokens'].map(lambda toks: toks != ())]
amz.shape

(2344543, 13)

### Prepare `off`
+ problems:
  + the same product with different package sizes is on different rows
  + i.e. same tokens, but having multiple entries
+ either select one of the entries, or take the average of all entries
+ 

In [20]:
# shape of the cleaned Open Food Facts frame before the nutrient columns are selected
off.shape

(872540, 145)

In [21]:
# Keep the product name, the category columns and the main nutrients only,
# ordered by name and category with missing values placed last.
sub = (
    off[['product_name', 'categories_tags', 'main_category_en',
         'energy_100g', 'fat_100g', 'fiber_100g', 'carbohydrates_100g',
         'proteins_100g', 'salt_100g', 'sodium_100g', 'sugars_100g']]
    .sort_values(['product_name', 'main_category_en', 'categories_tags'],
                 na_position='last')
    .reset_index(drop=True)
)
sub.shape

(872540, 11)

In [22]:
# number of rows with a non-missing main_category_en (the column has missing values)
sub.main_category_en.notna().sum()

337571

In [23]:
# Aggregation applied to each attribute when rows are collapsed:
# 1. text attributes: 'last', i.e. the last value of the group in the current row order
#    NOTE(review): pandas' groupby 'last' presumably skips missing values — confirm
# 2. numeric attributes: 'mean'
func = {
    'categories_tags': 'last',
    'main_category_en': 'last',
    'energy_100g': 'mean',
    'fat_100g': 'mean',
    'fiber_100g': 'mean',
    'carbohydrates_100g': 'mean',
    'proteins_100g': 'mean',
    'salt_100g': 'mean',
    'sodium_100g': 'mean',
    'sugars_100g': 'mean',
}
# collapse duplicated product_name rows, then tokenize the remaining names
sub = sub.groupby('product_name', as_index=False).agg(func)
sub['tokens'] = sub['product_name'].map(pname_tokenize)

In [24]:
# Several product names can share one token tuple: drop the empty tuples and
# collapse the duplicates with the same aggregate functions as before.
# NOTE(review): the numeric columns become means of the per-name means from the
# previous step, not means over the original rows — confirm this is intended.
sub = (
    sub[sub.tokens != tuple()]
    .reset_index(drop=True)
    .groupby('tokens', as_index=False)
    .agg(func)
)
# keep only the rows that have energy data
sub = sub.dropna(subset=['energy_100g'])

### Merge and Output both dataset
+ need to process the main category, having different language inputs
  + remove non-english inputs
  + or, replace with english categories

In [25]:
# Keep only the reviews whose tokens also occur in the Open Food Facts subset.
df = pd.merge(amz, sub, how='inner', on='tokens')
df.shape

(147304, 23)

export as csv file
```python
data_path = 'D:\DATA\OurFoods'
df.to_csv(os.path.join(data_path, 'merged_amz-off_3.csv.gz'),\
          compression='gzip', index=False)
```

export to database
```python
import os
from sqlalchemy import create_engine
from sqlalchemy import types

from dotenv import load_dotenv # env variables
load_dotenv(verbose=True)

SQLALCHEMY_DATABASE_URI = os.getenv('DATABASE_URL')
engine = create_engine(SQLALCHEMY_DATABASE_URI)

df.to_sql(name="food_reviews", con=engine, if_exists='replace',
               schema='public', index=True)
```