# Dataset Explore and Merge

In [1]:
import os
import re
import numpy as np
import pandas as pd

# English stop-word list from NLTK, stored as a set so the tokenizer can
# remove stop words with a set difference.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be downloaded
# beforehand (nltk.download('stopwords')) — confirm for a fresh environment.
from nltk.corpus import stopwords
stopWords = set(stopwords.words('english'))

# Switch off pandas' SettingWithCopyWarning for the whole notebook.
# NOTE(review): this hides the warning, not its cause — later cells assign
# into frames that were selected out of `off` / `amz`.
pd.options.mode.chained_assignment = None # default='warn'

## Open Food Facts dataset


In [2]:
# Load the Open Food Facts export (tab-separated CSV inside a zip archive).
# The path is a raw string: it contains backslashes, and sequences such as
# \D or \p are invalid escapes that newer Python versions warn about.
data_path = r'D:\DATA\practice-dataset\zipped'
off = pd.read_csv(
    os.path.join(data_path, 'en.openfoodfacts.org.products.csv.zip'),
    # 'code' is read as object to keep values starting with 0;
    # the other columns are pinned to object to avoid DtypeWarning
    dtype={'code': 'object',
           'emb_codes': 'object', 'emb_codes_tags': 'object',
           'first_packaging_code_geo': 'object',
           'cities_tags': 'object', 'additives': 'object',
           'ingredients_from_palm_oil_tags': 'object'},
    compression='zip',
    sep='\t',
)
# Kaggle variant: data_path = '/kaggle/input/', file name
# 'en.openfoodfacts.org.products.csv', same dtype/sep, no compression argument.
off.shape

(949695, 175)

In [3]:
# Columns that are not needed for the cross-analysis with the reviews,
# listed one thematic group at a time.
# 'categories' is deliberately NOT in this list, so it stays in the frame.
dropped_cols = [
    # record bookkeeping
    'creator', 'created_t', 'created_datetime',
    'last_modified_t', 'last_modified_datetime',
    # naming / packaging
    'generic_name', 'packaging', 'packaging_tags',
    # provenance
    'origins', 'origins_tags',
    'manufacturing_places', 'manufacturing_places_tags',
    # labels and packaging codes
    'labels', 'emb_codes', 'emb_codes_tags',
    'first_packaging_code_geo', 'cities', 'cities_tags',
    # where it is sold
    'purchase_places', 'stores', 'countries',
    # free-text composition
    'ingredients_text', 'traces',
]

In [4]:
# Remove the columns collected in `dropped_cols` (not used for product review).
off = off.drop(columns=dropped_cols)
# Keep only columns whose name does not contain 'url': the negative
# look-ahead rejects a name as soon as 'url' starts at any position.
off = off.filter(regex=r'^((?!url).)*$', axis=1)
off.shape

(949695, 145)

In [5]:
# Keep only the products that have a Product Name, then renumber rows from 0.
has_name = off['product_name'].notna()
off = off.loc[has_name].reset_index(drop=True)
off.shape

(872540, 145)

### Open Food Facts subset

#### Take product of "Jif" for example

In [6]:
# Open Food Facts products whose name starts with the brand Jif (three
# explicit casings) followed by a whitespace character.
# .copy() makes `jif` an independent frame, so the edits applied to it in
# later cells do not operate on a slice of `off`.
jif = off[off.product_name.str.match(r'^(JIF|Jif|jif)\s.*')].copy()
jif.product_name.head()

69980                       Jif Crema de Cacahuate Cremosa
69988               Jif Creme De Amdoim C / Chocolate 450G
69989             Jif Natural Crunchy Peanut Butter Spread
69994    Jif Cookies N Cream and Hazelnut Pate à Tartinner
69997                             Jif Peanut Butter Creamy
Name: product_name, dtype: object

In [7]:
# Drop the 'categories' column and renumber the rows from 0.
# product_name keeps its original case here; lowercasing is done when the
# tokens are extracted.
jif = jif.drop(columns='categories').reset_index(drop=True)
jif.shape

(15, 144)

#### Take product of "Cheetos" for example

In [8]:
# Open Food Facts products whose name starts with the brand Cheetos (three
# explicit casings) followed by a whitespace character.
is_cheetos = off.product_name.str.match(r'^(Cheetos|CHEETOS|cheetos)\s.*')

# Build the subset as a new frame (no in-place edits on a slice of `off`):
# drop 'categories' and renumber the rows from 0.
# product_name keeps its original case; lowercasing is done at tokenization.
cheetos = off[is_cheetos].drop(columns='categories').reset_index(drop=True)
cheetos.shape

(78, 144)

## Amazon Reviews: Grocery dataset
+ https://registry.opendata.aws/amazon-reviews/
+ https://s3.amazonaws.com/amazon-reviews-pds/readme.html
+ http://jmcauley.ucsd.edu/data/amazon/

In [9]:
# Load the Amazon US Grocery reviews (gzipped, tab-separated).
# The path is a raw string so the backslashes of the Windows path are taken
# literally instead of being parsed as (invalid) escape sequences.
data_path = r'D:\DATA\practice-dataset\gzipped'
amz = pd.read_csv(
    os.path.join(data_path, 'amazon_reviews_us_Grocery_v1_00.tsv.gz'),
    dtype={'customer_id': 'object', 'product_parent': 'object',
           'star_rating': 'object',
           # pd.Int64Dtype() allows NaN in the integer vote counts
           'helpful_votes': pd.Int64Dtype(), 'total_votes': pd.Int64Dtype()},
    compression='gzip', sep='\t',
    # Skip malformed lines without reporting them.
    # NOTE(review): pandas 1.3 deprecated these two keywords in favour of
    # on_bad_lines='skip' and pandas 2.0 removed them — switch when the
    # notebook is moved to a newer pandas.
    error_bad_lines=False, warn_bad_lines=False)
# Kaggle variant: data_path = '/kaggle/input/amazon_reviews_us_grocery_v1_00.tsv',
# file name 'amazon_reviews_us_Grocery_v1_00.tsv', no compression argument,
# and 'helpful_votes' / 'total_votes' read as 'object'.

# Columns not used in the rest of the notebook.
amz = amz.drop(columns=['marketplace', 'product_category', 'product_id'])
# Row 1841896 contains a date as star_rating.
# NOTE(review): the label is tied to this exact file and parser behaviour —
# re-check it if either changes.
amz = amz.drop(index=1841896)
amz.shape

(2393378, 12)

### Amazon Review subset

#### Take product of "jif" for example

In [14]:
# Reviews whose product title starts with the brand Jif (three explicit
# casings) followed by a whitespace character.
# na=False: a missing title counts as "no match" instead of putting NaN into
# the boolean mask.
is_jif = amz.product_title.str.match(r'^(JIF|Jif|jif)\s.*', na=False)
# reset_index returns a new frame, so `jif_rev` is independent of `amz` and
# can receive new columns later. Titles keep their original case.
jif_rev = amz[is_jif].reset_index(drop=True)
jif_rev.shape

(1413, 12)

In [15]:
# first ten distinct Jif product titles, in order of first appearance
pd.unique(jif_rev['product_title'])[:10]

array(['Jif Chocolate Nut Butter Almond Granola Bars, 5 Count Pack',
       'Jif Creamy Peanut Butter Granola Bar, 5 Ct',
       'Jif Creamy Peanut Butter Twin Pack, 80 Ounce',
       'Jif Whips Whipped Peanut Butter & Salty Caramel',
       'Jif Cashew Butter, Creamy, 12 Ounce',
       'Jif To Go Dippers with Pretzels, 3 Count',
       'Jif Peanut Powder, 6.5 Ounce', 'Jif Creamy Peanut Butter',
       'Jif Cashew Butter, Crunchy, 12 Ounce', 'Jif Peanut Butter'],
      dtype=object)

#### Take product of "cheetos" for example

In [16]:
# Reviews whose product title starts with the brand Cheetos (three explicit
# casings) followed by a whitespace character.
# na=False: a missing title counts as "no match" instead of putting NaN into
# the boolean mask.
is_cheetos_rev = amz.product_title.str.match(r'^(Cheetos|cheetos|CHEETOS)\s.*', na=False)
# reset_index returns a new frame, so `che_rev` is independent of `amz` and
# can receive new columns later. Titles keep their original case.
che_rev = amz[is_cheetos_rev].reset_index(drop=True)
che_rev.shape

(496, 12)

In [17]:
# first ten distinct Cheetos product titles, in order of first appearance
pd.unique(che_rev['product_title'])[:10]

array(['Cheetos Crunchy - 50/1 oz. bags',
       'Cheetos Flavored Snacks, Crunchy Cheese, 1.13 Ounce (Pack of 12)',
       'Cheetos Crunchy Cheddar Jalapeno Cheese Flavored Snacks',
       'Cheetos Crunchy Cheese Flavored Snacks',
       "Cheetos Flamin' Hot and Doritos Dinamita Chile Limon 8.0 Oz [3 Pk]",
       "Cheetos Flamin' Hot - 50/1 oz",
       'Cheetos Sweetos Cinnamon Sugar Puffs Flavored Snacks, 7 oz (Set of 2)',
       "Cheetos Cheese Flavored Snacks, Crunchy Flamin' Hot, 2.38 Ounce (Pack of 12)",
       'Cheetos Cheese Flavored Snacks, Jumbo Puffs, 9.5 Ounce (Pack of 4)',
       'Cheetos Natural White Cheddar Puffs Cheese Flavored Snacks, 8oz Bags (Pack of 12)'],
      dtype=object)

### Get mapping from `product_parent` code
+ key: product_parent
+ value: product title/name
+ how?
  + group by product_parent and product_title, count the occurance of another column
    + getting multi-index with product_parent and product_title, with only columnt of count
  + `reset_index` on the multi-index dataframe, get regular data frame
  + method1:
    + sort by count values, from large to small; drop duplicates on product_parent
    + get the unique product_parent code for each product_title
  + method2:
    + get index by 
      + group by prodcut_parent and transform each row to the group's max value
      + compare with group max value, the boolean array is the index
    + get the unique pair by boolean slicing on array

In [18]:
# group by product_parent and product_title, get count of each title under a code
# there could be multiple titles under the same code
tmp = amz.loc[:, ['product_title', 'product_parent', 'customer_id']]\
        .groupby(['product_parent', 'product_title']).count().reset_index()
tmp.shape

(275498, 3)

In [19]:
# method 1: order the pairs by count (largest first) and keep the first row
# seen for each product_parent, i.e. its most frequent title
by_count = tmp.sort_values('customer_id', ascending=False)
top_title = by_count.drop_duplicates('product_parent')
# tidy up: order by code, discard the helper count column, renumber rows
mapping = (
    top_title
    .sort_values('product_parent')
    .drop(columns='customer_id')
    .reset_index(drop=True)
)
mapping.head()

Unnamed: 0,product_parent,product_title
0,100000634,"Wild Caught Icelandic Cod, Frozen Cello Pak5 l..."
1,100007845,Pamelas Cookie Fgg&Jmms Bluebry&Fig Ko
2,100011767,Hidden Valley Fat Free Ranch Portion Pack Dres...
3,100013042,Prize Winning La Tourangelle Artisinal Gourmet...
4,100016462,Sharwood's Plain Large Puppodums (8 per pack -...


In [20]:
# Export the product_parent -> product_title mapping as CSV (no index column).
# The path is a raw string so the backslashes of the Windows path are taken
# literally instead of being parsed as (invalid) escape sequences.
data_path = r'D:\DATA\OurFoods'
mapping.to_csv(os.path.join(data_path, 'mapping.csv'), index=False)

## Tokenize Product Name/Title
+ Regex for processing names/titles
  + lowercasing
  + remove non-word characters except whitespace, because names often contain special symbols
  + remove the first digit and everything after it, because the text after a digit is usually the packaging size
  + remove space, i.e. empty string, in list
  + remove stopwords, e.g. 'by', 'the'...etc
+ **Problem with Regex**
  + many product names/titles starting with digit
    + causing too many empty tuples

In [21]:
# (rows, columns) of the four brand subsets before tokenizing
tuple(frame.shape for frame in (jif, jif_rev, cheetos, che_rev))

((15, 144), (1413, 12), (78, 144), (496, 12))

In [22]:
def pname_tokenize(string, stop_words=None):
    """
    Given product name/title string, processes and outputs tuple of tokens.

    Processing steps:
      1. lowercase the text
      2. delete every character that is neither a word character nor whitespace
      3. cut the text at its first digit (the digit and everything after it,
         including later lines, is discarded)
      4. split on any run of whitespace (spaces, tabs, newlines)
      5. remove stop words and duplicates, sort alphabetically

    Parameters
    ----------
    string : str
        Product name or title. Any non-string value (e.g. None or NaN from a
        missing cell) yields an empty tuple.
    stop_words : iterable of str, optional
        Words to exclude from the result. Defaults to the notebook-level
        ``stopWords`` set.

    Returns
    -------
    tuple of str
        Unique, alphabetically sorted tokens; ``()`` when nothing is left
        (for instance when the text starts with a digit).
    """
    # missing values arrive as None / float NaN; they carry no tokens
    if not isinstance(string, str):
        return ()
    if stop_words is None:
        stop_words = stopWords
    # lower and remove non-word except whitespace
    cleaned = re.sub(r'[^\w\s]', '', string.lower())
    # remove the first digit and anything after it; DOTALL lets '.' run across
    # newlines so the cut also works on multi-line text
    cleaned = re.sub(r'\d.*$', '', cleaned, flags=re.DOTALL)
    # str.split() without argument splits on any whitespace and never yields
    # empty strings
    return tuple(sorted(set(cleaned.split()) - set(stop_words)))

In [23]:
# add the token tuple of every Jif product name / review title as 'tokens'
jif = jif.assign(tokens=jif['product_name'].map(pname_tokenize))
jif_rev = jif_rev.assign(tokens=jif_rev['product_title'].map(pname_tokenize))
jif.shape, jif_rev.shape

((15, 145), (1413, 13))

In [24]:
# add the token tuple of every Cheetos product name / review title as 'tokens'
cheetos = cheetos.assign(tokens=cheetos['product_name'].map(pname_tokenize))
che_rev = che_rev.assign(tokens=che_rev['product_title'].map(pname_tokenize))
cheetos.shape, che_rev.shape

((78, 145), (496, 13))

In [25]:
# tokenize the representative title of every product_parent
title_tokens = mapping['product_title'].map(pname_tokenize)
mapping = mapping.assign(tokens=title_tokens)
mapping.head()

Unnamed: 0,product_parent,product_title,tokens
0,100000634,"Wild Caught Icelandic Cod, Frozen Cello Pak5 l...","(caught, cello, cod, frozen, icelandic, pak, w..."
1,100007845,Pamelas Cookie Fgg&Jmms Bluebry&Fig Ko,"(bluebryfig, cookie, fggjmms, ko, pamelas)"
2,100011767,Hidden Valley Fat Free Ranch Portion Pack Dres...,"(dressing, fat, free, hidden, pack, portion, r..."
3,100013042,Prize Winning La Tourangelle Artisinal Gourmet...,"(artisinal, gourmet, la, oil, prize, tourangel..."
4,100016462,Sharwood's Plain Large Puppodums (8 per pack -...,"(large, plain, puppodums, sharwoods)"


## Dataset Merge
+ AMZ dataset
  + merge with mapping on unique id, to add tokens to amz
+ OFF dataset
  + tokenize the product name
    + but product name may be duplicated, with same token
    + group by the token, extract only the rows of product name with max counts
  + use mapping to find unique id for the token
+ Merge
  + merge both on unique id (product parent)

In [26]:
# (rows, columns) of the three frames that go into the merge
tuple(frame.shape for frame in (off, amz, mapping))

((872540, 145), (2393378, 12), (267725, 3))

### Prepare `amz`
+ using mapping dataset
+ merge `amz` with mapping to get the `tokens` attribute
+ some `tokens` are empty; drop the rows whose `tokens` is an empty tuple

In [27]:
# Attach to every review the token tuple of its product_parent.
# - Dropping an already present 'tokens' column first makes the cell safe to
#   re-run; without it a second run would create tokens_x / tokens_y.
# - validate='many_to_one' raises if a product_parent occurs more than once
#   in `mapping`, which would otherwise silently duplicate review rows.
amz = (
    amz.drop(columns='tokens', errors='ignore')
       .merge(mapping[['product_parent', 'tokens']],
              how='left', on='product_parent', validate='many_to_one')
)
amz.shape

(2393378, 13)

In [28]:
# for now, discard the reviews whose tokens are an empty tuple
has_tokens = amz['tokens'] != ()
amz = amz.loc[has_tokens]
amz.shape

(2344543, 13)

### Prepare `off`
+ problems:
  + the same product in different package sizes is stored on different rows
  + i.e. the same tokens occur in multiple entries
+ either select one of the entries, or take the average over all entries
+ the cell below takes the average

In [29]:
# (rows, columns) of `off` before duplicate names/tokens are averaged
off.shape

(872540, 145)

In [30]:
# Step 1: take the mean of the numeric columns over rows that share a
# product name, then tokenize each name.
# numeric_only=True states explicitly that text columns are left out of the
# mean; recent pandas versions raise on them instead of dropping them silently.
temp = (
    off.groupby('product_name', as_index=False)
       .mean(numeric_only=True)
       .assign(tokens=lambda d: d.product_name.apply(pname_tokenize))
)
# Step 2: discard names without tokens, then take the mean again over names
# that share the same token tuple (product_name itself is not numeric and
# therefore not part of the result).
# NOTE(review): this is a mean of per-name means, so every distinct product
# name weighs the same regardless of how many rows it had.
temp = (
    temp[temp.tokens != tuple()]
        .reset_index(drop=True)
        .groupby('tokens', as_index=False)
        .mean(numeric_only=True)
)
# remove rows without energy data
temp = temp[temp.energy_100g.notna()]

In [31]:
# check for duplicate tokens: how often each token tuple occurs in `temp`
temp['tokens'].value_counts()

(ficaccia, genovese)                             1
(basilic, croustillants, tomates)                1
(biocrunchymüsli, mandelorange)                  1
(crackers, emmental, feuilletés)                 1
(au, et, lorange, miel, nonnettes)               1
                                                ..
(bio, chocolat, lait, maïs)                      1
(blanco, chocolate, mercadona)                   1
(beurre, chocolat, lait, le, petit, tablette)    1
(desayuno, integral)                             1
(crème, de, et, grillés, poivrons, sésame)       1
Name: tokens, Length: 440695, dtype: int64

In [32]:
# (rows, columns) of the aggregated Open Food Facts frame
temp.shape

(440695, 117)

### Merge and Output both dataset

In [48]:
# join the reviews with the aggregated product data on the token tuple;
# the inner join keeps only token tuples present in both frames
df = pd.merge(amz, temp, how='inner', on='tokens')
df.shape

(147304, 129)