# Explore Dataset Individually

In [1]:
import os
import re
import numpy as np
import pandas as pd

# English stop-word list from NLTK, held as a set.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be available locally — confirm.
from nltk.corpus import stopwords
stopWords = set(stopwords.words('english'))

# Turn off pandas' chained-assignment check (SettingWithCopyWarning) via the global option.
pd.options.mode.chained_assignment = None # default='warn'

## Open Food Facts dataset
+ https://world.openfoodfacts.org/data

In [2]:
# Load the Open Food Facts export: a tab-separated CSV stored inside a zip archive.
# `code` is read as a string so values starting with 0 are kept; the other listed
# columns get an explicit dtype to avoid DtypeWarning.
# The path is a raw string so its backslashes are never parsed as escape sequences.
data_path = r'D:\DATA\practice-dataset\zipped'
off = pd.read_csv(
    os.path.join(data_path, 'en.openfoodfacts.org.products.csv.zip'),
    dtype={
        'code': 'object',
        'emb_codes': 'object', 'emb_codes_tags': 'object',
        'first_packaging_code_geo': 'object',
        'cities_tags': 'object', 'additives': 'object',
        'ingredients_from_palm_oil_tags': 'object',
    },
    compression='zip',
    sep='\t',
)

# IF using kaggle server
# data_path = '/kaggle/input/'
# off = pd.read_csv(os.path.join(data_path, 'en.openfoodfacts.org.products.csv'), \
#                   dtype={'code': 'object', 
#                          'emb_codes': 'object', 'emb_codes_tags': 'object',
#                          'first_packaging_code_geo': 'object',
#                          'cities_tags': 'object', 'additives': 'object',
#                          'ingredients_from_palm_oil_tags': 'object'}, \
#                   sep='\t')
off.shape

(949695, 175)

In [3]:
# Write a fixed-seed sample of 3000 named products, all attributes, ordered by barcode.
(
    off.loc[off['product_name'].notna()]
    .sample(n=3000, random_state=42)
    .sort_values('code')
    .reset_index(drop=True)
    .to_csv(os.path.join(data_path, 'sample_off_all.csv'), index=False)
)

In [4]:
# Write a fixed-seed sample of 3000 named products restricted to the major attributes
# (name, main category and the per-100g nutrition columns), ordered by product name.
(
    off.loc[
        off['product_name'].notna(),
        [
            'product_name', 'main_category_en', 'energy_100g',
            'fat_100g', 'fiber_100g', 'carbohydrates_100g',
            'proteins_100g', 'salt_100g', 'sodium_100g', 'sugars_100g',
        ],
    ]
    .sample(n=3000, random_state=42)
    .sort_values('product_name')
    .reset_index(drop=True)
    .to_csv(os.path.join(data_path, 'sample_off_maj.csv'), index=False)
)

In [5]:
# Read the major-attribute sample back and preview its first rows.
# The path is a raw string so its backslashes are never parsed as escape sequences.
# NOTE(review): this directory is not the one the sample was written to earlier in the
# notebook — confirm that 'sample_off_maj.csv' is expected to exist here.
data_path = r'D:\DATA\OurFoods'
df = pd.read_csv(os.path.join(data_path, 'sample_off_maj.csv'))
df.head()

Unnamed: 0,product_name,main_category_en,energy_100g,fat_100g,fiber_100g,carbohydrates_100g,proteins_100g,salt_100g,sodium_100g,sugars_100g
0,.com Premium,Beverages,,,,,,,,
1,0% sin lactosa,Dairies,155.0,0.1,,4.9,4.1,0.1,0.04,4.9
2,10 burgers,,866.0,15.0,,4.0,13.5,1.3,0.52,4.0
3,10 croissants + 10 pains choco,,1481.0,20.0,,36.0,6.0,,,8.0
4,100 % Mie Complet Sans Croûte - Offre économique,Plant-based foods and beverages,1044.0,4.7,4.8,41.2,7.7,1.15062,0.460248,4.3


+ dropped columns
  + `code`, i.e. the barcode of the product (note: the code cell below does not actually drop it)
  + `url`, i.e. the URL of the product page on the website
  + `creator`, i.e. the contributor who uploaded the product
  + `created_t`, `created_datetime`, `last_modified_t`, `last_modified_datetime`
  + `generic_name`, since it's almost identical to `product_name`
+ filtered columns
  + columns whose name contains `url`, total = 7
    + removed by keeping only the column names that match a regex with a negative lookahead, `r'^((?!url).)*$'`
    + [Regular expression to match a line that doesn't contain a word](https://stackoverflow.com/questions/406230/regular-expression-to-match-a-line-that-doesnt-contain-a-word)
    + e.g. 'image_url', 'image_small_url', 'image_ingredients_url'

In [6]:
# Columns that are not needed for the cross-analysis with reviews.
dropped_cols = [
    # contributor and timestamps
    'creator',
    'created_t',
    'created_datetime',
    'last_modified_t',
    'last_modified_datetime',
    # naming / packaging
    'generic_name',
    'packaging',
    'packaging_tags',
    # origin and manufacturing
    'origins',
    'origins_tags',
    'manufacturing_places',
    'manufacturing_places_tags',
    # labels and packaging codes
    'labels',
    'emb_codes',
    'emb_codes_tags',
    'first_packaging_code_geo',
    # places
    'cities',
    'cities_tags',
    'purchase_places',
    'stores',
    'countries',
    # free text
    'ingredients_text',
    'traces',
]
# 'categories' is commented out of the list, i.e. that column is kept:
# 'categories',

In [7]:
# Remove the columns that are not used for product review.
# drop() returns a new frame here (no inplace=True) and errors='ignore' skips labels
# that are already gone, so re-running this cell does not raise KeyError.
off = off.drop(columns=dropped_cols, errors='ignore')
# Keep only the columns whose name does not contain 'url' (negative look-ahead regex).
off = off.filter(regex=r'^((?!url).)*$', axis=1)
off.shape

(949695, 145)

In [8]:
# Keep only the products that have a name, then renumber the rows from 0.
off = off.dropna(subset=['product_name']).reset_index(drop=True)
off.shape

(872540, 145)

### Open Food Facts, NA Count

In [9]:
def count_na_pct(column, frame=None):
    """
    Percentage of missing (NA) values in one column.

    Parameters
    ----------
    column : label of the column to inspect.
    frame : DataFrame to inspect. When omitted, the notebook-level
        Open Food Facts frame `off` is used, as before.

    Returns
    -------
    The share of NA rows in `column`, on a 0-100 scale.
    """
    if frame is None:
        # default: the Open Food Facts frame defined at notebook level
        frame = off
    return frame[column].isna().sum() / frame.shape[0] * 100

In [10]:
# NA percentage of every column of `off`, keyed by column name
na = {name: count_na_pct(name) for name in off.columns}

In [11]:
# show the 10 columns with the lowest NA percentage
# (only some attributes have no NA rows at all)
sorted(na.items(), key=lambda item: item[1])[:10]

[('code', 0.0),
 ('product_name', 0.0),
 ('pnns_groups_2', 0.0),
 ('states', 0.0),
 ('states_tags', 0.0),
 ('states_en', 0.0),
 ('countries_tags', 0.11243037568478237),
 ('countries_en', 0.11243037568478237),
 ('pnns_groups_1', 1.0553097852247462),
 ('energy_100g', 15.387604006693104)]

In [12]:
# NA percentage of the `categories` column
count_na_pct('categories')

61.30377976940885

### Open Food Facts subset

#### Take product of "Jif" for example

In [13]:
# Jif products: names starting with "Jif" (any of three spellings) followed by whitespace
jif = off.loc[off['product_name'].str.match(r'^(JIF|Jif|jif)\s.*')]
jif['product_name'].head()

69980                       Jif Crema de Cacahuate Cremosa
69988               Jif Creme De Amdoim C / Chocolate 450G
69989             Jif Natural Crunchy Peanut Butter Spread
69994    Jif Cookies N Cream and Hazelnut Pate à Tartinner
69997                             Jif Peanut Butter Creamy
Name: product_name, dtype: object

In [15]:
# Drop `categories` and renumber the rows from 0.
# `jif` is rebound to the result instead of being mutated with inplace=True (it was
# taken as a slice of `off`), and errors='ignore' lets the cell be re-run after the
# column is already gone.
jif = jif.drop(columns='categories', errors='ignore').reset_index(drop=True)
# jif.product_name = jif.product_name.str.lower() # lowercase when extracting tokens
jif.shape

(15, 144)

#### Take product of "Cheetos" for example

In [16]:
# Cheetos products: names starting with "Cheetos" (any of three spellings) followed by
# whitespace; `categories` is removed and the rows are renumbered from 0.
cheetos = (
    off.loc[off['product_name'].str.match(r'^(Cheetos|CHEETOS|cheetos)\s.*')]
    .drop(columns='categories')
    .reset_index(drop=True)
)
# cheetos.product_name = cheetos.product_name.str.lower() 
cheetos.shape

(78, 144)

In [18]:
# distinct product names among the Cheetos rows
cheetos.product_name.unique()

array(['Cheetos Puffs', 'Cheetos Cheese Flavored Snacks Puffs',
       'Cheetos crunchy', 'Cheetos Crunchy Fromage',
       'Cheetos Puffs Large', "Cheetos puffs - Flamin' Hot",
       "Cheetos Cheese Flavored Snacks Crunchy, Flamin' Hot",
       'Cheetos Crunchy', 'Cheetos Crunchy Jalapeno',
       "Cheetos Crunchy Flamin'hot", 'Cheetos puff',
       'Cheetos Crunchy Dangerously Cheese',
       'Cheetos Crunchy Party Size!', "Cheetos Crunchy xxtra Flamin' Hot",
       "Cheetos Crunchy Flamin' Hot Limn",
       'Cheetos Cheddar Jalapeno Crunchy', "Cheetos Crunchy Flamin' Hot",
       'Cheetos Puffs Chese Flavor', "CHEETOS flamin' hot puffs",
       'Cheetos Snowflakes', 'Cheetos puffs', 'Cheetos puffs soufflés',
       'Cheetos jalapeno', 'Cheetos Crunchy Croquant',
       'Cheetos Jagung Bakar', 'Cheetos Crunch',
       'cheetos horneados crunchy sabor a queso', 'Cheetos Sticks',
       'Cheetos Goût Cacahuète', 'Cheetos Goût Fromage', 'Cheetos 21',
       'Cheetos Poulet Epicé', 'Che

## Amazon Reviews: Grocery dataset
+ https://registry.opendata.aws/amazon-reviews/
+ https://s3.amazonaws.com/amazon-reviews-pds/readme.html
+ http://jmcauley.ucsd.edu/data/amazon/

In [26]:
# Load the Amazon grocery reviews (gzip-compressed, tab-separated).
# The path is a raw string so its backslashes are never parsed as escape sequences.
data_path = r'D:\DATA\practice-dataset\gzipped'
amz = pd.read_csv(
    os.path.join(data_path, 'amazon_reviews_us_Grocery_v1_00.tsv.gz'),
    dtype={
        'customer_id': 'object', 'product_parent': 'object',
        'star_rating': 'object',
        # pd.Int64Dtype() allows NaN
        'helpful_votes': pd.Int64Dtype(), 'total_votes': pd.Int64Dtype(),
    },
    compression='gzip',
    sep='\t',
    # Skip malformed lines without raising or warning. This replaces
    # error_bad_lines=False / warn_bad_lines=False, which pandas 2.0 removed.
    on_bad_lines='skip',
)
# IF using kaggle server
# data_path = '/kaggle/input/amazon_reviews_us_grocery_v1_00.tsv'
# amz = pd.read_csv(os.path.join(data_path, 'amazon_reviews_us_Grocery_v1_00.tsv'),
#                   dtype={'customer_id': 'object', 'product_parent': 'object',
#                          'star_rating': 'object',
#                          'helpful_votes': 'object', 'total_votes': 'object'},
#                   sep='\t',
#                   on_bad_lines='skip')

+ dropped: 
  + `marketplace`: all data are from the US market
  + `product_category`: all data belong to the 'Grocery' category
  + `product_id`: overlaps with `product_parent` for some products
    + `product_parent` is more accurate for grouping reviews of the same product
    + but it still cannot differentiate between sizes

In [14]:
# Remove the three unused columns and row 1841896, which contains a date as star_rating.
# drop() returns a new frame here (no inplace=True) and errors='ignore' skips labels
# that are already gone, so re-running this cell does not raise KeyError.
amz = amz.drop(
    index=[1841896],
    columns=['marketplace', 'product_category', 'product_id'],
    errors='ignore',
)
amz.shape

(2393378, 12)

### Amazon Review subset

#### Take product of "jif" for example

In [15]:
# Reviews of Jif products: titles starting with "Jif" (any of three spellings) followed
# by whitespace, with the rows renumbered from 0.
jif_rev = (
    amz.loc[amz['product_title'].str.match(r'^(JIF|Jif|jif)\s.*')]
    .reset_index(drop=True)
)
# jif_rev.product_title = jif_rev.product_title.str.lower()
jif_rev.shape

(1413, 12)

In [16]:
# first 10 distinct product titles among the Jif reviews
jif_rev.product_title.unique()[:10]

array(['Jif Chocolate Nut Butter Almond Granola Bars, 5 Count Pack',
       'Jif Creamy Peanut Butter Granola Bar, 5 Ct',
       'Jif Creamy Peanut Butter Twin Pack, 80 Ounce',
       'Jif Whips Whipped Peanut Butter & Salty Caramel',
       'Jif Cashew Butter, Creamy, 12 Ounce',
       'Jif To Go Dippers with Pretzels, 3 Count',
       'Jif Peanut Powder, 6.5 Ounce', 'Jif Creamy Peanut Butter',
       'Jif Cashew Butter, Crunchy, 12 Ounce', 'Jif Peanut Butter'],
      dtype=object)

#### Take product of "cheetos" for example

In [17]:
# Reviews of Cheetos products: titles starting with "Cheetos" (any of three spellings)
# followed by whitespace, with the rows renumbered from 0.
che_rev = (
    amz.loc[amz['product_title'].str.match(r'^(Cheetos|cheetos|CHEETOS)\s.*')]
    .reset_index(drop=True)
)
# che_rev.product_title = che_rev.product_title.str.lower()
che_rev.shape

(496, 12)

In [18]:
# first 10 distinct product titles among the Cheetos reviews
che_rev.product_title.unique()[:10]

array(['Cheetos Crunchy - 50/1 oz. bags',
       'Cheetos Flavored Snacks, Crunchy Cheese, 1.13 Ounce (Pack of 12)',
       'Cheetos Crunchy Cheddar Jalapeno Cheese Flavored Snacks',
       'Cheetos Crunchy Cheese Flavored Snacks',
       "Cheetos Flamin' Hot and Doritos Dinamita Chile Limon 8.0 Oz [3 Pk]",
       "Cheetos Flamin' Hot - 50/1 oz",
       'Cheetos Sweetos Cinnamon Sugar Puffs Flavored Snacks, 7 oz (Set of 2)',
       "Cheetos Cheese Flavored Snacks, Crunchy Flamin' Hot, 2.38 Ounce (Pack of 12)",
       'Cheetos Cheese Flavored Snacks, Jumbo Puffs, 9.5 Ounce (Pack of 4)',
       'Cheetos Natural White Cheddar Puffs Cheese Flavored Snacks, 8oz Bags (Pack of 12)'],
      dtype=object)