In [1]:
import pandas as pd
import pickle

In [2]:
################################################
################################################
################################################
################################################
# BRANDS DATA #
################################################
################################################
################################################
################################################

In [3]:
## Original JSON files are not formatted ###
## Formatted with process_date.py ##
## Loading newly formatted data ##
with open("/Users/evro/Documents/code/python/fetch/data/cleaned/cleaned_brands.json") as f:
    brand_data = pd.read_json(f)
brands = pd.DataFrame(brand_data)

In [4]:
brands.head()

Unnamed: 0,_id,barcode,category,categoryCode,cpg,name,topBrand,brandCode
0,{'$oid': '601ac115be37ce2ead437551'},511111019862,Baking,BAKING,"{'$id': {'$oid': '601ac114be37ce2ead437550'}, ...",test brand @1612366101024,0.0,
1,{'$oid': '601c5460be37ce2ead43755f'},511111519928,Beverages,BEVERAGES,"{'$id': {'$oid': '5332f5fbe4b03c9a25efd0ba'}, ...",Starbucks,0.0,STARBUCKS
2,{'$oid': '601ac142be37ce2ead43755d'},511111819905,Baking,BAKING,"{'$id': {'$oid': '601ac142be37ce2ead437559'}, ...",test brand @1612366146176,0.0,TEST BRANDCODE @1612366146176
3,{'$oid': '601ac142be37ce2ead43755a'},511111519874,Baking,BAKING,"{'$id': {'$oid': '601ac142be37ce2ead437559'}, ...",test brand @1612366146051,0.0,TEST BRANDCODE @1612366146051
4,{'$oid': '601ac142be37ce2ead43755e'},511111319917,Candy & Sweets,CANDY_AND_SWEETS,"{'$id': {'$oid': '5332fa12e4b03c9a25efd1e7'}, ...",test brand @1612366146827,0.0,TEST BRANDCODE @1612366146827


In [5]:
######################## BRAND DATA NORMALIZE & CLEANING ########################

In [6]:
brands.dtypes

_id              object
barcode           int64
category         object
categoryCode     object
cpg              object
name             object
topBrand        float64
brandCode        object
dtype: object

In [7]:
# Convert user id to string
brands["_id"] = brands["_id"].apply(lambda x: x.get("$oid", str(x)))

In [8]:
# Extract cpg_id and cpg_ref from the nested dictionary in the "cpg" column
brands["cpg_id"] = brands["cpg"].apply(lambda x: x.get("$id", {}).get("$oid") if isinstance(x, dict) else None)
brands["cpg_ref"] = brands["cpg"].apply(lambda x: x.get("$ref") if isinstance(x, dict) else None)

# Drop the original "cpg" column
brands.drop(columns=["cpg"], inplace=True)

In [9]:
# Convert objects to strings
brands = brands.astype({col: "string" for col in brands.select_dtypes(include=["object"]).columns})

In [10]:
brands.head()

Unnamed: 0,_id,barcode,category,categoryCode,name,topBrand,brandCode,cpg_id,cpg_ref
0,601ac115be37ce2ead437551,511111019862,Baking,BAKING,test brand @1612366101024,0.0,,601ac114be37ce2ead437550,Cogs
1,601c5460be37ce2ead43755f,511111519928,Beverages,BEVERAGES,Starbucks,0.0,STARBUCKS,5332f5fbe4b03c9a25efd0ba,Cogs
2,601ac142be37ce2ead43755d,511111819905,Baking,BAKING,test brand @1612366146176,0.0,TEST BRANDCODE @1612366146176,601ac142be37ce2ead437559,Cogs
3,601ac142be37ce2ead43755a,511111519874,Baking,BAKING,test brand @1612366146051,0.0,TEST BRANDCODE @1612366146051,601ac142be37ce2ead437559,Cogs
4,601ac142be37ce2ead43755e,511111319917,Candy & Sweets,CANDY_AND_SWEETS,test brand @1612366146827,0.0,TEST BRANDCODE @1612366146827,5332fa12e4b03c9a25efd1e7,Cogs


In [11]:
brands.dtypes

_id             string[python]
barcode                  int64
category        string[python]
categoryCode    string[python]
name            string[python]
topBrand               float64
brandCode       string[python]
cpg_id          string[python]
cpg_ref         string[python]
dtype: object

In [12]:
######################## VALIDATE RECEIPT ITEM DATA ########################

In [13]:
count_cols = ["category", "categoryCode", "name", "brandCode", "cpg_id"]
for col in count_cols:
    counts = brands[col].value_counts(dropna=False)
    if not counts.empty:
        print(f"\n🔍 Value counts for '{col}':")
        print(counts.to_string())


🔍 Value counts for 'category':
category
Baking                         369
<NA>                           155
Beer Wine Spirits               90
Snacks                          75
Candy & Sweets                  71
Beverages                       63
Magazines                       44
Health & Wellness               44
Breakfast & Cereal              40
Grocery                         39
Dairy                           33
Condiments & Sauces             27
Frozen                          24
Personal Care                   20
Baby                            18
Canned Goods & Soups            12
Beauty                           9
Cleaning & Home Improvement      6
Deli                             6
Beauty & Personal Care           6
Household                        5
Bread & Bakery                   5
Dairy & Refrigerated             5
Outdoor                          1

🔍 Value counts for 'categoryCode':
categoryCode
<NA>                             650
BAKING                           

In [14]:
# What does this float signify in top brand? Should this be a bool?
brands["topBrand"].value_counts()

topBrand
0.0    524
1.0     31
Name: count, dtype: int64

In [15]:
# I am assuming top brand should be a bool, so I'm converting it.
brands["topBrand"] = brands["topBrand"].astype(bool)

In [16]:
######################## BRANDS DATA ISSUES ########################

In [17]:
# Where are the nulls
brands.isnull().sum()

_id               0
barcode           0
category        155
categoryCode    650
name              0
topBrand          0
brandCode       234
cpg_id            0
cpg_ref           0
dtype: int64

In [18]:
# Are there any dups?
dup_cols = ["_id", "barcode", "name", "brandCode", "cpg_id", "categoryCode"]
for col in dup_cols:
    dups = brands[brands.duplicated(subset=[col], keep=False)]
    if not dups.empty:
        print(f"\n🔍 Duplicate records for '{col}':")
        display(dups)



🔍 Duplicate records for 'barcode':


Unnamed: 0,_id,barcode,category,categoryCode,name,topBrand,brandCode,cpg_id,cpg_ref
9,5c408e8bcd244a1fdb47aee7,511111504788,Baking,,test,True,TEST,59ba6f1ce4b092b29c167346,Cogs
20,5c4699f387ff3577e203ea29,511111305125,Baby,,Chris Image Test,True,CHRISIMAGE,55b62995e4b0d8e685c14213,Cogs
129,5a7e0604e4b0aedb3b84afd3,511111504139,Beverages,,Chris Brand XYZ,True,CHRISXYZ,55b62995e4b0d8e685c14213,Cogs
152,5c45f91b87ff3552f950f027,511111204923,Grocery,,Brand1,True,0987654321,5c45f8b087ff3552f950f026,Cogs
194,5d6415d5a3a018514994f429,511111605058,Magazines,,Health Magazine,True,511111605058,5d5d4fd16d5f3b23d1bc7905,Cogs
299,5a8c33f3e4b07f0a2dac8943,511111504139,Grocery,,Pace,False,PACE,5a734034e4b0d58f376be874,Cogs
412,5ccb2ece166eb31bbbadccbe,511111504788,Condiments & Sauces,,The Pioneer Woman,True,PIONEER WOMAN,559c2234e4b06aca36af13c6,Cogs
467,5c409ab4cd244a3539b84162,511111004790,Baking,,alexa,True,ALEXA,55b62995e4b0d8e685c14213,Cogs
536,5d6027f46d5f3b23d1bc7906,511111204923,Snacks,,CHESTER'S,True,CHESTERS,5332f5fbe4b03c9a25efd0ba,Cogs
651,5d642d65a3a018514994f42d,511111305125,Magazines,,Rachael Ray Everyday,True,511111305125,5d5d4fd16d5f3b23d1bc7905,Cogs



🔍 Duplicate records for 'name':


Unnamed: 0,_id,barcode,category,categoryCode,name,topBrand,brandCode,cpg_id,cpg_ref
64,5da609991dda2c3e1416ae90,511111805854,Health & Wellness,,ONE A DAY® WOMENS,False,511111805854,53e10d6368abd3c7065097cc,Cogs
126,5bd201a990fa074576779a19,511111104698,Baby,,Pull-Ups,False,PULL UPS,550b2565e4b001d5e9e4146f,Cogs
140,5a4d23dae4b0bcb2c74ea77e,511111000518,Beverages,,Caleb's Kola,False,CALEB'S KOLA,5332f5fbe4b03c9a25efd0ba,Cogs
176,592486bee410d61fcea3d12d,511111700814,Dairy,,I CAN'T BELIEVE IT'S NOT BUTTER!,True,I CAN'T BELIEVE IT'S NOT BUTTER!,53e10d6368abd3c7065097cc,Cogs
194,5d6415d5a3a018514994f429,511111605058,Magazines,,Health Magazine,True,511111605058,5d5d4fd16d5f3b23d1bc7905,Cogs
282,5332f608e4b03c9a25efd0c1,511111903901,,,Sierra Mist,True,,53e10d6368abd3c7065097cc,Cpgs
339,5e5ff265ee7f2d0b35b2a18f,511111914051,Health & Wellness,,ONE A DAY® WOMENS,True,ONE A DAY® WOMENS,53e10d6368abd3c7065097cc,Cogs
477,5bcdfc5a965c7d66d92731e9,511111304616,Beverages,,V8 Hydrate,True,,53e10d6368abd3c7065097cc,Cogs
574,5d9d08d1a60b87376833e348,511111605546,Snacks,,Baken-Ets,True,BAKEN ETS,5332f5fbe4b03c9a25efd0ba,Cogs
596,5f298852be37ce7958c5952d,511111915287,Magazines,MAGAZINES,Health Magazine,True,HEALTH,5d66b9dcee7f2d201c7281cd,Cogs



🔍 Duplicate records for 'brandCode':


Unnamed: 0,_id,barcode,category,categoryCode,name,topBrand,brandCode,cpg_id,cpg_ref
0,601ac115be37ce2ead437551,511111019862,Baking,BAKING,test brand @1612366101024,False,,601ac114be37ce2ead437550,Cogs
11,57c08106e4b0718ff5fcb02c,511111102540,,,MorningStar,True,,5332f5f2e4b03c9a25efd0aa,Cpgs
18,5fb28549be37ce522e165cb5,511111317364,Baking,BAKING,test brand @1605535049181,False,,5fb28549be37ce522e165cb4,Cogs
23,5332f5fee4b03c9a25efd0bd,511111303947,,,Bottled Starbucks,True,,53e10d6368abd3c7065097cc,Cpgs
24,5332fa7ce4b03c9a25efd22e,511111802914,,,Full Throttle,True,,5332f5ebe4b03c9a25efd0a8,Cpgs
...,...,...,...,...,...,...,...,...,...
1144,57c08242e4b0718ff5fcb032,511111202516,,,Corona,True,,5332f7a7e4b03c9a25efd134,Cpgs
1146,5332fa12e4b03c9a25efd1e6,511111703105,,,Bellatoria,True,,5332fa12e4b03c9a25efd1e7,Cpgs
1157,5332fa75e4b03c9a25efd221,511111303015,,,DASANI,True,,5332f5ebe4b03c9a25efd0a8,Cpgs
1159,585a96cbe4b03e62d1ce0e88,511111501619,Beverages,,Pepsi Max,False,,5332f5fbe4b03c9a25efd0ba,Cogs



🔍 Duplicate records for 'cpg_id':


Unnamed: 0,_id,barcode,category,categoryCode,name,topBrand,brandCode,cpg_id,cpg_ref
1,601c5460be37ce2ead43755f,511111519928,Beverages,BEVERAGES,Starbucks,False,STARBUCKS,5332f5fbe4b03c9a25efd0ba,Cogs
2,601ac142be37ce2ead43755d,511111819905,Baking,BAKING,test brand @1612366146176,False,TEST BRANDCODE @1612366146176,601ac142be37ce2ead437559,Cogs
3,601ac142be37ce2ead43755a,511111519874,Baking,BAKING,test brand @1612366146051,False,TEST BRANDCODE @1612366146051,601ac142be37ce2ead437559,Cogs
4,601ac142be37ce2ead43755e,511111319917,Candy & Sweets,CANDY_AND_SWEETS,test brand @1612366146827,False,TEST BRANDCODE @1612366146827,5332fa12e4b03c9a25efd1e7,Cogs
5,601ac142be37ce2ead43755b,511111719885,Baking,BAKING,test brand @1612366146091,False,TEST BRANDCODE @1612366146091,601ac142be37ce2ead437559,Cogs
...,...,...,...,...,...,...,...,...,...
1160,5887a216e4b02187f85cdad5,511111401155,Deli,,Claussen,False,CLAUSSEN,559c2234e4b06aca36af13c6,Cogs
1161,5332f709e4b03c9a25efd0f2,511111403845,Beer Wine Spirits,,Blue Moon,False,BLUE MOON,5332f709e4b03c9a25efd0f1,Cogs
1163,5dc1fca91dda2c0ad7da64ae,511111706328,Breakfast & Cereal,,Dippin Dots® Cereal,True,DIPPIN DOTS CEREAL,53e10d6368abd3c7065097cc,Cogs
1164,5f494c6e04db711dd8fe87e7,511111416173,Candy & Sweets,CANDY_AND_SWEETS,test brand @1598639215217,True,TEST BRANDCODE @1598639215217,5332fa12e4b03c9a25efd1e7,Cogs



🔍 Duplicate records for 'categoryCode':


Unnamed: 0,_id,barcode,category,categoryCode,name,topBrand,brandCode,cpg_id,cpg_ref
0,601ac115be37ce2ead437551,511111019862,Baking,BAKING,test brand @1612366101024,False,,601ac114be37ce2ead437550,Cogs
2,601ac142be37ce2ead43755d,511111819905,Baking,BAKING,test brand @1612366146176,False,TEST BRANDCODE @1612366146176,601ac142be37ce2ead437559,Cogs
3,601ac142be37ce2ead43755a,511111519874,Baking,BAKING,test brand @1612366146051,False,TEST BRANDCODE @1612366146051,601ac142be37ce2ead437559,Cogs
4,601ac142be37ce2ead43755e,511111319917,Candy & Sweets,CANDY_AND_SWEETS,test brand @1612366146827,False,TEST BRANDCODE @1612366146827,5332fa12e4b03c9a25efd1e7,Cogs
5,601ac142be37ce2ead43755b,511111719885,Baking,BAKING,test brand @1612366146091,False,TEST BRANDCODE @1612366146091,601ac142be37ce2ead437559,Cogs
...,...,...,...,...,...,...,...,...,...
1162,5f77274dbe37ce6b592e90c0,511111116752,Baking,BAKING,test brand @1601644365844,True,,5f77274dbe37ce6b592e90bf,Cogs
1163,5dc1fca91dda2c0ad7da64ae,511111706328,Breakfast & Cereal,,Dippin Dots® Cereal,True,DIPPIN DOTS CEREAL,53e10d6368abd3c7065097cc,Cogs
1164,5f494c6e04db711dd8fe87e7,511111416173,Candy & Sweets,CANDY_AND_SWEETS,test brand @1598639215217,True,TEST BRANDCODE @1598639215217,5332fa12e4b03c9a25efd1e7,Cogs
1165,5a021611e4b00efe02b02a57,511111400608,Grocery,,LIPTON TEA Leaves,False,LIPTON TEA Leaves,5332f5f6e4b03c9a25efd0b4,Cogs


In [19]:
# Count the number of missing category codes for each category.
# Identifies categories where category codes are frequently missing indicating inconsistencies in data entry.
brands.groupby("category")["categoryCode"].apply(lambda x: x.isnull().sum()).sort_values(ascending=False)

category
Snacks                         75
Beverages                      62
Beer Wine Spirits              59
Magazines                      43
Breakfast & Cereal             40
Dairy                          33
Health & Wellness              30
Grocery                        28
Condiments & Sauces            27
Frozen                         23
Personal Care                  16
Canned Goods & Soups           12
Baby                           11
Baking                         10
Beauty                          9
Deli                            6
Beauty & Personal Care          6
Household                       5
Cleaning & Home Improvement     0
Candy & Sweets                  0
Dairy & Refrigerated            0
Bread & Bakery                  0
Outdoor                         0
Name: categoryCode, dtype: int64

In [20]:
# Count the number of missing brand codes for each category.
# Identifies categories where brand codes are frequently missing indicating inconsistencies in data entry.
brands.groupby("category")["brandCode"].apply(lambda x: x.isnull().sum()).sort_values(ascending=False)

category
Baking                         73
Snacks                          3
Beverages                       3
Dairy                           2
Magazines                       1
Grocery                         1
Dairy & Refrigerated            0
Personal Care                   0
Outdoor                         0
Household                       0
Health & Wellness               0
Frozen                          0
Deli                            0
Baby                            0
Cleaning & Home Improvement     0
Canned Goods & Soups            0
Candy & Sweets                  0
Breakfast & Cereal              0
Bread & Bakery                  0
Beer Wine Spirits               0
Beauty & Personal Care          0
Beauty                          0
Condiments & Sauces             0
Name: brandCode, dtype: int64

In [21]:
import os

output_dir = "/Users/evro/Documents/code/python/fetch/data/validated"

os.makedirs(output_dir, exist_ok=True)

# Save the pickle file in the specified directory
output_file_pkl = os.path.join(output_dir, "brands.pkl")
output_file_csv = os.path.join(output_dir, "brands.csv")

with open(output_file_pkl, "wb") as f:
    pickle.dump(brands, f)

with open(output_file_csv, "w") as f:
    brands.to_csv(f, index=False)