In [86]:
import csv
import pandas as pd
import re
import sqlite3
from sqlite3 import Error
import ast 
import math
import os
import numpy as np

In [87]:
#Original datasets -df
receipts_df = pd.read_json('receipts.json', lines=True)
brands_df = pd.read_json('brands.json', lines=True)
users_df = pd.read_json('users.json', lines=True)

In [101]:
#Cleaned datasets -cdf
receipts_cdf = pd.read_csv("cleaned_receipts.csv")
users_cdf = pd.read_csv("cleaned_users.csv")
brands_cdf = pd.read_csv("cleaned_brands.csv")

### Receipts

In [111]:
receipts_df.head()

Unnamed: 0,_id,bonusPointsEarned,bonusPointsEarnedReason,createDate,dateScanned,finishedDate,modifyDate,pointsAwardedDate,pointsEarned,purchaseDate,purchasedItemCount,rewardsReceiptItemList,rewardsReceiptStatus,totalSpent,userId
0,{'$oid': '5ff1e1eb0a720f0523000575'},500.0,"Receipt number 2 completed, bonus point schedu...",{'$date': 1609687531000},{'$date': 1609687531000},{'$date': 1609687531000},{'$date': 1609687536000},{'$date': 1609687531000},500.0,{'$date': 1609632000000},5.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,26.0,5ff1e1eacfcf6c399c274ae6
1,{'$oid': '5ff1e1bb0a720f052300056b'},150.0,"Receipt number 5 completed, bonus point schedu...",{'$date': 1609687483000},{'$date': 1609687483000},{'$date': 1609687483000},{'$date': 1609687488000},{'$date': 1609687483000},150.0,{'$date': 1609601083000},2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,11.0,5ff1e194b6a9d73a3a9f1052
2,{'$oid': '5ff1e1f10a720f052300057a'},5.0,All-receipts receipt bonus,{'$date': 1609687537000},{'$date': 1609687537000},,{'$date': 1609687542000},,5.0,{'$date': 1609632000000},1.0,"[{'needsFetchReview': False, 'partnerItemId': ...",REJECTED,10.0,5ff1e1f1cfcf6c399c274b0b
3,{'$oid': '5ff1e1ee0a7214ada100056f'},5.0,All-receipts receipt bonus,{'$date': 1609687534000},{'$date': 1609687534000},{'$date': 1609687534000},{'$date': 1609687539000},{'$date': 1609687534000},5.0,{'$date': 1609632000000},4.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,28.0,5ff1e1eacfcf6c399c274ae6
4,{'$oid': '5ff1e1d20a7214ada1000561'},5.0,All-receipts receipt bonus,{'$date': 1609687506000},{'$date': 1609687506000},{'$date': 1609687511000},{'$date': 1609687511000},{'$date': 1609687506000},5.0,{'$date': 1609601106000},2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,1.0,5ff1e194b6a9d73a3a9f1052


#### 1): Check how many null values are in each column

Noticed that for **"rewardsReciptItemList"** column, which were used to store items information, **had a lot of null values (440)**; this might be an issue when analyze the item counts/price/information etc.

In [88]:
receipts_df.isna().sum()

_id                          0
bonusPointsEarned          575
bonusPointsEarnedReason    575
createDate                   0
dateScanned                  0
finishedDate               551
modifyDate                   0
pointsAwardedDate          582
pointsEarned               510
purchaseDate               448
purchasedItemCount         484
rewardsReceiptItemList     440
rewardsReceiptStatus         0
totalSpent                 435
userId                       0
dtype: int64

We can see that 434 were related to status of "submitted", **and 6 were related to others (which I guess they suppose to have an item list but do not)**

In [89]:
receipts_df[receipts_df['rewardsReceiptItemList'].isnull()].groupby('rewardsReceiptStatus').size()

rewardsReceiptStatus
FINISHED       2
PENDING        1
REJECTED       3
SUBMITTED    434
dtype: int64

In [90]:
conn = sqlite3.connect("receipts.db")
cursor = conn.cursor()

#### 2): Check missing information in rewardsReceiptItemList column

If we take a closer look at item within item list, there are a few issue raised when creating the relational database schema:
- 1): item has different keys combination within it's dictionary, **not all item has a barcode (missing unique itentifier for some items)** thus some item will not be found in the item table if it doesn't have a barcode
- 2): **not all item has a brand code, 148 items missing brand code even after I use "description" to map some of the brand code to the item table**, moreover, since brand code is not the unique itentifier in brand table (brand id is), I will need to use brand code to find brand id in order to connect these 2 tables, but brand code might corresponding to different brand ids
    - this might casue issues like lower item counts by brand code

In [91]:
cursor.execute("""
select count(*) 
from item
where brand_code is null
""")
answer = cursor.fetchall()
answer

[(148,)]

### Brands

In [112]:
brands_df.head()

Unnamed: 0,_id,barcode,category,categoryCode,cpg,name,topBrand,brandCode
0,{'$oid': '601ac115be37ce2ead437551'},511111019862,Baking,BAKING,"{'$id': {'$oid': '601ac114be37ce2ead437550'}, ...",test brand @1612366101024,0.0,
1,{'$oid': '601c5460be37ce2ead43755f'},511111519928,Beverages,BEVERAGES,"{'$id': {'$oid': '5332f5fbe4b03c9a25efd0ba'}, ...",Starbucks,0.0,STARBUCKS
2,{'$oid': '601ac142be37ce2ead43755d'},511111819905,Baking,BAKING,"{'$id': {'$oid': '601ac142be37ce2ead437559'}, ...",test brand @1612366146176,0.0,TEST BRANDCODE @1612366146176
3,{'$oid': '601ac142be37ce2ead43755a'},511111519874,Baking,BAKING,"{'$id': {'$oid': '601ac142be37ce2ead437559'}, ...",test brand @1612366146051,0.0,TEST BRANDCODE @1612366146051
4,{'$oid': '601ac142be37ce2ead43755e'},511111319917,Candy & Sweets,CANDY_AND_SWEETS,"{'$id': {'$oid': '5332fa12e4b03c9a25efd1e7'}, ...",test brand @1612366146827,0.0,TEST BRANDCODE @1612366146827


If we take a closer look into brand table, **234 brandCode were missing**, **35 were stored nothing/space as brand so will be cleaned as null value in cleaning step 1, total were 269 missing**. This was related to the item issue mentioned previously

In [92]:
brands_df['brandCode'].value_counts()

                                         35
GOODNITES                                 2
HUGGIES                                   2
WHEATIES                                  1
ALKA SELTZER PLUS SINUS CAP/ GEL/ TAB     1
                                         ..
TEST BRANDCODE @1610058181549             1
TEST BRANDCODE @1598716509358             1
TEST BRANDCODE @1610049748118             1
GOLDEN GRAHAMS                            1
ABSOLUT® ORIGINAL                         1
Name: brandCode, Length: 897, dtype: int64

In [93]:
cursor.execute(f"""
select count(brand_id)
from brand
where brand_code is null
""")
answer = cursor.fetchall()
answer

[(269,)]

brand id is unique identifier, same brand name might has multiple brand ids, same barcode might has multiple brand ids, this might be very confusing when join to another table to find brand

In [110]:
cursor.execute(f"""
select count(distinct(brand_id)), count(distinct(barcode)), count(distinct(name)), count(distinct(brand_code))
from brand
""")
answer = cursor.fetchall()
answer

[(1167, 1160, 1156, 896)]

### Users

In [114]:
users_df.head()

Unnamed: 0,_id,active,createdDate,lastLogin,role,signUpSource,state
0,{'$oid': '5ff1e194b6a9d73a3a9f1052'},True,{'$date': 1609687444800},{'$date': 1609687537858},consumer,Email,WI
1,{'$oid': '5ff1e194b6a9d73a3a9f1052'},True,{'$date': 1609687444800},{'$date': 1609687537858},consumer,Email,WI
2,{'$oid': '5ff1e194b6a9d73a3a9f1052'},True,{'$date': 1609687444800},{'$date': 1609687537858},consumer,Email,WI
3,{'$oid': '5ff1e1eacfcf6c399c274ae6'},True,{'$date': 1609687530554},{'$date': 1609687530597},consumer,Email,WI
4,{'$oid': '5ff1e194b6a9d73a3a9f1052'},True,{'$date': 1609687444800},{'$date': 1609687537858},consumer,Email,WI


Over 200 duplicated rows in users table, cleaned in step 1 data notebook

In [99]:
users_df.count()

_id             495
active          495
createdDate     495
lastLogin       433
role            495
signUpSource    447
state           439
dtype: int64

In [100]:
users_cdf.count()

_id             212
active          212
createdDate     212
lastLogin       172
role            212
signUpSource    207
state           206
dtype: int64

**around 148 receipt** doesn't have a corresponding user in user table

In [105]:
cursor.execute(f"""
select count(receipt_id)
from receipt r
left join user u
on r.user_id = u.user_id
where u.user_id is null
""")
answer = cursor.fetchall()
answer

[(148,)]