#### Imports

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
import numpy as np

#### Load Data

In [3]:
users = pd.read_json('./data/users.json/users.json', lines=True)
brands = pd.read_json('./data/brands.json/brands.json', lines=True)
reciepts = pd.read_json('./data/receipts.json/receipts.json', lines=True)

#### Users Dataset Analysis

In [4]:
#Explore users dataset on  a high level
users.head(5)

Unnamed: 0,_id,active,createdDate,lastLogin,role,signUpSource,state
0,{'$oid': '5ff1e194b6a9d73a3a9f1052'},True,{'$date': 1609687444800},{'$date': 1609687537858},consumer,Email,WI
1,{'$oid': '5ff1e194b6a9d73a3a9f1052'},True,{'$date': 1609687444800},{'$date': 1609687537858},consumer,Email,WI
2,{'$oid': '5ff1e194b6a9d73a3a9f1052'},True,{'$date': 1609687444800},{'$date': 1609687537858},consumer,Email,WI
3,{'$oid': '5ff1e1eacfcf6c399c274ae6'},True,{'$date': 1609687530554},{'$date': 1609687530597},consumer,Email,WI
4,{'$oid': '5ff1e194b6a9d73a3a9f1052'},True,{'$date': 1609687444800},{'$date': 1609687537858},consumer,Email,WI


In [5]:
users.dtypes  

_id             object
active            bool
createdDate     object
lastLogin       object
role            object
signUpSource    object
state           object
dtype: object

In [6]:
users.shape

(495, 7)

In [7]:
non_dic_columns=['active','role','signUpSource','state']

print("Unique values in dict columns:")
for col in non_dic_columns:
    print(col,'---', users[col].unique())

Unique values in dict columns:
active --- [ True False]
role --- ['consumer' 'fetch-staff']
signUpSource --- ['Email' 'Google' nan]
state --- ['WI' 'KY' 'AL' 'CO' 'IL' nan 'OH' 'SC' 'NH']


In [31]:
def convert_to_string(val):
    return str(val)

users['id'] = users['_id'].apply(convert_to_string)

print('Unique user ids:', users.id.nunique())
print('Total user ids:', len(users.id))


Unique user ids: 212
Total user ids: 495


#### Observations Users Dataset:
    
1. Documentation says-  users.`role: constant value set to 'CONSUMER'`. But 2 unique values found in dataset.
2. SignUpSource and State has nan values. #### Is this important?
3. The users table should only have unique user ids to avoid redundancy of data and efficient qquerying.
4. All dates are a numeric value. Unknown if dates were lost in conversion or dummy data.

#### Reciepts Dataset Analysis

In [7]:
#Explore reciepts dataset on  a high level
reciepts.head(5)

Unnamed: 0,_id,bonusPointsEarned,bonusPointsEarnedReason,createDate,dateScanned,finishedDate,modifyDate,pointsAwardedDate,pointsEarned,purchaseDate,purchasedItemCount,rewardsReceiptItemList,rewardsReceiptStatus,totalSpent,userId
0,{'$oid': '5ff1e1eb0a720f0523000575'},500.0,"Receipt number 2 completed, bonus point schedu...",{'$date': 1609687531000},{'$date': 1609687531000},{'$date': 1609687531000},{'$date': 1609687536000},{'$date': 1609687531000},500.0,{'$date': 1609632000000},5.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,26.0,5ff1e1eacfcf6c399c274ae6
1,{'$oid': '5ff1e1bb0a720f052300056b'},150.0,"Receipt number 5 completed, bonus point schedu...",{'$date': 1609687483000},{'$date': 1609687483000},{'$date': 1609687483000},{'$date': 1609687488000},{'$date': 1609687483000},150.0,{'$date': 1609601083000},2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,11.0,5ff1e194b6a9d73a3a9f1052
2,{'$oid': '5ff1e1f10a720f052300057a'},5.0,All-receipts receipt bonus,{'$date': 1609687537000},{'$date': 1609687537000},,{'$date': 1609687542000},,5.0,{'$date': 1609632000000},1.0,"[{'needsFetchReview': False, 'partnerItemId': ...",REJECTED,10.0,5ff1e1f1cfcf6c399c274b0b
3,{'$oid': '5ff1e1ee0a7214ada100056f'},5.0,All-receipts receipt bonus,{'$date': 1609687534000},{'$date': 1609687534000},{'$date': 1609687534000},{'$date': 1609687539000},{'$date': 1609687534000},5.0,{'$date': 1609632000000},4.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,28.0,5ff1e1eacfcf6c399c274ae6
4,{'$oid': '5ff1e1d20a7214ada1000561'},5.0,All-receipts receipt bonus,{'$date': 1609687506000},{'$date': 1609687506000},{'$date': 1609687511000},{'$date': 1609687511000},{'$date': 1609687506000},5.0,{'$date': 1609601106000},2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,1.0,5ff1e194b6a9d73a3a9f1052


In [8]:
reciepts.dtypes  

_id                         object
bonusPointsEarned          float64
bonusPointsEarnedReason     object
createDate                  object
dateScanned                 object
finishedDate                object
modifyDate                  object
pointsAwardedDate           object
pointsEarned               float64
purchaseDate                object
purchasedItemCount         float64
rewardsReceiptItemList      object
rewardsReceiptStatus        object
totalSpent                 float64
userId                      object
dtype: object

In [9]:
reciepts.shape

(1119, 15)

In [10]:
reciepts.rewardsReceiptStatus.unique()

array(['FINISHED', 'REJECTED', 'FLAGGED', 'SUBMITTED', 'PENDING'],
      dtype=object)

#### Observations

1. Some queries in Part 2 are based on `Status = 'Accepted'`. There is no such data in the dataset. Might be an data anomaly.
2. Multiple date columns. Based on what they represent and business cases some can may be dropped to reduce redundancy. <br>
e.g. createDate: The date that the event was created <br>
     purchaseDate: the date of the purchase<br>
In real world, most likely these should be the same date.



In [11]:
reciepts.bonusPointsEarnedReason.value_counts()

All-receipts receipt bonus                                                             183
Receipt number 1 completed, bonus point schedule DEFAULT (5cefdcacf3693e0b50e83a36)    119
COMPLETE_NONPARTNER_RECEIPT                                                             71
COMPLETE_PARTNER_RECEIPT                                                                39
Receipt number 3 completed, bonus point schedule DEFAULT (5cefdcacf3693e0b50e83a36)     31
Receipt number 2 completed, bonus point schedule DEFAULT (5cefdcacf3693e0b50e83a36)     30
Receipt number 5 completed, bonus point schedule DEFAULT (5cefdcacf3693e0b50e83a36)     27
Receipt number 4 completed, bonus point schedule DEFAULT (5cefdcacf3693e0b50e83a36)     26
Receipt number 6 completed, bonus point schedule DEFAULT (5cefdcacf3693e0b50e83a36)     18
Name: bonusPointsEarnedReason, dtype: int64

In [12]:
df_temp =reciepts[['bonusPointsEarnedReason','bonusPointsEarned','pointsEarned']]
df_temp = df_temp.drop_duplicates()
df_temp[df_temp['bonusPointsEarned'] == df_temp['pointsEarned']]

Unnamed: 0,bonusPointsEarnedReason,bonusPointsEarned,pointsEarned
0,"Receipt number 2 completed, bonus point schedu...",500.0,500.0
1,"Receipt number 5 completed, bonus point schedu...",150.0,150.0
2,All-receipts receipt bonus,5.0,5.0
5,"Receipt number 1 completed, bonus point schedu...",750.0,750.0
9,"Receipt number 3 completed, bonus point schedu...",250.0,250.0
10,"Receipt number 6 completed, bonus point schedu...",100.0,100.0
14,"Receipt number 4 completed, bonus point schedu...",300.0,300.0
47,COMPLETE_NONPARTNER_RECEIPT,25.0,25.0


In [13]:
df_temp[df_temp['bonusPointsEarned'] != df_temp['pointsEarned']]

Unnamed: 0,bonusPointsEarnedReason,bonusPointsEarned,pointsEarned
12,"Receipt number 5 completed, bonus point schedu...",150.0,8850.0
15,,,
20,"Receipt number 4 completed, bonus point schedu...",300.0,389.2
23,COMPLETE_PARTNER_RECEIPT,40.0,185.0
27,COMPLETE_NONPARTNER_RECEIPT,25.0,35.0
...,...,...,...
643,,,209.8
664,,,210.0
667,,,209.5
917,"Receipt number 1 completed, bonus point schedu...",750.0,789.2


##### Comments: 

A clear understanding of how the data is saved to `bonusPointsEarned` and `pointsEarned` cannot be gained. Sometimes the value is same and sometimes it is not. There could possibly be a better way to store this data.

In [33]:
reciepts.rewardsReceiptItemList

0       [{'barcode': '4011', 'description': 'ITEM NOT ...
1       [{'barcode': '4011', 'description': 'ITEM NOT ...
2       [{'needsFetchReview': False, 'partnerItemId': ...
3       [{'barcode': '4011', 'description': 'ITEM NOT ...
4       [{'barcode': '4011', 'description': 'ITEM NOT ...
                              ...                        
1114    [{'barcode': 'B076FJ92M4', 'description': 'mue...
1115                                                  NaN
1116                                                  NaN
1117    [{'barcode': 'B076FJ92M4', 'description': 'mue...
1118                                                  NaN
Name: rewardsReceiptItemList, Length: 1119, dtype: object

##### Comments :

reciepts violates 1NF due to presence of nested data/array. Hence it is not a relational model.
Relation Decomposition is necessary to convert it into a relational model.

##### Test data integrity of data

In [14]:
#udf for aggregating total finalPrice of each item in reciept item list of a given reciept

def agg_finalPrice_receipt_itms(row):
    columns = ['barcode','description','finalPrice','quantityPurchased']
    df_rct_itms = pd.DataFrame(columns = columns)
    data = []

    for dic in row['rewardsReceiptItemList']:
#             for dic in item:
                values =[dic.get('barcode')
                        ,dic.get('description')
                        ,float(dic.get('finalPrice'))
                        ,dic.get('quantityPurchased')]
                zipped = zip(columns, values)
                a_dictionary = dict(zipped)
                data.append(a_dictionary)

    df_rct_itms = df_rct_itms.append(data, True)
    return [row.totalSpent,round(df_rct_itms.finalPrice.sum(),2)]

In [15]:
reciepts.iloc[[423]].apply(agg_finalPrice_receipt_itms, axis=1)
reciepts.iloc[[650]].apply(agg_finalPrice_receipt_itms, axis=1)

423    [1183.1, 1183.1]
dtype: object

650    [28.0, 28.0]
dtype: object

##### Noteworthy:
This code could be further enhanced to make a mapping of reciept index, totalSpent, sum(finalPriceofRctItems) on entire dataset

In [16]:
reciepts.apply(agg_finalPrice_receipt_itms, axis=1)

TypeError: float() argument must be a string or a number, not 'NoneType'

In [24]:
for item in reciepts.iloc[[423]].rewardsReceiptItemList:
     for dic in item:
        print([k  for  k in  dic.keys()])

['barcode', 'brandCode', 'description', 'discountedItemPrice', 'finalPrice', 'itemPrice', 'metabriteCampaignId', 'originalReceiptItemText', 'partnerItemId', 'pointsEarned', 'pointsPayerId', 'quantityPurchased', 'rewardsGroup', 'rewardsProductPartnerId']
['description', 'discountedItemPrice', 'finalPrice', 'itemPrice', 'originalReceiptItemText', 'partnerItemId', 'quantityPurchased']
['description', 'discountedItemPrice', 'finalPrice', 'itemPrice', 'originalReceiptItemText', 'partnerItemId', 'quantityPurchased']
['barcode', 'brandCode', 'description', 'discountedItemPrice', 'finalPrice', 'itemPrice', 'metabriteCampaignId', 'originalReceiptItemText', 'partnerItemId', 'pointsNotAwardedReason', 'pointsPayerId', 'quantityPurchased', 'rewardsGroup', 'rewardsProductPartnerId']
['barcode', 'brandCode', 'description', 'discountedItemPrice', 'finalPrice', 'itemPrice', 'metabriteCampaignId', 'originalReceiptItemText', 'partnerItemId', 'pointsNotAwardedReason', 'pointsPayerId', 'quantityPurchased',

##### Comments: 
As seen from previous code, recieptItemList has non-uniform dictionaries. <br>

Item Number which could be potential identifier for each reciept list item is sparsely populated. This was observed in Tableau.

Some have barcode, finalPrice while some don't making it difficult to check data integrity among <br>
sum of individual items in recieptItemList and compare against totalSpent in reciept.

This data issue needs to be first taken care of before futher enhancing the user defined function <br>
to conduct data integrity checks.

Similar checks can be designed for item quantity at receipt and receipt item levels too.