In [11]:
import pandas as pd
import pickle

In [12]:
with open("/Users/evro/Documents/code/python/fetch/data/validated/users.pkl", "rb") as f:
    users = pickle.load(f)

In [13]:
################################################
################################################
################################################
################################################
# RECEIPT DATA #
################################################
################################################
################################################
################################################

In [14]:
## Original JSON files are not formatted ###
## Formatted with process_date.py ##
## Loading newly formatted data ##
with open("/Users/evro/Documents/code/python/fetch/data/cleaned/cleaned_receipts.json") as f:
    receipt_data = pd.read_json(f)
receipt = pd.DataFrame(receipt_data)

In [15]:
receipt.head()

Unnamed: 0,receipt_id,user_id,purchase_date,date_scanned,create_date,finished_date,modify_date,rewards_receipt_status,bonus_points_earned,points_awarded_date,points_earned,purchased_item_count,total_spent,bonus_points_earned_reason
0,5ff1e1eb0a720f0523000575,,1609632000000.0,1609687531000,1609687531000,1609688000000.0,1609687536000,FINISHED,500,1609688000000.0,500.0,5,26.0,"Receipt number 2 completed, bonus point schedu..."
1,5ff1e1bb0a720f052300056b,,1609601000000.0,1609687483000,1609687483000,1609687000000.0,1609687488000,FINISHED,150,1609687000000.0,150.0,2,11.0,"Receipt number 5 completed, bonus point schedu..."
2,5ff1e1f10a720f052300057a,,1609632000000.0,1609687537000,1609687537000,,1609687542000,REJECTED,5,,5.0,1,10.0,All-receipts receipt bonus
3,5ff1e1ee0a7214ada100056f,,1609632000000.0,1609687534000,1609687534000,1609688000000.0,1609687539000,FINISHED,5,1609688000000.0,5.0,4,28.0,All-receipts receipt bonus
4,5ff1e1d20a7214ada1000561,,1609601000000.0,1609687506000,1609687506000,1609688000000.0,1609687511000,FINISHED,5,1609688000000.0,5.0,2,1.0,All-receipts receipt bonus


In [16]:
######################## RECEIPT DATA NORMALIZE & CLEANING ########################

In [17]:
receipt.dtypes

receipt_id                     object
user_id                       float64
purchase_date                 float64
date_scanned                    int64
create_date                     int64
finished_date                 float64
modify_date                     int64
rewards_receipt_status         object
bonus_points_earned             int64
points_awarded_date           float64
points_earned                 float64
purchased_item_count            int64
total_spent                   float64
bonus_points_earned_reason     object
dtype: object

In [18]:
# Convert objects to strings
receipt = receipt.astype({col: "string" for col in receipt.select_dtypes(include=["object"]).columns})

In [19]:
# Convert timestamp fields to standardized datetime format
timestamp_cols = ["purchase_date", "date_scanned", "create_date",
                  "finished_date", "modify_date", "points_awarded_date"]

for col in timestamp_cols:
    receipt[col] = pd.to_datetime(receipt[col], unit='ms', errors='coerce', utc=True)

In [20]:
# Convert total_spent to numeric type
receipt["total_spent"] = pd.to_numeric(receipt["total_spent"], errors='coerce')

In [21]:
# Validate cleaning
receipt.dtypes

receipt_id                         string[python]
user_id                                   float64
purchase_date                 datetime64[ns, UTC]
date_scanned                  datetime64[ns, UTC]
create_date                   datetime64[ns, UTC]
finished_date                 datetime64[ns, UTC]
modify_date                   datetime64[ns, UTC]
rewards_receipt_status             string[python]
bonus_points_earned                         int64
points_awarded_date           datetime64[ns, UTC]
points_earned                             float64
purchased_item_count                        int64
total_spent                               float64
bonus_points_earned_reason         string[python]
dtype: object

In [22]:
receipt["points_earned"] = receipt["points_earned"].astype(int)

In [23]:
receipt["user_id"] = receipt["user_id"].astype("string")

In [24]:
# Validate cleaning
receipt.dtypes

receipt_id                         string[python]
user_id                            string[python]
purchase_date                 datetime64[ns, UTC]
date_scanned                  datetime64[ns, UTC]
create_date                   datetime64[ns, UTC]
finished_date                 datetime64[ns, UTC]
modify_date                   datetime64[ns, UTC]
rewards_receipt_status             string[python]
bonus_points_earned                         int64
points_awarded_date           datetime64[ns, UTC]
points_earned                               int64
purchased_item_count                        int64
total_spent                               float64
bonus_points_earned_reason         string[python]
dtype: object

In [25]:
receipt.sort_values(by="date_scanned", ascending=False).head(10)


Unnamed: 0,receipt_id,user_id,purchase_date,date_scanned,create_date,finished_date,modify_date,rewards_receipt_status,bonus_points_earned,points_awarded_date,points_earned,purchased_item_count,total_spent,bonus_points_earned_reason
1095,603d760e0a720fde1000048e,,NaT,2021-03-01 23:17:34.772000+00:00,2021-03-01 23:17:34.772000+00:00,NaT,2021-03-01 23:17:34.772000+00:00,SUBMITTED,0,NaT,0,0,0.0,
1094,603d5d6c0a7217c72c000463,,2020-08-17 00:00:00+00:00,2021-03-01 21:32:28+00:00,2021-03-01 21:32:28+00:00,NaT,2021-03-01 21:32:29+00:00,REJECTED,25,NaT,25,2,34.96,COMPLETE_NONPARTNER_RECEIPT
1096,603d59e70a7217c72c00045f,,2020-08-17 00:00:00+00:00,2021-03-01 21:17:27+00:00,2021-03-01 21:17:27+00:00,NaT,2021-03-01 21:17:28+00:00,REJECTED,25,NaT,25,2,34.96,COMPLETE_NONPARTNER_RECEIPT
1099,603d40250a720fde10000459,,2020-08-17 00:00:00+00:00,2021-03-01 19:27:33+00:00,2021-03-01 19:27:33+00:00,NaT,2021-03-01 19:27:34+00:00,REJECTED,25,NaT,25,2,34.96,COMPLETE_NONPARTNER_RECEIPT
1092,603d30e60a7217c72c00043f,,2020-08-17 00:00:00+00:00,2021-03-01 18:22:30+00:00,2021-03-01 18:22:30+00:00,NaT,2021-03-01 18:22:31+00:00,REJECTED,25,NaT,25,2,34.96,COMPLETE_NONPARTNER_RECEIPT
1107,603d28b60a720fde10000445,,NaT,2021-03-01 17:47:34.867000+00:00,2021-03-01 17:47:34.867000+00:00,NaT,2021-03-01 17:47:34.867000+00:00,SUBMITTED,0,NaT,0,0,0.0,
1115,603d0b710a720fde1000042a,,NaT,2021-03-01 15:42:41.873000+00:00,2021-03-01 15:42:41.873000+00:00,NaT,2021-03-01 15:42:41.873000+00:00,SUBMITTED,0,NaT,0,0,0.0,
1116,603cf5290a720fde10000413,,NaT,2021-03-01 14:07:37.664000+00:00,2021-03-01 14:07:37.664000+00:00,NaT,2021-03-01 14:07:37.664000+00:00,SUBMITTED,0,NaT,0,0,0.0,
1104,603cf2ce0a7217c72c000413,,NaT,2021-03-01 13:57:34.396000+00:00,2021-03-01 13:57:34.396000+00:00,NaT,2021-03-01 13:57:34.396000+00:00,SUBMITTED,0,NaT,0,0,0.0,
1117,603ce7100a7217c72c000405,,2020-08-17 00:00:00+00:00,2021-03-01 13:07:28+00:00,2021-03-01 13:07:28+00:00,NaT,2021-03-01 13:07:29+00:00,REJECTED,25,NaT,25,2,34.96,COMPLETE_NONPARTNER_RECEIPT


In [26]:
######################## VALIDATE RECEIPT DATA ########################


In [27]:
receipt.isnull().sum()


receipt_id                       0
user_id                       1119
purchase_date                  448
date_scanned                     0
create_date                      0
finished_date                  551
modify_date                      0
rewards_receipt_status           0
bonus_points_earned              0
points_awarded_date            582
points_earned                    0
purchased_item_count             0
total_spent                      0
bonus_points_earned_reason       0
dtype: int64

In [28]:
receipt["rewards_receipt_status"].value_counts(dropna=False)


rewards_receipt_status
FINISHED     518
SUBMITTED    434
REJECTED      71
PENDING       50
FLAGGED       46
Name: count, dtype: Int64

In [29]:
######################## RECEIPT DATA ISSUES ########################

In [30]:
# See which records are missing purchase_date
receipt[receipt["purchase_date"].isnull()]

Unnamed: 0,receipt_id,user_id,purchase_date,date_scanned,create_date,finished_date,modify_date,rewards_receipt_status,bonus_points_earned,points_awarded_date,points_earned,purchased_item_count,total_spent,bonus_points_earned_reason
15,5ff1e1e90a7214ada1000569,,NaT,2021-01-03 15:25:29+00:00,2021-01-03 15:25:29+00:00,NaT,2021-01-03 15:25:29+00:00,FLAGGED,0,NaT,0,0,0.0,
71,5ff475820a7214ada10005cf,,NaT,2021-01-05 14:19:46+00:00,2021-01-05 14:19:46+00:00,NaT,2021-01-05 14:19:46+00:00,SUBMITTED,0,NaT,0,0,0.0,
81,5ff4ce3c0a720f05230005c4,,NaT,2021-01-05 20:38:20+00:00,2021-01-05 20:38:20+00:00,NaT,2021-01-05 20:38:20+00:00,FLAGGED,0,NaT,0,0,0.0,
93,5ff5ecb90a7214ada10005f9,,NaT,2021-01-06 17:00:40+00:00,2021-01-06 17:00:40+00:00,NaT,2021-01-06 17:00:40+00:00,SUBMITTED,0,NaT,0,0,0.0,
141,5ff73be90a720f052300060a,,NaT,2021-01-07 16:50:49+00:00,2021-01-07 16:50:49+00:00,NaT,2021-01-07 16:50:49+00:00,FLAGGED,0,NaT,0,0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1110,603c6adf0a720fde1000039a,,NaT,2021-03-01 04:17:35.736000+00:00,2021-03-01 04:17:35.736000+00:00,NaT,2021-03-01 04:17:35.736000+00:00,SUBMITTED,0,NaT,0,0,0.0,
1111,603c9e6e0a720fde100003c7,,NaT,2021-03-01 07:57:34.307000+00:00,2021-03-01 07:57:34.307000+00:00,NaT,2021-03-01 07:57:34.307000+00:00,SUBMITTED,0,NaT,0,0,0.0,
1115,603d0b710a720fde1000042a,,NaT,2021-03-01 15:42:41.873000+00:00,2021-03-01 15:42:41.873000+00:00,NaT,2021-03-01 15:42:41.873000+00:00,SUBMITTED,0,NaT,0,0,0.0,
1116,603cf5290a720fde10000413,,NaT,2021-03-01 14:07:37.664000+00:00,2021-03-01 14:07:37.664000+00:00,NaT,2021-03-01 14:07:37.664000+00:00,SUBMITTED,0,NaT,0,0,0.0,


In [31]:
# Any points earned more than 5000? Is that excessive?
receipt[receipt["points_earned"] > 5000]

Unnamed: 0,receipt_id,user_id,purchase_date,date_scanned,create_date,finished_date,modify_date,rewards_receipt_status,bonus_points_earned,points_awarded_date,points_earned,purchased_item_count,total_spent,bonus_points_earned_reason
12,5ff1e1b60a7214ada100055c,,2021-02-03 15:24:38+00:00,2021-01-03 15:24:38+00:00,2021-01-03 15:24:38+00:00,NaT,2021-01-03 15:24:38+00:00,FLAGGED,150,NaT,8850,10,290.0,"Receipt number 5 completed, bonus point schedu..."
78,5fa5ad370a720f05ef000089,,2020-11-05 20:08:23+00:00,2020-11-06 20:08:23+00:00,2020-11-06 20:08:23+00:00,2021-01-05 20:53:40+00:00,2021-01-05 20:53:40+00:00,FINISHED,750,2021-01-05 20:53:40+00:00,9449,11,291.0,"Receipt number 1 completed, bonus point schedu..."
131,5ff7945a0a7214ada1000646,,2021-01-05 12:00:00+00:00,2021-01-05 17:08:10+00:00,2021-01-05 17:08:10+00:00,2021-01-07 17:08:14+00:00,2021-01-07 23:08:16+00:00,FINISHED,750,2021-01-07 17:08:14+00:00,5750,4,27.0,"Receipt number 1 completed, bonus point schedu..."
139,5ff73be10a7214ada1000619,,2021-02-07 16:50:41+00:00,2021-01-07 16:50:41+00:00,2021-01-07 16:50:41+00:00,NaT,2021-01-07 16:50:41+00:00,FLAGGED,0,NaT,8700,10,290.0,
158,5ff873f10a720f052300064f,,2021-02-08 15:02:10+00:00,2021-01-08 15:02:09+00:00,2021-01-08 15:02:09+00:00,NaT,2021-01-08 15:02:10+00:00,FLAGGED,500,NaT,9200,10,290.0,"Receipt number 2 completed, bonus point schedu..."
190,5ffcb4900a720f0515000002,,2021-02-11 20:26:56+00:00,2021-01-11 20:26:56+00:00,2021-01-11 20:26:56+00:00,NaT,2021-01-11 20:26:57+00:00,FLAGGED,250,NaT,8950,10,290.0,"Receipt number 3 completed, bonus point schedu..."
265,5fff26f10a7214ad4c000018,,2021-02-13 16:59:29+00:00,2021-01-13 16:59:29+00:00,2021-01-13 16:59:29+00:00,NaT,2021-01-13 16:59:29+00:00,FLAGGED,0,NaT,8700,10,290.0,
294,6000d4bc0a7214ad4c000070,,2021-02-14 23:33:17+00:00,2021-01-14 23:33:16+00:00,2021-01-14 23:33:16+00:00,NaT,2021-01-14 23:33:17+00:00,FLAGGED,0,NaT,8700,10,290.0,
361,60088d5d0a7214ad890000ed,,2021-01-20 00:00:00+00:00,2021-01-20 20:06:53+00:00,2021-01-20 20:06:53+00:00,2021-01-20 20:06:53+00:00,2021-01-20 20:06:58+00:00,FINISHED,750,2021-01-20 20:06:53+00:00,5850,1,21.0,"Receipt number 1 completed, bonus point schedu..."
374,60088d580a7214ad890000eb,,2021-01-20 00:00:00+00:00,2021-01-20 20:06:48+00:00,2021-01-20 20:06:48+00:00,2021-01-20 20:06:49+00:00,2021-01-20 20:06:54+00:00,FINISHED,750,2021-01-20 20:06:49+00:00,9850,7,26.0,"Receipt number 1 completed, bonus point schedu..."


In [32]:
# Total spent more than 1000? Is that excessive?
receipt[receipt["total_spent"] > 1000]

Unnamed: 0,receipt_id,user_id,purchase_date,date_scanned,create_date,finished_date,modify_date,rewards_receipt_status,bonus_points_earned,points_awarded_date,points_earned,purchased_item_count,total_spent,bonus_points_earned_reason
314,60025cb80a720f05f300008d,,2021-01-15 00:00:00+00:00,2021-01-16 03:25:44+00:00,2021-01-16 03:25:44+00:00,2021-01-16 03:31:55+00:00,2021-01-16 03:32:05+00:00,FINISHED,750,2021-01-16 03:31:55+00:00,1658,335,1177.84,"Receipt number 1 completed, bonus point schedu..."
318,600260210a720f05f300008f,,2021-01-15 00:00:00+00:00,2021-01-16 03:40:17+00:00,2021-01-16 03:40:17+00:00,2021-01-16 03:47:26+00:00,2021-01-16 03:47:31+00:00,FINISHED,750,2021-01-16 03:47:26+00:00,3659,309,1043.18,"Receipt number 1 completed, bonus point schedu..."
407,60099c3c0a7214ad89000135,,2021-01-21 00:00:00+00:00,2021-01-21 15:22:36+00:00,2021-01-21 15:22:36+00:00,2021-01-21 15:28:37+00:00,2021-01-21 15:31:10+00:00,FINISHED,750,2021-01-21 15:28:37+00:00,4480,341,1083.24,"Receipt number 1 completed, bonus point schedu..."
419,600996ac0a720f05fa000134,,2021-01-17 00:00:00+00:00,2021-01-21 14:58:52+00:00,2021-01-21 14:58:52+00:00,2021-01-21 15:11:25+00:00,2021-01-21 15:16:08+00:00,FINISHED,750,2021-01-21 15:11:25+00:00,6257,348,1198.68,"Receipt number 1 completed, bonus point schedu..."
423,600a1a8d0a7214ada2000008,,2021-01-21 00:00:00+00:00,2021-01-22 00:21:32+00:00,2021-01-22 00:21:32+00:00,2021-01-22 00:30:13+00:00,2021-01-22 00:31:04+00:00,FINISHED,750,2021-01-22 00:30:13+00:00,1178,214,1183.1,"Receipt number 1 completed, bonus point schedu..."
431,600ba6ae0a7214ada2000010,,2021-01-22 00:00:00+00:00,2021-01-23 04:31:42+00:00,2021-01-23 04:31:42+00:00,2021-01-23 04:39:28+00:00,2021-01-23 04:41:47+00:00,FINISHED,750,2021-01-23 04:39:28+00:00,1044,136,1107.82,"Receipt number 1 completed, bonus point schedu..."
446,600f24970a720f053500002f,,2021-01-25 00:00:00+00:00,2021-01-25 20:05:43+00:00,2021-01-25 20:05:43+00:00,NaT,2021-01-25 20:25:25+00:00,FLAGGED,0,NaT,0,599,4368.8,
447,600f0cc70a720f053500002c,,2021-01-25 00:00:00+00:00,2021-01-25 18:24:07+00:00,2021-01-25 18:24:07+00:00,NaT,2021-01-25 18:38:07+00:00,FLAGGED,0,NaT,0,303,2084.82,
469,600f39c30a7214ada2000030,,2021-01-24 00:00:00+00:00,2021-01-25 21:36:03+00:00,2021-01-25 21:36:03+00:00,2021-01-26 03:06:03+00:00,2021-01-26 03:07:40+00:00,FINISHED,750,2021-01-26 03:06:03+00:00,7137,670,4721.95,"Receipt number 1 completed, bonus point schedu..."
543,600f2fc80a720f0535000030,,2021-01-24 00:00:00+00:00,2021-01-25 20:53:28+00:00,2021-01-25 20:53:28+00:00,2021-01-25 22:04:23+00:00,2021-01-28 22:37:02+00:00,FINISHED,750,2021-01-25 22:04:23+00:00,4944,689,4566.17,"Receipt number 1 completed, bonus point schedu..."


In [33]:
# Check for dup receipts
receipt[receipt.duplicated(subset=["receipt_id"], keep=False)]

Unnamed: 0,receipt_id,user_id,purchase_date,date_scanned,create_date,finished_date,modify_date,rewards_receipt_status,bonus_points_earned,points_awarded_date,points_earned,purchased_item_count,total_spent,bonus_points_earned_reason


In [34]:
# Are there users from the receipt table that are NOT in the user table?
receipt[~receipt["user_id"].isin(users["user_id"])]

Unnamed: 0,receipt_id,user_id,purchase_date,date_scanned,create_date,finished_date,modify_date,rewards_receipt_status,bonus_points_earned,points_awarded_date,points_earned,purchased_item_count,total_spent,bonus_points_earned_reason
0,5ff1e1eb0a720f0523000575,,2021-01-03 00:00:00+00:00,2021-01-03 15:25:31+00:00,2021-01-03 15:25:31+00:00,2021-01-03 15:25:31+00:00,2021-01-03 15:25:36+00:00,FINISHED,500,2021-01-03 15:25:31+00:00,500,5,26.00,"Receipt number 2 completed, bonus point schedu..."
1,5ff1e1bb0a720f052300056b,,2021-01-02 15:24:43+00:00,2021-01-03 15:24:43+00:00,2021-01-03 15:24:43+00:00,2021-01-03 15:24:43+00:00,2021-01-03 15:24:48+00:00,FINISHED,150,2021-01-03 15:24:43+00:00,150,2,11.00,"Receipt number 5 completed, bonus point schedu..."
2,5ff1e1f10a720f052300057a,,2021-01-03 00:00:00+00:00,2021-01-03 15:25:37+00:00,2021-01-03 15:25:37+00:00,NaT,2021-01-03 15:25:42+00:00,REJECTED,5,NaT,5,1,10.00,All-receipts receipt bonus
3,5ff1e1ee0a7214ada100056f,,2021-01-03 00:00:00+00:00,2021-01-03 15:25:34+00:00,2021-01-03 15:25:34+00:00,2021-01-03 15:25:34+00:00,2021-01-03 15:25:39+00:00,FINISHED,5,2021-01-03 15:25:34+00:00,5,4,28.00,All-receipts receipt bonus
4,5ff1e1d20a7214ada1000561,,2021-01-02 15:25:06+00:00,2021-01-03 15:25:06+00:00,2021-01-03 15:25:06+00:00,2021-01-03 15:25:11+00:00,2021-01-03 15:25:11+00:00,FINISHED,5,2021-01-03 15:25:06+00:00,5,2,1.00,All-receipts receipt bonus
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1114,603cc0630a720fde100003e6,,2020-08-17 00:00:00+00:00,2021-03-01 10:22:27+00:00,2021-03-01 10:22:27+00:00,NaT,2021-03-01 10:22:28+00:00,REJECTED,25,NaT,25,2,34.96,COMPLETE_NONPARTNER_RECEIPT
1115,603d0b710a720fde1000042a,,NaT,2021-03-01 15:42:41.873000+00:00,2021-03-01 15:42:41.873000+00:00,NaT,2021-03-01 15:42:41.873000+00:00,SUBMITTED,0,NaT,0,0,0.00,
1116,603cf5290a720fde10000413,,NaT,2021-03-01 14:07:37.664000+00:00,2021-03-01 14:07:37.664000+00:00,NaT,2021-03-01 14:07:37.664000+00:00,SUBMITTED,0,NaT,0,0,0.00,
1117,603ce7100a7217c72c000405,,2020-08-17 00:00:00+00:00,2021-03-01 13:07:28+00:00,2021-03-01 13:07:28+00:00,NaT,2021-03-01 13:07:29+00:00,REJECTED,25,NaT,25,2,34.96,COMPLETE_NONPARTNER_RECEIPT


In [35]:
# Are there any purchased_date that occur after the date_scanned? This should not occur.
receipt[receipt["purchase_date"] > receipt["date_scanned"]]

Unnamed: 0,receipt_id,user_id,purchase_date,date_scanned,create_date,finished_date,modify_date,rewards_receipt_status,bonus_points_earned,points_awarded_date,points_earned,purchased_item_count,total_spent,bonus_points_earned_reason
12,5ff1e1b60a7214ada100055c,,2021-02-03 15:24:38+00:00,2021-01-03 15:24:38+00:00,2021-01-03 15:24:38+00:00,NaT,2021-01-03 15:24:38+00:00,FLAGGED,150,NaT,8850,10,290.0,"Receipt number 5 completed, bonus point schedu..."
14,5ff1e1b20a7214ada100055a,,2021-02-03 15:24:35+00:00,2021-01-03 15:24:34+00:00,2021-01-03 15:24:34+00:00,2021-01-03 15:24:35+00:00,2021-01-03 15:24:35+00:00,FINISHED,300,2021-01-03 15:24:35+00:00,300,1,1.0,"Receipt number 4 completed, bonus point schedu..."
85,5ff4ce640a7214ada10005e0,,2021-02-05 20:39:00+00:00,2021-01-05 20:39:00+00:00,2021-01-05 20:39:00+00:00,2021-01-05 20:39:00+00:00,2021-01-05 20:39:00+00:00,FINISHED,25,2021-01-05 20:39:00+00:00,25,1,1.0,COMPLETE_NONPARTNER_RECEIPT
139,5ff73be10a7214ada1000619,,2021-02-07 16:50:41+00:00,2021-01-07 16:50:41+00:00,2021-01-07 16:50:41+00:00,NaT,2021-01-07 16:50:41+00:00,FLAGGED,0,NaT,8700,10,290.0,
158,5ff873f10a720f052300064f,,2021-02-08 15:02:10+00:00,2021-01-08 15:02:09+00:00,2021-01-08 15:02:09+00:00,NaT,2021-01-08 15:02:10+00:00,FLAGGED,500,NaT,9200,10,290.0,"Receipt number 2 completed, bonus point schedu..."
190,5ffcb4900a720f0515000002,,2021-02-11 20:26:56+00:00,2021-01-11 20:26:56+00:00,2021-01-11 20:26:56+00:00,NaT,2021-01-11 20:26:57+00:00,FLAGGED,250,NaT,8950,10,290.0,"Receipt number 3 completed, bonus point schedu..."
244,5fff26ee0a720f05f300001a,,2021-02-13 16:59:26+00:00,2021-01-13 16:59:26+00:00,2021-01-13 16:59:26+00:00,2021-01-13 16:59:26+00:00,2021-01-13 16:59:26+00:00,FINISHED,25,2021-01-13 16:59:26+00:00,25,1,1.0,COMPLETE_NONPARTNER_RECEIPT
265,5fff26f10a7214ad4c000018,,2021-02-13 16:59:29+00:00,2021-01-13 16:59:29+00:00,2021-01-13 16:59:29+00:00,NaT,2021-01-13 16:59:29+00:00,FLAGGED,0,NaT,8700,10,290.0,
294,6000d4bc0a7214ad4c000070,,2021-02-14 23:33:17+00:00,2021-01-14 23:33:16+00:00,2021-01-14 23:33:16+00:00,NaT,2021-01-14 23:33:17+00:00,FLAGGED,0,NaT,8700,10,290.0,
362,600887560a720f05fa000098,,2021-02-20 19:41:10+00:00,2021-01-20 19:41:10+00:00,2021-01-20 19:41:10+00:00,2021-01-20 19:41:11+00:00,2021-01-20 19:41:11+00:00,FINISHED,250,2021-01-20 19:41:11+00:00,250,1,1.0,"Receipt number 3 completed, bonus point schedu..."


In [36]:
# Are there any points awarded where the award date is null?
receipt[(receipt["points_awarded_date"].isnull()) & (receipt["points_earned"] > 0)]

Unnamed: 0,receipt_id,user_id,purchase_date,date_scanned,create_date,finished_date,modify_date,rewards_receipt_status,bonus_points_earned,points_awarded_date,points_earned,purchased_item_count,total_spent,bonus_points_earned_reason
2,5ff1e1f10a720f052300057a,,2021-01-03 00:00:00+00:00,2021-01-03 15:25:37+00:00,2021-01-03 15:25:37+00:00,NaT,2021-01-03 15:25:42+00:00,REJECTED,5,NaT,5,1,10.00,All-receipts receipt bonus
12,5ff1e1b60a7214ada100055c,,2021-02-03 15:24:38+00:00,2021-01-03 15:24:38+00:00,2021-01-03 15:24:38+00:00,NaT,2021-01-03 15:24:38+00:00,FLAGGED,150,NaT,8850,10,290.00,"Receipt number 5 completed, bonus point schedu..."
62,5ff4a4ca0a7214ada10005d0,,2020-09-13 00:00:00+00:00,2021-01-05 17:41:30+00:00,2021-01-05 17:41:30+00:00,NaT,2021-01-05 17:41:32+00:00,REJECTED,750,NaT,750,2,34.96,"Receipt number 1 completed, bonus point schedu..."
129,5ff794bf0a7214ada1000650,,2021-01-05 23:09:51+00:00,2021-01-06 23:09:51+00:00,2021-01-06 23:09:51+00:00,NaT,2021-01-07 17:09:53+00:00,FLAGGED,25,NaT,25,1,1.00,COMPLETE_NONPARTNER_RECEIPT
139,5ff73be10a7214ada1000619,,2021-02-07 16:50:41+00:00,2021-01-07 16:50:41+00:00,2021-01-07 16:50:41+00:00,NaT,2021-01-07 16:50:41+00:00,FLAGGED,0,NaT,8700,10,290.00,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1106,603c7c6c0a7217c72c0003b3,,2020-08-17 00:00:00+00:00,2021-03-01 05:32:28+00:00,2021-03-01 05:32:28+00:00,NaT,2021-03-01 05:32:29+00:00,REJECTED,25,NaT,25,2,34.96,COMPLETE_NONPARTNER_RECEIPT
1112,603c3d240a720fde10000373,,2020-08-17 00:00:00+00:00,2021-03-01 01:02:28+00:00,2021-03-01 01:02:28+00:00,NaT,2021-03-01 01:02:29+00:00,REJECTED,25,NaT,25,2,34.96,COMPLETE_NONPARTNER_RECEIPT
1113,603cc2bc0a720fde100003e9,,2020-08-17 00:00:00+00:00,2021-03-01 10:32:28+00:00,2021-03-01 10:32:28+00:00,NaT,2021-03-01 10:32:29+00:00,REJECTED,25,NaT,25,2,34.96,COMPLETE_NONPARTNER_RECEIPT
1114,603cc0630a720fde100003e6,,2020-08-17 00:00:00+00:00,2021-03-01 10:22:27+00:00,2021-03-01 10:22:27+00:00,NaT,2021-03-01 10:22:28+00:00,REJECTED,25,NaT,25,2,34.96,COMPLETE_NONPARTNER_RECEIPT


In [37]:
# Identify receipts that have the same user_id and purchase_date.
# This could indicate duplicate receipt submissions by the same user.
receipt[receipt.duplicated(subset=["user_id", "purchase_date"], keep=False)]

Unnamed: 0,receipt_id,user_id,purchase_date,date_scanned,create_date,finished_date,modify_date,rewards_receipt_status,bonus_points_earned,points_awarded_date,points_earned,purchased_item_count,total_spent,bonus_points_earned_reason
0,5ff1e1eb0a720f0523000575,,2021-01-03 00:00:00+00:00,2021-01-03 15:25:31+00:00,2021-01-03 15:25:31+00:00,2021-01-03 15:25:31+00:00,2021-01-03 15:25:36+00:00,FINISHED,500,2021-01-03 15:25:31+00:00,500,5,26.00,"Receipt number 2 completed, bonus point schedu..."
2,5ff1e1f10a720f052300057a,,2021-01-03 00:00:00+00:00,2021-01-03 15:25:37+00:00,2021-01-03 15:25:37+00:00,NaT,2021-01-03 15:25:42+00:00,REJECTED,5,NaT,5,1,10.00,All-receipts receipt bonus
3,5ff1e1ee0a7214ada100056f,,2021-01-03 00:00:00+00:00,2021-01-03 15:25:34+00:00,2021-01-03 15:25:34+00:00,2021-01-03 15:25:34+00:00,2021-01-03 15:25:39+00:00,FINISHED,5,2021-01-03 15:25:34+00:00,5,4,28.00,All-receipts receipt bonus
8,5ff1e1ed0a7214ada100056e,,2021-01-03 00:00:00+00:00,2021-01-03 15:25:33+00:00,2021-01-03 15:25:33+00:00,2021-01-03 15:25:34+00:00,2021-01-03 15:25:38+00:00,FINISHED,5,2021-01-03 15:25:34+00:00,5,5,20.00,All-receipts receipt bonus
9,5ff1e1eb0a7214ada100056b,,2021-01-03 00:00:00+00:00,2021-01-03 15:25:31+00:00,2021-01-03 15:25:31+00:00,2021-01-03 15:25:31+00:00,2021-01-03 15:25:36+00:00,FINISHED,250,2021-01-03 15:25:31+00:00,250,3,20.00,"Receipt number 3 completed, bonus point schedu..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1114,603cc0630a720fde100003e6,,2020-08-17 00:00:00+00:00,2021-03-01 10:22:27+00:00,2021-03-01 10:22:27+00:00,NaT,2021-03-01 10:22:28+00:00,REJECTED,25,NaT,25,2,34.96,COMPLETE_NONPARTNER_RECEIPT
1115,603d0b710a720fde1000042a,,NaT,2021-03-01 15:42:41.873000+00:00,2021-03-01 15:42:41.873000+00:00,NaT,2021-03-01 15:42:41.873000+00:00,SUBMITTED,0,NaT,0,0,0.00,
1116,603cf5290a720fde10000413,,NaT,2021-03-01 14:07:37.664000+00:00,2021-03-01 14:07:37.664000+00:00,NaT,2021-03-01 14:07:37.664000+00:00,SUBMITTED,0,NaT,0,0,0.00,
1117,603ce7100a7217c72c000405,,2020-08-17 00:00:00+00:00,2021-03-01 13:07:28+00:00,2021-03-01 13:07:28+00:00,NaT,2021-03-01 13:07:29+00:00,REJECTED,25,NaT,25,2,34.96,COMPLETE_NONPARTNER_RECEIPT


In [38]:
# Identify receipts where the total spent is greater than 0 but the purchased item count is 0.
# This could indicate a data issue, as a receipt with a positive total should typically have at least one item purchased.
receipt[(receipt["purchased_item_count"] == 0) & (receipt["total_spent"] > 0)]

Unnamed: 0,receipt_id,user_id,purchase_date,date_scanned,create_date,finished_date,modify_date,rewards_receipt_status,bonus_points_earned,points_awarded_date,points_earned,purchased_item_count,total_spent,bonus_points_earned_reason
210,5ffe1cbe0a7214ad28002843,,2021-01-12 00:00:00+00:00,2021-01-12 22:03:42+00:00,2021-01-12 22:03:42+00:00,2021-01-12 22:03:42+00:00,2021-01-12 22:03:42+00:00,PENDING,0,NaT,0,0,28.57,
211,5ffce7db0a720f051500236e,,2021-01-12 00:00:00+00:00,2021-01-12 00:05:47+00:00,2021-01-12 00:05:47+00:00,2021-01-12 00:05:49+00:00,2021-01-12 00:05:47+00:00,PENDING,0,NaT,0,0,28.57,
213,5ffce8310a7214ad4e003797,,2021-01-12 00:00:00+00:00,2021-01-12 00:07:13+00:00,2021-01-12 00:07:13+00:00,2021-01-12 00:07:14+00:00,2021-01-12 00:07:13+00:00,PENDING,0,NaT,0,0,28.57,
214,5ffe19d90a7214ad28000e62,,2021-01-12 00:00:00+00:00,2021-01-12 21:51:21+00:00,2021-01-12 21:51:21+00:00,2021-01-12 21:51:21+00:00,2021-01-12 21:51:21+00:00,PENDING,0,NaT,0,0,28.57,
215,5ffe1d030a720f05ac002c9e,,2021-01-12 00:00:00+00:00,2021-01-12 22:04:51+00:00,2021-01-12 22:04:51+00:00,2021-01-12 22:04:51+00:00,2021-01-12 22:04:51+00:00,PENDING,0,NaT,0,0,28.57,
218,5ffce76e0a720f0515000b48,,2021-01-12 00:00:00+00:00,2021-01-12 00:03:58+00:00,2021-01-12 00:03:58+00:00,2021-01-12 00:04:00+00:00,2021-01-12 00:03:58+00:00,PENDING,0,NaT,0,0,28.57,
219,5ffe23560a720f05ac006874,,2021-01-12 00:00:00+00:00,2021-01-12 22:31:50+00:00,2021-01-12 22:31:50+00:00,2021-01-12 22:31:50+00:00,2021-01-12 22:31:50+00:00,PENDING,0,NaT,0,0,28.57,
220,5ffe22a20a720f05ac0061d7,,2021-01-12 00:00:00+00:00,2021-01-12 22:28:50+00:00,2021-01-12 22:28:50+00:00,2021-01-12 22:28:50+00:00,2021-01-12 22:28:50+00:00,PENDING,0,NaT,0,0,28.57,
221,5ffe1dc20a7214ad28003180,,2021-01-12 00:00:00+00:00,2021-01-12 22:08:02+00:00,2021-01-12 22:08:02+00:00,2021-01-12 22:08:03+00:00,2021-01-12 22:08:02+00:00,PENDING,0,NaT,0,0,28.57,
224,5ffe1aa90a7214ad280015e8,,2021-01-12 00:00:00+00:00,2021-01-12 21:54:49+00:00,2021-01-12 21:54:49+00:00,2021-01-12 21:54:49+00:00,2021-01-12 21:54:49+00:00,PENDING,0,NaT,0,0,28.57,


In [39]:
import os

# Create output directory
output_dir = "/Users/evro/Documents/code/python/fetch/data/validated"
os.makedirs(output_dir, exist_ok=True)

# Save as pickle for debugging
output_file_pkl = os.path.join(output_dir, "receipts.pkl")
with open(output_file_pkl, "wb") as f:
    pickle.dump(receipt, f)