In [1]:
import csv
import pandas as pd
import re
import sqlite3
from sqlite3 import Error
import ast 
import math
import os
import json
from datetime import datetime

In [2]:
# Load the three raw exports; lines=True reads each file as newline-delimited
# JSON (one record per line).
receipts_df, brands_df, users_df = (
    pd.read_json(json_path, lines=True)
    for json_path in ('receipts.json', 'brands.json', 'users.json')
)

### Clean Receipts
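- 1): Unwrap the `{'$oid': ...}` dictionaries in the id columns (`_id`, `userId`) so they hold the plain id value
- 2): Convert the `{'$date': <epoch milliseconds>}` dictionaries in the date columns to datetimes
- 3): Coerce the numeric columns to a numeric dtype (values that cannot be parsed become NaN)
- 4): Save the cleaned data as a UTF-8 encoded CSV for the schema design in the next step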

In [3]:
def clean_receipts_dataset(df, output_path='cleaned_receipts.csv'):
    """Return a cleaned copy of the raw receipts frame and optionally save it as CSV.

    Cleaning steps:
      * ``_id`` / ``userId``: unwrap ``{'$oid': ...}`` dicts to the bare id value.
      * Date columns: convert ``{'$date': <epoch ms>}`` dicts to ``datetime``.
      * Numeric columns: coerce with ``pd.to_numeric``; unparseable values
        become NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw receipts data. Must contain ``_id`` and ``userId``; the date and
        numeric columns are cleaned only when present. The input frame is
        left unmodified.
    output_path : str, path-like or None, default 'cleaned_receipts.csv'
        Destination of the UTF-8 CSV export (written without the index).
        Pass ``None`` to skip writing.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.

    Raises
    ------
    KeyError
        If ``_id`` or ``userId`` is missing from ``df``.
    """
    # Work on a copy: cleaning in place would overwrite the caller's raw
    # frame, so the raw data could no longer be inspected after this cell ran.
    df = df.copy()

    def unwrap_oid(value):
        # Values that are not {'$oid': ...} dicts (already clean, NaN) pass through.
        if isinstance(value, dict) and '$oid' in value:
            return value['$oid']
        return value

    def unwrap_date(value):
        # '$date' is divided by 1000 to go from epoch milliseconds to seconds.
        # NOTE: datetime.fromtimestamp returns naive *local* time, so the
        # resulting values depend on the timezone of the machine running this.
        if isinstance(value, dict) and '$date' in value:
            return datetime.fromtimestamp(value['$date'] / 1000)
        return value

    for id_col in ('_id', 'userId'):
        df[id_col] = df[id_col].apply(unwrap_oid)

    # Clean date columns
    date_columns = ['createDate', 'dateScanned', 'finishedDate', 'modifyDate', 'pointsAwardedDate', 'purchaseDate']
    for col in date_columns:
        if col in df.columns:
            df[col] = df[col].apply(unwrap_date)

    # Clean numeric columns
    numeric_columns = ['bonusPointsEarned', 'pointsEarned', 'purchasedItemCount', 'totalSpent']
    for col in numeric_columns:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    if output_path is not None:
        df.to_csv(output_path, index=False, encoding='utf-8')
    return df

# Run the receipts cleaning step (this also writes cleaned_receipts.csv) and
# preview the first rows of the result.
cleaned_receipts_df = clean_receipts_dataset(receipts_df)
cleaned_receipts_df.head()

Unnamed: 0,_id,bonusPointsEarned,bonusPointsEarnedReason,createDate,dateScanned,finishedDate,modifyDate,pointsAwardedDate,pointsEarned,purchaseDate,purchasedItemCount,rewardsReceiptItemList,rewardsReceiptStatus,totalSpent,userId
0,5ff1e1eb0a720f0523000575,500.0,"Receipt number 2 completed, bonus point schedu...",2021-01-03 09:25:31,2021-01-03 09:25:31,2021-01-03 09:25:31,2021-01-03 09:25:36,2021-01-03 09:25:31,500.0,2021-01-02 18:00:00,5.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,26.0,5ff1e1eacfcf6c399c274ae6
1,5ff1e1bb0a720f052300056b,150.0,"Receipt number 5 completed, bonus point schedu...",2021-01-03 09:24:43,2021-01-03 09:24:43,2021-01-03 09:24:43,2021-01-03 09:24:48,2021-01-03 09:24:43,150.0,2021-01-02 09:24:43,2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,11.0,5ff1e194b6a9d73a3a9f1052
2,5ff1e1f10a720f052300057a,5.0,All-receipts receipt bonus,2021-01-03 09:25:37,2021-01-03 09:25:37,NaT,2021-01-03 09:25:42,NaT,5.0,2021-01-02 18:00:00,1.0,"[{'needsFetchReview': False, 'partnerItemId': ...",REJECTED,10.0,5ff1e1f1cfcf6c399c274b0b
3,5ff1e1ee0a7214ada100056f,5.0,All-receipts receipt bonus,2021-01-03 09:25:34,2021-01-03 09:25:34,2021-01-03 09:25:34,2021-01-03 09:25:39,2021-01-03 09:25:34,5.0,2021-01-02 18:00:00,4.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,28.0,5ff1e1eacfcf6c399c274ae6
4,5ff1e1d20a7214ada1000561,5.0,All-receipts receipt bonus,2021-01-03 09:25:06,2021-01-03 09:25:06,2021-01-03 09:25:11,2021-01-03 09:25:11,2021-01-03 09:25:06,5.0,2021-01-02 09:25:06,2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,1.0,5ff1e194b6a9d73a3a9f1052


### Clean Users
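- 1): Unwrap the `{'$oid': ...}` dictionaries in `_id` so the column holds the plain id value
- 2): Convert the `{'$date': <epoch milliseconds>}` dictionaries in the date columns (`createdDate`, `lastLogin`) to datetimes
- 3): Convert the `active` column from True/False to 1/0
- 4): Remove duplicated users, keeping one row per `_id`
- 5): Save the cleaned data as a UTF-8 encoded CSV for the schema design in the next step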

In [4]:
def clean_users_dataset(df, output_path='cleaned_users.csv'):
    """Return a cleaned, de-duplicated copy of the raw users frame and optionally save it.

    Cleaning steps:
      * ``_id``: unwrap ``{'$oid': ...}`` dicts to the bare id value.
      * ``createdDate`` / ``lastLogin``: convert ``{'$date': <epoch ms>}``
        dicts to ``datetime`` (only for columns that are present).
      * ``active``: cast to ``int`` (True/False -> 1/0).
      * Drop rows whose ``_id`` repeats, keeping the first occurrence. The
        original index labels of the surviving rows are preserved.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw users data. Must contain ``_id`` and ``active``. The input frame
        is left unmodified.
    output_path : str, path-like or None, default 'cleaned_users.csv'
        Destination of the UTF-8 CSV export (written without the index).
        Pass ``None`` to skip writing.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.

    Raises
    ------
    KeyError
        If ``_id`` or ``active`` is missing from ``df``.
    """
    # Work on a copy: the column assignments below would otherwise overwrite
    # the caller's raw frame even though a different frame is returned.
    df = df.copy()

    def unwrap_oid(value):
        # Values that are not {'$oid': ...} dicts (already clean, NaN) pass through.
        if isinstance(value, dict) and '$oid' in value:
            return value['$oid']
        return value

    def unwrap_date(value):
        # '$date' is divided by 1000 to go from epoch milliseconds to seconds.
        # NOTE: datetime.fromtimestamp returns naive *local* time, so the
        # resulting values depend on the timezone of the machine running this.
        if isinstance(value, dict) and '$date' in value:
            return datetime.fromtimestamp(value['$date'] / 1000)
        return value

    df['_id'] = df['_id'].apply(unwrap_oid)

    # Clean date columns
    date_columns = ['createdDate', 'lastLogin']
    for col in date_columns:
        if col in df.columns:
            df[col] = df[col].apply(unwrap_date)

    df['active'] = df['active'].astype(int)
    # De-duplicate only after _id is unwrapped: the raw dict values are not hashable.
    df = df.drop_duplicates(subset=['_id'])

    if output_path is not None:
        df.to_csv(output_path, index=False, encoding='utf-8')
    return df

# Run the users cleaning step (this also writes cleaned_users.csv) and
# preview the first rows of the result.
cleaned_users_df = clean_users_dataset(users_df)
cleaned_users_df.head()

Unnamed: 0,_id,active,createdDate,lastLogin,role,signUpSource,state
0,5ff1e194b6a9d73a3a9f1052,1,2021-01-03 09:24:04.800,2021-01-03 09:25:37.858,consumer,Email,WI
3,5ff1e1eacfcf6c399c274ae6,1,2021-01-03 09:25:30.554,2021-01-03 09:25:30.597,consumer,Email,WI
6,5ff1e1e8cfcf6c399c274ad9,1,2021-01-03 09:25:28.354,2021-01-03 09:25:28.392,consumer,Email,WI
7,5ff1e1b7cfcf6c399c274a5a,1,2021-01-03 09:24:39.626,2021-01-03 09:24:39.665,consumer,Email,WI
9,5ff1e1f1cfcf6c399c274b0b,1,2021-01-03 09:25:37.564,2021-01-03 09:25:37.599,consumer,Email,WI


### Clean Brands
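- 1): Unwrap the `{'$oid': ...}` dictionaries in `_id` so the column holds the plain id value
- 2): Replace each `cpg` reference with the id nested under its `$id` -> `$oid` keys
- 3): Save the cleaned data as a UTF-8 encoded CSV for the schema design in the next step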

In [5]:
def clean_brands_dataset(df, output_path='cleaned_brands.csv'):
    """Return a cleaned copy of the raw brands frame and optionally save it as CSV.

    Cleaning steps:
      * ``_id``: unwrap ``{'$oid': ...}`` dicts to the bare id value.
      * ``cpg``: replace each ``{'$id': {'$oid': ...}, ...}`` reference with
        the nested id. Values that are already plain strings are kept, so
        cleaning an already-cleaned frame does not erase the column; any
        other value becomes ``None``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw brands data. Must contain ``_id`` and ``cpg``. The input frame is
        left unmodified.
    output_path : str, path-like or None, default 'cleaned_brands.csv'
        Destination of the UTF-8 CSV export (written without the index).
        Pass ``None`` to skip writing.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.

    Raises
    ------
    KeyError
        If ``_id`` or ``cpg`` is missing from ``df``.
    """
    # Work on a copy: cleaning in place would overwrite the caller's raw
    # frame, and a second run on that frame used to turn every cpg into None.
    df = df.copy()

    df['_id'] = df['_id'].apply(lambda x: x['$oid'] if isinstance(x, dict) and '$oid' in x else x)

    def extract_oid(value):
        # Raw reference: {'$id': {'$oid': <id>}, ...} -> <id>
        if isinstance(value, dict) and '$id' in value and isinstance(value['$id'], dict) and '$oid' in value['$id']:
            return value['$id']['$oid']
        # Already-unwrapped id: keep it so the cleaning is idempotent.
        if isinstance(value, str):
            return value
        # Missing or malformed reference.
        return None

    df['cpg'] = df['cpg'].apply(extract_oid)

    if output_path is not None:
        df.to_csv(output_path, index=False, encoding='utf-8')
    return df

# Run the brands cleaning step (this also writes cleaned_brands.csv) and
# preview the first rows of the result.
cleaned_brands_df = clean_brands_dataset(brands_df)
cleaned_brands_df.head()

Unnamed: 0,_id,barcode,category,categoryCode,cpg,name,topBrand,brandCode
0,601ac115be37ce2ead437551,511111019862,Baking,BAKING,601ac114be37ce2ead437550,test brand @1612366101024,0.0,
1,601c5460be37ce2ead43755f,511111519928,Beverages,BEVERAGES,5332f5fbe4b03c9a25efd0ba,Starbucks,0.0,STARBUCKS
2,601ac142be37ce2ead43755d,511111819905,Baking,BAKING,601ac142be37ce2ead437559,test brand @1612366146176,0.0,TEST BRANDCODE @1612366146176
3,601ac142be37ce2ead43755a,511111519874,Baking,BAKING,601ac142be37ce2ead437559,test brand @1612366146051,0.0,TEST BRANDCODE @1612366146051
4,601ac142be37ce2ead43755e,511111319917,Candy & Sweets,CANDY_AND_SWEETS,5332fa12e4b03c9a25efd1e7,test brand @1612366146827,0.0,TEST BRANDCODE @1612366146827
