Import all packages

In [2]:
import sqlite3
import pandas as pd
import tabulate
# import numpy as np
# import matplotlib.pyplot as plt

Turn the CSV files into SQLlite database tables

In [35]:
conn = sqlite3.connect('fetch.sqlite')
df = pd.read_csv('USER_TAKEHOME.csv')
df.to_sql('users', conn, if_exists='replace', index=False)
df = pd.read_csv('PRODUCTS_TAKEHOME.csv')
df.to_sql('products', conn, if_exists='replace', index=False)
df = pd.read_csv('TRANSACTION_TAKEHOME.csv')
df.to_sql('transactions', conn, if_exists='replace', index=False)
conn.close

<function Connection.close()>

Check data quality and cleanup the databases where necessary

In [5]:
conn = sqlite3.connect('fetch.sqlite')
cur = conn.cursor()

#Users
cur.execute('''
select count(*) as n_rows
    , count(distinct id) as n_users
    , count(distinct case when created_date is not null then id else null end) as n_with_created_dates
    , count(distinct case when birth_date is not null then id else null end) as n_with_birth_dates
    , count(distinct case when state is not null then id else null end) as n_with_state
    , count(distinct case when language is not null then id else null end) as n_with_language
    , count(distinct case when gender is not null then id else null end) as n_with_gender
from users
''')

cur.execute('''
select state, count(*) from users group by state order by count(*) desc
''')

cur.execute('''
select count(distinct state) from users
''')

cur.execute('''
select language, count(*) from users group by language order by count(*) desc
''')

cur.execute('''
select gender, count(*), min(created_date), max(created_date) from users group by gender order by count(*) desc
''')

tabulate.tabulate(cur.fetchall(), tablefmt='html')


0,1,2,3
female,64240,2014-07-09 01:24:05.000 Z,2024-09-11 17:58:04.000 Z
male,25829,2014-09-09 22:03:56.000 Z,2024-09-11 17:56:53.000 Z
,5892,2014-04-18 23:14:55.000 Z,2024-09-11 17:59:15.000 Z
transgender,1772,2017-03-05 19:37:20.000 Z,2022-08-06 21:07:23.000 Z
prefer_not_to_say,1350,2015-07-23 22:28:51.000 Z,2024-09-11 17:52:47.000 Z
non_binary,473,2018-11-28 23:02:36.000 Z,2024-09-11 17:53:02.000 Z
unknown,196,2021-01-12 19:03:07.000 Z,2021-08-27 03:47:51.000 Z
not_listed,180,2021-08-28 22:23:02.000 Z,2024-09-09 04:18:00.000 Z
Non-Binary,34,2022-05-15 22:17:26.000 Z,2022-06-19 13:44:09.000 Z
not_specified,28,2020-12-15 16:06:04.000 Z,2021-01-23 00:02:40.000 Z


Users has a unique ID and every entry has a created_date. 96% coverage for birth dates. 95% for state. 69% for language and 94% for gender.

Dates are stored as text. I'd normally transform them to timestamps, but sqlite has no native date format and using strings is standard.

Birth Date is stored as a timestamp, but most of the values have no time component. However, some do. For now, I'll just leave this as is.

State contains all 50 states plus PR and DC

Language is mostly 'en', but with a few 'es-419'. These are IETF language tags and mean "English" and "Spanish appropriate for the Latin America and Caribbean region"

Gender has multiple options: female, male, transgender, prefer_not_to_say, non_binary, unknown, not_listed, Non-Binary, 
not_specified, "My gender isn't listed" and 'Prefer not to say'. It appears that options have been added and removed. 'male', 'female' and NULL all have entries created from 2014 through September 2024 (max timestamp). 'transgender' first appears in 2017 and then stops in 2022. 'prefer_not_to_say' first appears in 2015 and continues through the present. 'non_binary' first appears in 2018 and continues through the present. 'unknown' only appears between January and August 2021. 'not_listed' first appears the following day in August 2021 and continues through the present, so it must have replaced 'unknown'. 'Non-Binary' is only seen in May and June 2022, so that might be a bug that was fixed where the value was being recorded as 'Non-Binary' instead of 'non-binary'. 'not_specified' only appears between December 2020 and January 2021, so that appears to have been replaced by 'unknown'. "My gender isn't listed" only appears between April and June of 2022. "Prefer not to say" only has one value on June of 2022. 

Here's how I will modify the values stored in the gender column:
* female: keep
* male: keep
* NULL: keep
* transgender: keep
* prefer_not_to_say: keep
* non_binary: keep
* unknown: merged into not_listed
* not_listed: keep
* Non-Binary: merged into 'non_binary'
* not_specified: merged into 'not_listed'
* My gender isn't listed: merged into 'not_listed'
* Prefer not to say: merged into prefer_not_to_say


In [36]:
# users Cleanup

conn = sqlite3.connect('fetch.sqlite')
cur = conn.cursor()

# rename created_date to created_at because it's a timestamp, not a date
cur.execute('''
alter table users rename column created_date to created_at;
''')

# rename `id` to `user_id` because that's my prefered style for analysis tables
cur.execute('''
alter table users rename column id to user_id;
''')

cur.close
conn.close

<function Connection.close()>

In [37]:
conn = sqlite3.connect('fetch.sqlite')
cur = conn.cursor()


# map values for gender
cur.execute('''
update users set gender = 'not_listed' where gender='unknown'
''')
cur.execute('''
update users set gender = 'not_listed' where gender='not_specified'
''')
cur.execute('''
update users set gender = 'not_listed' where gender='My gender isn''t listed'
''')
cur.execute('''
update users set gender = 'non_binary' where gender='Non-Binary'
''')
cur.execute('''
update users set gender = 'prefer_not_to_say' where gender='Prefer not to say'
''')

# cur.execute('''
# select gender, count(*), min(created_at), max(created_at) from users group by gender order by count(*) desc
# ''')


# tabulate.tabulate(cur.fetchall(), tablefmt='html')

cur.close
conn.close

<function Connection.close()>

Product Exploration

In [33]:
conn = sqlite3.connect('fetch.sqlite')
cur = conn.cursor()

# Product
cur.execute('''
select count(*) as n_rows
    , count(distinct barcode) as n_barcodes
    , sum(case when barcode is not null then 1 else 0 end) n_with_barcodes
    , sum(case when brand is not null then 1 else 0 end) n_with_brand
    , sum(case when manufacturer is not null then 1 else 0 end) n_with_manufacturer
    , sum(case when category_1 is not null then 1 else 0 end) n_with_category_1
    , sum(case when category_2 is not null then 1 else 0 end) n_with_category_2
    , sum(case when category_3 is not null then 1 else 0 end) n_with_category_3
    , sum(case when category_4 is not null then 1 else 0 end) n_with_category_4
from products
''')

cur.execute('''
select brand
    , manufacturer
    , category_1
    , category_2
    , category_3
    , category_4
    , count(*)
from products
    where category_4 is not null
group by brand
    , manufacturer
    , category_1
    , category_2
    , category_3
    , category_4
''')

cur.execute('''
select brand
    , count(*)
from products
    where brand is not null
group by brand
order by count(*) desc
limit 100
''')

cur.execute('''
select manufacturer
    , count(*)
from products
    where manufacturer is not null
group by manufacturer
order by count(*) desc
limit 100
''')

cur.execute('''
select category_1
    , count(*)
from products
    where category_1 is not null
group by category_1
order by count(*) desc
limit 100
''')

cur.execute('''
select category_2
    , count(*)
from products
    where category_2 is not null
group by category_2
order by count(*) desc
limit 10
''')

tabulate.tabulate(cur.fetchall(), tablefmt='html')


0,1
Candy,121036
Hair Care,111482
Medicines & Treatments,99118
Bath & Body,81469
Skin Care,62587
Nuts & Seeds,33522
Cookies,30418
Eye Care,25017
Chips,23728
Oral Care,22514


Most, but not all rows have a barcode. 
Barcodes are not unique. 
Brand and Manufacturer are on about 75% of rows. The most common Manufacturer is Placeholder Manufacturer. Brand Not Known is the second most common brand.
Category 1 and 2 are on almost all rows. Category 3 is on most and Category 4 is on very few. 
The vast majority of Products are in the top two Category_1 values: Health & Wellness and Snacks



In [3]:
#Product Cleanup
#Create a view that is de-duped on barcode, excludes NULL barcodes
conn = sqlite3.connect('fetch.sqlite')
cur = conn.cursor()

cur.execute('''
create view products_clean as 
    with products_rn as 
        (
        select barcode, manufacturer, brand, category_1, category_2, category_3, category_4
        , row_number() over (partition by barcode order by category_1 nulls last, category_2 nulls last, category_3 nulls last) as barcode_rn
        from products
        where barcode is not null
        )
    select barcode, manufacturer, brand, category_1, category_2, category_3, category_4
    from products_rn
    where barcode_rn = 1
''')

conn.close 
cur.close

<function Cursor.close()>

In [3]:
conn = sqlite3.connect('fetch.sqlite')
cur = conn.cursor()

# # Transactions
# cur.execute('''
# select count(*) as n_rows
#     , count(distinct receipt_id) as n_receipts
#     , count(distinct case when purchase_date is not null then receipt_id else null end) as n_with_purchase_dates
#     , count(distinct case when scan_date is not null then receipt_id else null end) as n_with_scan_dates
#     , count(distinct case when store_name is not null then receipt_id else null end) as n_with_store_name
#     , count(distinct case when user_id is not null then receipt_id else null end) as n_with_user_id
#     , count(case when barcode is not null then receipt_id else null end) as n_with_barcode
# from transactions
# ''')

# #check valid user_ids

# cur.execute('''
# select count(distinct transactions.user_id) as n_users, count(*) as n_rows
# from transactions
#     inner join users on transactions.user_id = users.user_id
# ''')

# # cur.execute('''
# # select user_id, count(*) as n_rows
# # from transactions
# # group by user_id 
# # order by n_rows desc 
# # limit 10
# # ''')

# # # top user: 64e62de5ca929250373e6cf5
# # cur.execute('''
# # select *
# # from users
# # where lower(user_id) like '64e62de%'
# # ''')

# # look at time period for data

# cur.execute('''
# select min(scan_date), max(scan_date), min(purchase_date), max(purchase_date)
# from transactions
# ''')

# cur.execute('''
# select *
# from transactions
# where date(scan_date) <> purchase_date
# limit 10
# ''')

# cur.execute('''
# select count(*)
# from transactions
# where date(scan_date) < purchase_date
# ''')

# #figure out the quantity and pricing logic

# cur.execute('''
# select store_name, count(*) as n_rows, min(final_quantity), max(final_quantity), min(final_sale), max(final_sale)
# from transactions
# group by store_name order by n_rows desc
# ''')

# cur.execute('''
# select final_sale, count(*) as n_rows
# from transactions
# where final_quantity = 'zero'
# group by final_sale order by final_sale asc
# ''')

# cur.execute('''
# select min(final_quantity), max(final_quantity)
# from transactions
# where final_quantity <> 'zero'
# ''')

# cur.execute('''
# select case when final_sale = ' ' then 'blank sale' else 'not blank sale' end as sale_flag
# , count(*) as n_rows
# from transactions
# where final_quantity <> 'zero'
# group by sale_flag
# ''')

# Quantity is a float between 0.01 and 9, unless it has the value of 'zero'
# if quantity is 'zero', then there is a final sale value which can is between 0 and max value of 93.67
# if Quantity is not zero, there may or may not be a value for for the final sale. If there isn't, it's equal to ' '. Not blank or NULL


# validity of barcode data

cur.execute('''
select case when p.barcode is not null then 'barcode in products' else 'barcode not in products' end as barcode_flag
, count(*) as n_rows
from transactions as t
    left join products p on t.barcode = p.barcode
where t.barcode is not null
group by barcode_flag
''')

tabulate.tabulate(cur.fetchall(), tablefmt='html')



0,1
barcode in products,24854
barcode not in products,19408


There is no unique key for this table. receipt ID is for the "basket" of items.
100% data coverage for purchase date, scan date, store name and user_id
But only 88% coverage for barcode. The rest of those items do not have barcodes. 

While there is a `user_id` for every row, only a small number (91) of the `user_id`s exist in the `users` table. This will essentially prohibit transaction analysis from using user data.

Transactions are all scanned between June 12th and September 8th of 2024. Purchase dates are all in the same date range. 

The purchase date is often the same day as the scan, but not always. Scan is usually after purchase, but not always. I'd assume that the purchase date is manually entered by the customer and the scan date is machine generated (ie when a receipt is scanned in).

Walmart is by far the most common store, with about 40% of the entries. However, there is a very very long tail. 

In [None]:
# transactions Cleanup

# rename scan_date to scan_at because it's a timestamp, not a date
cur.execute('''
alter table transactions rename column scan_date to scan_at;
''')

# replace final_quantity = 'zero' with NULL
cur.execute('''
update transactions set final_quantity = NULL where final_quantity = 'zero'
''')

# replace final_sale = ' ' with NULL
cur.execute('''
update transactions set final_sale = NULL where final_sale = ' '
''')

<sqlite3.Cursor at 0x117bc9340>