In [18]:
import boto3
import urllib3
import pandas as pd
from io import BytesIO


# Silence only the InsecureRequestWarning that verify=False (below) triggers on
# every request; any other urllib3 warning stays visible.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


# We will use Amazon S3. We create a high-level resource object
# for interacting with AWS.
#
# Credentials are intentionally NOT written in the notebook: boto3 resolves
# them through its default credential chain (the AWS_ACCESS_KEY_ID /
# AWS_SECRET_ACCESS_KEY environment variables, the shared ~/.aws/credentials
# file, ...), so configure one of those before starting the kernel.
# NOTE(review): an access key id used to be committed in this cell; treat it
# as leaked and rotate it.
#
# verify=False disables TLS certificate validation. It is kept only because of
# the corporate proxy issue seen in previous modules.
# NOTE(review): pointing AWS_CA_BUNDLE at the proxy's CA certificate should
# make this unnecessary -- confirm and drop verify=False if so.
s3 = boto3.resource('s3', verify=False)

bucket_name = 'zrive-ds-data'
prefix = 'groceries/sampled-datasets/'

# Create a bucket object
bucket = s3.Bucket(bucket_name)

# Every frame that loads successfully, keyed by file stem
# (e.g. 'orders' for 'groceries/sampled-datasets/orders.parquet').
dataframes = {}

# Iterate through the objects stored under the prefix
for obj in bucket.objects.filter(Prefix=prefix):
    key = obj.key

    # Only parquet objects are loaded; anything else under the prefix is skipped.
    if not key.endswith('.parquet'):
        continue

    print(f"-- Reading Parquet file: {key}")

    # Loading is best-effort: a file that fails is reported and skipped so the
    # remaining files are still read.
    try:
        # Download the whole object and wrap it in BytesIO so pandas gets a
        # seekable file-like object.
        parquet_bytes = s3.Object(bucket_name, key).get()['Body'].read()
        df = pd.read_parquet(BytesIO(parquet_bytes))
    except OSError as io_err:
        print(f"IOError reading {key}: {io_err}")
        continue
    except ValueError as value_err:
        # ValueError is the base class of pd.errors.ParserError, so this is a
        # superset of the former ParserError handler.
        # NOTE(review): invalid parquet data is presumably reported by the
        # engine as a ValueError subclass -- confirm for the engine in use.
        print(f"{type(value_err).__name__} reading {key}: {value_err}")
        continue
    except TypeError as type_err:
        if "a bytes-like object is required, not 'str'" in str(type_err):
            print(f"TypeError: The Parquet file {key} is not in bytes format.")
        else:
            print(f"TypeError reading {key}: {type_err}")
        continue

    # File stem: last path component, up to its first dot.
    df_name = key.split('/')[-1].split('.')[0]
    dataframes[df_name] = df

    # Later cells refer to df_orders, df_users, ... so each frame is also
    # published under a module-level df_<name> variable.
    globals()[f'df_{df_name}'] = df

    # DataFrame.shape is (rows, cols).
    print(f"The number of rows and cols is: {df.shape}")
    print(df.head())
    print(f"-- The df named df_{df_name} has been saved\n")

if not dataframes:
    print(f"No parquet files were loaded from s3://{bucket_name}/{prefix}")

-- Reading Parquet file: groceries/sampled-datasets/abandoned_carts.parquet
The number of cols and rows is: (5457, 4)
                id                                            user_id  \
0   12858560217220  5c4e5953f13ddc3bc9659a3453356155e5efe4739d7a2b...   
13  20352449839236  9d6187545c005d39e44d0456d87790db18611d7c7379bd...   
45  20478401413252  e83fb0273d70c37a2968fee107113698fd4f389c442c0b...   
50  20481783103620  10c42e10e530284b7c7c50f3a23a98726d5747b8128084...   
52  20485321687172  d9989439524b3f6fc4f41686d043f315fb408b954d6153...   

            created_at                                         variant_id  
0  2020-05-20 13:53:24  [33826459287684, 33826457616516, 3366719212762...  
13 2021-06-27 05:24:13  [34415988179076, 34037940158596, 3450282236326...  
45 2021-07-18 08:23:49  [34543001337988, 34037939372164, 3411360609088...  
50 2021-07-18 21:29:36  [33667268116612, 34037940224132, 3443605520397...  
52 2021-07-19 12:17:05   [33667268083844, 34284950454404, 33973

### Orders

The order history of customers. Each row is an order, and the items in the
order are stored as a list in the `ordered_items` column.


In [25]:

# Preview the first five rows of the orders frame (df_orders is expected to
# have been created by the S3 loading cell).
df_orders.head(n=5)

Unnamed: 0,id,user_id,created_at,order_date,user_order_seq,ordered_items
10,2204073066628,62e271062eb827e411bd73941178d29b022f5f2de9d37f...,2020-04-30 14:32:19,2020-04-30,1,"[33618849693828, 33618860179588, 3361887404045..."
20,2204707520644,bf591c887c46d5d3513142b6a855dd7ffb9cc00697f6f5...,2020-04-30 17:39:00,2020-04-30,1,"[33618835243140, 33618835964036, 3361886244058..."
21,2204838822020,329f08c66abb51f8c0b8a9526670da2d94c0c6eef06700...,2020-04-30 18:12:30,2020-04-30,1,"[33618891145348, 33618893570180, 3361889766618..."
34,2208967852164,f6451fce7b1c58d0effbe37fcb4e67b718193562766470...,2020-05-01 19:44:11,2020-05-01,1,"[33618830196868, 33618846580868, 3361891234624..."
49,2215889436804,68e872ff888303bff58ec56a3a986f77ddebdbe5c279e7...,2020-05-03 21:56:14,2020-05-03,1,"[33667166699652, 33667166699652, 3366717122163..."


### Regulars

Users are allowed to specify items that they wish to buy
regularly. This data gives the items each user has asked to get regularly, along
with when they input that information.

In [26]:
# Preview the first five rows of the regulars frame (df_regulars is expected
# to have been created by the S3 loading cell).
df_regulars.head(n=5)

Unnamed: 0,user_id,variant_id,created_at
3,68e872ff888303bff58ec56a3a986f77ddebdbe5c279e7...,33618848088196,2020-04-30 15:07:03
11,aed88fc0b004270a62ff1fe4b94141f6b1db1496dbb0c0...,33667178659972,2020-05-05 23:34:35
18,68e872ff888303bff58ec56a3a986f77ddebdbe5c279e7...,33619009208452,2020-04-30 15:07:03
46,aed88fc0b004270a62ff1fe4b94141f6b1db1496dbb0c0...,33667305373828,2020-05-05 23:34:35
47,4594e99557113d5a1c5b59bf31b8704aafe5c7bd180b32...,33667247341700,2020-05-06 14:42:11


### Abandoned cart

If a user has added items to their basket but not
bought them, we capture that information. Items that were abandoned are stored
as a list in the `variant_id` column.


In [27]:
# Preview the first five rows of the abandoned-carts frame (df_abandoned_carts
# is expected to have been created by the S3 loading cell).
df_abandoned_carts.head(n=5)

Unnamed: 0,id,user_id,created_at,variant_id
0,12858560217220,5c4e5953f13ddc3bc9659a3453356155e5efe4739d7a2b...,2020-05-20 13:53:24,"[33826459287684, 33826457616516, 3366719212762..."
13,20352449839236,9d6187545c005d39e44d0456d87790db18611d7c7379bd...,2021-06-27 05:24:13,"[34415988179076, 34037940158596, 3450282236326..."
45,20478401413252,e83fb0273d70c37a2968fee107113698fd4f389c442c0b...,2021-07-18 08:23:49,"[34543001337988, 34037939372164, 3411360609088..."
50,20481783103620,10c42e10e530284b7c7c50f3a23a98726d5747b8128084...,2021-07-18 21:29:36,"[33667268116612, 34037940224132, 3443605520397..."
52,20485321687172,d9989439524b3f6fc4f41686d043f315fb408b954d6153...,2021-07-19 12:17:05,"[33667268083844, 34284950454404, 33973246886020]"


### Inventory parquet

Some information about each item (`variant_id`): price, compare-at price, vendor, product type and tags.

In [28]:
# Preview the first five rows of the inventory frame (df_inventory is expected
# to have been created by the S3 loading cell).
df_inventory.head(n=5)

Unnamed: 0,variant_id,price,compare_at_price,vendor,product_type,tags
0,39587297165444,3.09,3.15,heinz,condiments-dressings,"[table-sauces, vegan]"
1,34370361229444,4.99,5.5,whogivesacrap,toilet-roll-kitchen-roll-tissue,"[b-corp, eco, toilet-rolls]"
2,34284951863428,3.69,3.99,plenty,toilet-roll-kitchen-roll-tissue,[kitchen-roll]
3,33667283583108,1.79,1.99,thecheekypanda,toilet-roll-kitchen-roll-tissue,"[b-corp, cruelty-free, eco, tissue, vegan]"
4,33803537973380,1.99,2.09,colgate,dental,[dental-accessories]


### Users

Information about users

In [29]:
# Preview the first five rows of the users frame (df_users is expected to
# have been created by the S3 loading cell).
df_users.head(n=5)

Unnamed: 0,user_id,user_segment,user_nuts1,first_ordered_at,customer_cohort_month,count_people,count_adults,count_children,count_babies,count_pets
2160,0e823a42e107461379e5b5613b7aa00537a72e1b0eaa7a...,Top Up,UKH,2021-05-08 13:33:49,2021-05-01 00:00:00,,,,,
1123,15768ced9bed648f745a7aa566a8895f7a73b9a47c1d4f...,Top Up,UKJ,2021-11-17 16:30:20,2021-11-01 00:00:00,,,,,
1958,33e0cb6eacea0775e34adbaa2c1dec16b9d6484e6b9324...,Top Up,UKD,2022-03-09 23:12:25,2022-03-01 00:00:00,,,,,
675,57ca7591dc79825df0cecc4836a58e6062454555c86c35...,Top Up,UKI,2021-04-23 16:29:02,2021-04-01 00:00:00,,,,,
4694,085d8e598139ce6fc9f75d9de97960fa9e1457b409ec00...,Top Up,UKJ,2021-11-02 13:50:06,2021-11-01 00:00:00,,,,,
