In [72]:
import boto3
import urllib3
import pandas as pd
from io import BytesIO


# Silence urllib3 warnings (such as the ones emitted because of verify=False
# below) so the cell output stays readable.
urllib3.disable_warnings()


# We will use Amazon S3. We create a high-level resource object
# for interacting with AWS.
#
# Credentials are deliberately NOT passed here: boto3 resolves them through its
# default chain (AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment
# variables, the shared ~/.aws/credentials file, ...), so no secret is stored
# inside the notebook.
# NOTE(review): the access key id that used to be hard-coded in this cell was
# committed with the notebook and should be rotated.
#
# verify=False is kept because of the corporate-proxy issue seen in previous
# modules.
# NOTE(review): this disables TLS certificate verification; prefer passing the
# proxy's CA bundle (verify='<path>' or the AWS_CA_BUNDLE variable) if possible.
s3 = boto3.resource('s3', verify=False)

bucket_name = 'zrive-ds-data'
prefix = 'groceries/sampled-datasets/'

# Create a bucket object
bucket = s3.Bucket(bucket_name)

# Every DataFrame that loads successfully is stored here, keyed by file name
# (without extension), so later cells can iterate over all of them.
dfs = {}

# Iterate through the objects stored under the prefix
for obj in bucket.objects.filter(Prefix=prefix):
    key = obj.key

    # Only parquet files are of interest
    if not key.endswith('.parquet'):
        continue

    print(f"-- Reading Parquet file: {key}")

    try:
        # Download the whole object into memory
        response = s3.Object(bucket_name, key).get()
        parquet_bytes = response['Body'].read()

        # Wrap the bytes in a seekable buffer for pandas
        parquet_io = BytesIO(parquet_bytes)

        # File name without directories and without extension
        df_name = key.split('/')[-1].split('.')[0]

        df = pd.read_parquet(parquet_io)

        # Expose the frame both as a global variable df_<name> (later cells
        # refer to df_orders, df_users, ... directly) and inside the dict.
        globals()[f'df_{df_name}'] = df
        dfs[df_name] = df

        print(f"The number of cols and rows is: {df.shape}")
        print(df.head())
        print(f"-- The df named df_{df_name} has been saved\n")

    # Best effort: a failing file is reported and skipped so the remaining
    # files are still loaded.
    except IOError as io_err:
        print(f"IOError reading {key}: {io_err}")
    except pd.errors.ParserError as parser_err:
        print(f"ParserError reading {key}: {parser_err}")
    except TypeError as type_err:
        if "a bytes-like object is required, not 'str'" in str(type_err):
            print(f"TypeError: The Parquet file {key} is not in bytes format.")
        else:
            print(f"TypeError reading {key}: {type_err}")

-- Reading Parquet file: groceries/sampled-datasets/abandoned_carts.parquet
The number of cols and rows is: (5457, 4)
                id                                            user_id  \
0   12858560217220  5c4e5953f13ddc3bc9659a3453356155e5efe4739d7a2b...   
13  20352449839236  9d6187545c005d39e44d0456d87790db18611d7c7379bd...   
45  20478401413252  e83fb0273d70c37a2968fee107113698fd4f389c442c0b...   
50  20481783103620  10c42e10e530284b7c7c50f3a23a98726d5747b8128084...   
52  20485321687172  d9989439524b3f6fc4f41686d043f315fb408b954d6153...   

            created_at                                         variant_id  
0  2020-05-20 13:53:24  [33826459287684, 33826457616516, 3366719212762...  
13 2021-06-27 05:24:13  [34415988179076, 34037940158596, 3450282236326...  
45 2021-07-18 08:23:49  [34543001337988, 34037939372164, 3411360609088...  
50 2021-07-18 21:29:36  [33667268116612, 34037940224132, 3443605520397...  
52 2021-07-19 12:17:05   [33667268083844, 34284950454404, 33973

In [67]:
# We have saved the 5 dfs inside a dict called dfs
# We have saved each df with its name df_{parquet_file}

# EDA (in progress)

We have been given several datasets from a grocery e-commerce platform that sells directly to consumers. There are 5 tables, each holding different information.

- df_orders: The order history of customers. Each row is an order and the
items in the order are stored as a list in the ordered_items column
- df_regulars:  Users are allowed to specify items that they wish to buy
regularly. This data gives the items each user has asked to get regularly, along
with when they input that information.
- df_abandoned_carts: If a user has added items to their basket but not
bought them, we capture that information. Items that were abandoned are stored
as a list in the variant_id column.
- df_inventory: Some information about each item, identified by its variant_id
- df_users: Information about users.

1. Before working with the data, we will inspect each of the dataframes in order to understand the information they contain. Using the description provided for each table, we will try to figure out what the data looks like; then we can start forming hypotheses and proceed to verify them.

2. We will display them, inspect columns and dtypes, and check that the metadata makes sense. After that, we will look for NaN or null values to double-check that data quality is OK. If we find any issue during the process, the knowledge gained so far should let us make a decision and justify the solution applied.

3. Once the data has been cleaned, we may join or transform the tables so they are ready to be used in later steps.

4. Finally, we will state the hypotheses we tested on the data, together with some comments and any additional information we have found.

### Orders

The order history of customers. Each row is an order and the
items in the order are stored as a list in the ordered_items column


In [86]:
# Dimensions (rows, columns) of the orders table, followed by a preview of its
# first rows. df_orders is one of the df_<name> globals created by the loading
# cell.
print(df_orders.shape)
df_orders.head()

(8773, 6)


Unnamed: 0,id,user_id,created_at,order_date,user_order_seq,ordered_items
10,2204073066628,62e271062eb827e411bd73941178d29b022f5f2de9d37f...,2020-04-30 14:32:19,2020-04-30,1,"[33618849693828, 33618860179588, 3361887404045..."
20,2204707520644,bf591c887c46d5d3513142b6a855dd7ffb9cc00697f6f5...,2020-04-30 17:39:00,2020-04-30,1,"[33618835243140, 33618835964036, 3361886244058..."
21,2204838822020,329f08c66abb51f8c0b8a9526670da2d94c0c6eef06700...,2020-04-30 18:12:30,2020-04-30,1,"[33618891145348, 33618893570180, 3361889766618..."
34,2208967852164,f6451fce7b1c58d0effbe37fcb4e67b718193562766470...,2020-05-01 19:44:11,2020-05-01,1,"[33618830196868, 33618846580868, 3361891234624..."
49,2215889436804,68e872ff888303bff58ec56a3a986f77ddebdbe5c279e7...,2020-05-03 21:56:14,2020-05-03,1,"[33667166699652, 33667166699652, 3366717122163..."


In [78]:
# Column names, dtypes and non-null counts of the orders table.
df_orders.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8773 entries, 10 to 64538
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   id              8773 non-null   int64         
 1   user_id         8773 non-null   object        
 2   created_at      8773 non-null   datetime64[ns]
 3   order_date      8773 non-null   datetime64[ns]
 4   user_order_seq  8773 non-null   int64         
 5   ordered_items   8773 non-null   object        
dtypes: datetime64[ns](2), int64(2), object(2)
memory usage: 479.8+ KB


In [37]:
# Summary statistics for the numeric and datetime columns of the orders table.
print(df_orders.describe())

                 id                     created_at  \
count  8.773000e+03                           8773   
mean   3.684684e+12  2021-08-22 03:54:18.750028288   
min    2.204073e+12            2020-04-30 14:32:19   
25%    3.690255e+12            2021-04-25 11:50:37   
50%    3.846692e+12            2021-10-11 11:29:44   
75%    3.945086e+12            2022-01-03 18:14:23   
max    4.026732e+12            2022-03-14 00:24:59   
std    4.145437e+11                            NaN   

                          order_date  user_order_seq  
count                           8773     8773.000000  
mean   2021-08-21 12:47:21.262966016        2.445116  
min              2020-04-30 00:00:00        1.000000  
25%              2021-04-25 00:00:00        1.000000  
50%              2021-10-11 00:00:00        1.000000  
75%              2022-01-03 00:00:00        3.000000  
max              2022-03-14 00:00:00       25.000000  
std                              NaN        2.707693  


### Regulars

Users are allowed to specify items that they wish to buy
regularly. This data gives the items each user has asked to get regularly, along
with when they input that information.

In [89]:
# Dimensions (rows, columns) of the regulars table, followed by a preview of
# its first rows.
print(df_regulars.shape)
df_regulars.head()

(18105, 3)


Unnamed: 0,user_id,variant_id,created_at
3,68e872ff888303bff58ec56a3a986f77ddebdbe5c279e7...,33618848088196,2020-04-30 15:07:03
11,aed88fc0b004270a62ff1fe4b94141f6b1db1496dbb0c0...,33667178659972,2020-05-05 23:34:35
18,68e872ff888303bff58ec56a3a986f77ddebdbe5c279e7...,33619009208452,2020-04-30 15:07:03
46,aed88fc0b004270a62ff1fe4b94141f6b1db1496dbb0c0...,33667305373828,2020-05-05 23:34:35
47,4594e99557113d5a1c5b59bf31b8704aafe5c7bd180b32...,33667247341700,2020-05-06 14:42:11


### Abandoned cart

If a user has added items to their basket but not
bought them, we capture that information. Items that were abandoned are stored
as a list in the variant_id column.


In [90]:
# Dimensions (rows, columns) of the abandoned-carts table, followed by a
# preview of its first rows.
print(df_abandoned_carts.shape)
df_abandoned_carts.head()

(5457, 4)


Unnamed: 0,id,user_id,created_at,variant_id
0,12858560217220,5c4e5953f13ddc3bc9659a3453356155e5efe4739d7a2b...,2020-05-20 13:53:24,"[33826459287684, 33826457616516, 3366719212762..."
13,20352449839236,9d6187545c005d39e44d0456d87790db18611d7c7379bd...,2021-06-27 05:24:13,"[34415988179076, 34037940158596, 3450282236326..."
45,20478401413252,e83fb0273d70c37a2968fee107113698fd4f389c442c0b...,2021-07-18 08:23:49,"[34543001337988, 34037939372164, 3411360609088..."
50,20481783103620,10c42e10e530284b7c7c50f3a23a98726d5747b8128084...,2021-07-18 21:29:36,"[33667268116612, 34037940224132, 3443605520397..."
52,20485321687172,d9989439524b3f6fc4f41686d043f315fb408b954d6153...,2021-07-19 12:17:05,"[33667268083844, 34284950454404, 33973246886020]"


### Inventory parquet

Some info about each item id

In [91]:
# Dimensions (rows, columns) of the inventory table, followed by a preview of
# its first rows.
print(df_inventory.shape)
df_inventory.head()

(1733, 6)


Unnamed: 0,variant_id,price,compare_at_price,vendor,product_type,tags
0,39587297165444,3.09,3.15,heinz,condiments-dressings,"[table-sauces, vegan]"
1,34370361229444,4.99,5.5,whogivesacrap,toilet-roll-kitchen-roll-tissue,"[b-corp, eco, toilet-rolls]"
2,34284951863428,3.69,3.99,plenty,toilet-roll-kitchen-roll-tissue,[kitchen-roll]
3,33667283583108,1.79,1.99,thecheekypanda,toilet-roll-kitchen-roll-tissue,"[b-corp, cruelty-free, eco, tissue, vegan]"
4,33803537973380,1.99,2.09,colgate,dental,[dental-accessories]


### Users

Information about users

In [92]:
# Dimensions (rows, columns) of the users table, followed by a preview of its
# first rows.
print(df_users.shape)
df_users.head()

(4983, 10)


Unnamed: 0,user_id,user_segment,user_nuts1,first_ordered_at,customer_cohort_month,count_people,count_adults,count_children,count_babies,count_pets
2160,0e823a42e107461379e5b5613b7aa00537a72e1b0eaa7a...,Top Up,UKH,2021-05-08 13:33:49,2021-05-01 00:00:00,,,,,
1123,15768ced9bed648f745a7aa566a8895f7a73b9a47c1d4f...,Top Up,UKJ,2021-11-17 16:30:20,2021-11-01 00:00:00,,,,,
1958,33e0cb6eacea0775e34adbaa2c1dec16b9d6484e6b9324...,Top Up,UKD,2022-03-09 23:12:25,2022-03-01 00:00:00,,,,,
675,57ca7591dc79825df0cecc4836a58e6062454555c86c35...,Top Up,UKI,2021-04-23 16:29:02,2021-04-01 00:00:00,,,,,
4694,085d8e598139ce6fc9f75d9de97960fa9e1457b409ec00...,Top Up,UKJ,2021-11-02 13:50:06,2021-11-01 00:00:00,,,,,


In [93]:
def assess_NA(data):
    """
    Summarise the missing values of each column of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` (the column names are on the index)
        with two columns: 'Number of NA' (count of null cells) and
        'Percent NA' (that count as a percentage of the number of rows,
        rounded to 2 decimals). Rows are ordered from the most to the fewest
        missing values; ties keep the column order of ``data``.
    """
    # Per-column count of null cells and the matching percentage of rows
    null_sum = data.isnull().sum()
    percent = ((null_sum / len(data.index)) * 100).round(2)

    # Both series share the same index, so they line up row by row
    df_NA = pd.concat([null_sum, percent], axis=1, keys=['Number of NA', 'Percent NA'])

    # Sort once, on the final frame. Sorting each series and then concatenating
    # with sort=True re-ordered the index alphabetically and lost the
    # "most missing first" ordering. mergesort is stable, so ties are
    # deterministic.
    return df_NA.sort_values('Number of NA', ascending=False, kind='mergesort')

In [109]:
# Print general information about each loaded DataFrame (structure, summary
# statistics) followed by its missing-value report from assess_NA.
for df_name, df in dfs.items():
    print(f"--------- FILENAME: df_{df_name}\n")

    # DataFrame.info() writes its report to stdout and returns None, so it is
    # called on its own; wrapping it in print() added a stray "None" line.
    df.info()
    print()
    print(df.describe(), "\n")

    result = assess_NA(df)
    print(result.head(), "\n\n")

--------- FILENAME: df_abandoned_carts

<class 'pandas.core.frame.DataFrame'>
Index: 5457 entries, 0 to 70050
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   id          5457 non-null   int64         
 1   user_id     5457 non-null   object        
 2   created_at  5457 non-null   datetime64[ns]
 3   variant_id  5457 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 213.2+ KB
None 

                 id                     created_at
count  5.457000e+03                           5457
mean   2.161881e+13  2021-12-20 11:07:10.198460672
min    1.285856e+13            2020-05-20 13:53:24
25%    2.133401e+13            2021-11-13 19:52:17
50%    2.167062e+13            2021-12-27 13:14:57
75%    2.192303e+13            2022-01-30 08:35:19
max    2.223385e+13            2022-03-13 14:12:10
std    4.028679e+11                            NaN 

            Number of NA  Perce