In [1]:
import numpy as np
import pandas as pd
import polars as pl
import plotly.express as px

In [2]:
prior_product_orders = pl.read_csv("/kaggle/input/instacart-market-basket-analysis/order_products__prior.csv")
train_product_orders = pl.read_csv("/kaggle/input/instacart-market-basket-analysis/order_products__train.csv")
departments_df = pl.read_csv("/kaggle/input/instacart-market-basket-analysis/departments.csv")
aisels_df = pl.read_csv("/kaggle/input/instacart-market-basket-analysis/aisles.csv")
orders_df = pl.read_csv("/kaggle/input/instacart-market-basket-analysis/orders.csv")
products_df = pl.read_csv("/kaggle/input/instacart-market-basket-analysis/products.csv")

In [3]:
df_list = [prior_product_orders,orders_df,train_product_orders,products_df,aisels_df,departments_df]

In [4]:
prior_product_orders.schema

Schema([('order_id', Int64),
        ('product_id', Int64),
        ('add_to_cart_order', Int64),
        ('reordered', Int64)])

In [5]:
orders_df.schema

Schema([('order_id', Int64),
        ('user_id', Int64),
        ('eval_set', String),
        ('order_number', Int64),
        ('order_dow', Int64),
        ('order_hour_of_day', Int64),
        ('days_since_prior_order', Float64)])

In [6]:
aisels_df.schema

Schema([('aisle_id', Int64), ('aisle', String)])

In [7]:
products_df.schema

Schema([('product_id', Int64),
        ('product_name', String),
        ('aisle_id', Int64),
        ('department_id', Int64)])

In [8]:
# let's check for null values.
for df in df_list:
    print(df.null_count())

# as we can see only days_since_prior_order contains null as first order hasn't any prior orders.

shape: (1, 4)
┌──────────┬────────────┬───────────────────┬───────────┐
│ order_id ┆ product_id ┆ add_to_cart_order ┆ reordered │
│ ---      ┆ ---        ┆ ---               ┆ ---       │
│ u32      ┆ u32        ┆ u32               ┆ u32       │
╞══════════╪════════════╪═══════════════════╪═══════════╡
│ 0        ┆ 0          ┆ 0                 ┆ 0         │
└──────────┴────────────┴───────────────────┴───────────┘
shape: (1, 7)
┌──────────┬─────────┬──────────┬──────────────┬───────────┬───────────────────┬───────────────────┐
│ order_id ┆ user_id ┆ eval_set ┆ order_number ┆ order_dow ┆ order_hour_of_day ┆ days_since_prior_ │
│ ---      ┆ ---     ┆ ---      ┆ ---          ┆ ---       ┆ ---               ┆ order             │
│ u32      ┆ u32     ┆ u32      ┆ u32          ┆ u32       ┆ u32               ┆ ---               │
│          ┆         ┆          ┆              ┆           ┆                   ┆ u32               │
╞══════════╪═════════╪══════════╪══════════════╪═══════════╪═

In [9]:
# let's check for the label consistency 

print(prior_product_orders['reordered'].n_unique())
print(train_product_orders['reordered'].n_unique())

2
2


In [10]:
# let's check differenct eval_set

orders_df['eval_set'].unique()

eval_set
str
"""test"""
"""train"""
"""prior"""


In [11]:
# let's check consistency of the order_df
print(orders_df['order_dow'].unique())
orders_df['order_hour_of_day'].unique()

shape: (7,)
Series: 'order_dow' [i64]
[
	0
	1
	2
	3
	4
	5
	6
]


order_hour_of_day
i64
0
1
2
3
4
…
19
20
21
22


In [12]:
# days_since_prior_orders is also looking convenient

orders_df['days_since_prior_order'].max()

30.0