In [1]:
!cd .. && make dataset && cd notebooks

>>> Downloading and extracting data files...
Data files already downloaded.
>>> OK.



-   An order might have multiple items.
-   Each item might be fulfilled by a distinct seller.
-   All text identifying stores and partners where replaced by the names of Game of Thrones great houses.

![](https://i.imgur.com/HRhd2Y0.png)


In [2]:
import pandas as pd


In [3]:
customers_df = pd.read_csv(
    "../data/raw/olist_customers_dataset.csv",
    dtype={
        # Nominal qualitative data
        "customer_id": "category",
        "customer_unique_id": "category",
        "customer_city": "category",
        "customer_state": "category",
        "customer_zip_code_prefix": "category",
    },
)
geolocation_df = pd.read_csv(
    "../data/raw/olist_geolocation_dataset.csv",
    dtype={
        # Nominal qualitative data
        "geolocation_zip_code_prefix": "category",
        "geolocation_city": "category",
        "geolocation_state": "category",
        # Continuous quantitative data
        "geolocation_lat": float,
        "geolocation_lng": float,
    },
)
order_items_df = pd.read_csv(
    "../data/raw/olist_order_items_dataset.csv",
    dtype={
        # Nominal qualitative data
        "order_id": "category",
        "order_item_id": "category",
        "product_id": "category",
        "seller_id": "category",
        # Date data
        "shipping_limit_date": str,
        # Continuous quantitative data
        "price": float,
        "freight_value": float,
    },
    parse_dates=["shipping_limit_date"],
)
order_payments_df = pd.read_csv(
    "../data/raw/olist_order_payments_dataset.csv",
    dtype={
        # Nominal qualitative data
        "order_id": "category",
        "payment_type": "category",
        # Discrete quantitative data
        "payment_sequential": int,
        "payment_installments": int,
        # Continuous quantitative data
        "payment_value": float,
    },
)
order_reviews_df = pd.read_csv(
    "../data/raw/olist_order_reviews_dataset.csv",
    dtype={
        # Nominal qualitative data
        "review_id": "category",
        "order_id": "category",
        # Discrete quantitative data
        "review_score": int,
        # Text data
        "review_comment_title": str,
        "review_comment_message": str,
        # Date data
        "review_creation_date": str,
        "review_answer_timestamp": str,
    },
    parse_dates=["review_creation_date", "review_answer_timestamp"],
)
orders_df = pd.read_csv(
    "../data/raw/olist_orders_dataset.csv",
    dtype={
        # Nominal qualitative data
        "order_id": "category",
        "customer_id": "category",
        "order_status": "category",
        # Date data
        "order_purchase_timestamp": str,
        "order_approved_at": str,
        "order_delivered_carrier_date": str,
        "order_delivered_customer_date": str,
        "order_estimated_delivery_date": str,
    },
    parse_dates=[
        "order_purchase_timestamp",
        "order_approved_at",
        "order_delivered_carrier_date",
        "order_delivered_customer_date",
        "order_estimated_delivery_date",
    ],
)
products_df = pd.read_csv(
    "../data/raw/olist_products_dataset.csv",
    dtype={
        # Nominal qualitative data
        "product_id": "category",
        "product_category_name": "category",
        # Discrete quantitative data
        # Nullable : https://pandas.pydata.org/pandas-docs/stable/user_guide/gotchas.html#support-for-integer-na
        "product_name_lenght": pd.Int64Dtype(),
        "product_description_lenght": pd.Int64Dtype(),
        "product_photos_qty": pd.Int64Dtype(),
        # Continuous quantitative data
        "product_weight_g": float,
        "product_length_cm": float,
        "product_height_cm": float,
        "product_width_cm": float,
    },
)
sellers_df = pd.read_csv(
    "../data/raw/olist_sellers_dataset.csv",
    dtype={
        # Nominal qualitative data
        "seller_id": "category",
        "seller_city": "category",
        "seller_state": "category",
        "seller_zip_code_prefix": "category",
    },
)
category_translation_df = pd.read_csv(
    "../data/raw/product_category_name_translation.csv"
)


In [4]:
sellers_df = (
    sellers_df.merge(
        geolocation_df,
        how="left",
        left_on="seller_zip_code_prefix",
        right_on="geolocation_zip_code_prefix",
    )
    .drop(
        columns=[
            "geolocation_zip_code_prefix",
            "geolocation_city",
            "geolocation_state",
        ],
    )
    .rename(
        columns={
            "geolocation_lat": "seller_lat",
            "geolocation_lng": "seller_lng",
        },
    )
    .drop_duplicates()
    .groupby("seller_id")
    .agg(
        {
            "seller_zip_code_prefix": {
                "first",
                "last",
                "count",
                "nunique",
                "max",
                "min",
            },
            "seller_lat": {
                "first",
                "last",
                "count",
                "nunique",
                "max",
                "min",
                "mean",
                "median",
                "std",
            },
            "seller_lng": {
                "first",
                "last",
                "count",
                "nunique",
                "max",
                "min",
                "mean",
                "median",
                "std",
            },
        }
    )
    .reset_index()
)

sellers_df.head()


Unnamed: 0_level_0,seller_id,seller_zip_code_prefix,seller_zip_code_prefix,seller_zip_code_prefix,seller_zip_code_prefix,seller_zip_code_prefix,seller_zip_code_prefix,seller_lat,seller_lat,seller_lat,seller_lat,seller_lat,seller_lng,seller_lng,seller_lng,seller_lng,seller_lng,seller_lng,seller_lng,seller_lng,seller_lng
Unnamed: 0_level_1,Unnamed: 1_level_1,count,nunique,min,last,max,first,count,min,nunique,...,first,count,min,nunique,mean,last,median,max,std,first
0,0015a82c2db000af6aaaf3ae2ecb0532,139,1,9080,9080,9080,9080,139,-23.655752,139,...,-23.644439,139,-46.549683,139,-46.542293,-46.544876,-46.543113,-46.532709,0.004266,-46.539885
1,001cca7ae9ae17fb1caed9dfb1094831,43,1,29156,29156,29156,29156,43,-20.365605,43,...,-20.297537,43,-40.431168,43,-40.411514,-40.395287,-40.415232,-40.390723,0.010701,-40.400869
2,001e6ad469a905060d959994f1b41e4f,39,1,24754,24754,24754,24754,39,-22.879228,39,...,-22.87404,39,-43.036859,39,-43.027422,-43.019854,-43.027384,-43.019154,0.005532,-43.033089
3,002100f778ceb8431b7a1020ff7ab48f,239,1,14405,14405,14405,14405,239,-20.55099,239,...,-20.506794,239,-47.429739,239,-47.411287,-47.410797,-47.410958,-47.400131,0.005181,-47.407738
4,003554e2dce176b5555353e4f3555ac8,34,1,74565,74565,74565,74565,34,-16.671984,34,...,-16.648412,34,-49.422864,34,-49.281838,-49.214339,-49.274016,-49.198478,0.040235,-49.311191


In [5]:

sellers_df = sellers_df.rename(
    columns={
        "geolocation_lat": "seller_lat",
        "geolocation_lng": "seller_lng",
    },
)
sellers_df = sellers_df.drop_duplicates()
sellers_df = sellers_df.groupby("seller_id")
sellers_df = sellers_df.agg(
    {"seller_lat": "mean", "seller_lng": "mean"}
)
sellers_df = sellers_df.reset_index()


sellers_df.head()


KeyError: "Column(s) ['seller_lat', 'seller_lng'] do not exist"

In [None]:
order_items_df = order_items_df.merge(
    sellers_df,
    how="left",
    left_on="seller_id",
    right_on="seller_id",
)

order_items_df = order_items_df.merge(
    products_df,
    how="left",
    left_on="product_id",
    right_on="product_id",
)

order_items_df.head()


In [None]:
order_items_df.head(20)


In [None]:
customers_df = customers_df.merge(
    geolocation_df,
    how="left",
    left_on="customer_zip_code_prefix",
    right_on="geolocation_zip_code_prefix",
    validate="m:1",
)
