# Data Analysis Notebook

This notebook demonstrates how to use the `DataLoader` class from the `load_data.py` script to load and analyze the e-commerce dataset.

In [22]:
# Import Required Libraries and Load Script
import pandas as pd
import sys
from pathlib import Path



# Add the scripts directory to the system path so ``load_data`` can be imported.
# The relative path is resolved against the kernel's current working directory
# (presumably the notebook's own folder — TODO confirm).
scripts_dir = str(Path("../scripts").resolve())
if scripts_dir not in sys.path:  # keep the cell idempotent: no duplicate entries on re-run
    sys.path.append(scripts_dir)

from load_data import DataLoader

# Initialize DataLoader with its default configuration
loader = DataLoader()

In [None]:
# Download Dataset
# Ask the loader to fetch the dataset into "data/raw". No dataset identifier is
# passed in this call, so the identifier comes from DataLoader itself
# (presumably a default in load_data.py — verify there). The saved run of this
# cell failed with a 404, so that identifier needs to be updated to a valid one.
# NOTE(review): other cells read from "../data/raw"; confirm how DataLoader
# resolves this relative path so both refer to the same folder.
loader.download_dataset(download_path="data/raw")

✗ Error downloading dataset: 404 Client Error.

Resource not found at URL: https://www.kaggle.com/datasets/bytadit/ecommerce-order-dataset/versions/1
The server reported the following issues: Dataset not found
Please make sure you specified the correct resource identifiers.


In [23]:
# List Files in Dataset
# The bare expression leaves the return value of list_files() as the cell output.
# In the saved run it printed a "not downloaded yet" notice and returned [].
loader.list_files()

Dataset not downloaded yet. Call download_dataset() first.


[]

In [25]:
# Explore DataFrames
# No earlier cell assigns `dataframes` at notebook scope, so on a fresh kernel
# this cell used to fail with NameError. Fall back to an empty mapping and say
# so, instead of crashing a Restart & Run All.
dataframes = globals().get("dataframes", {})
if not dataframes:
    print("No DataFrames loaded yet; populate `dataframes` before running this cell.")

for filename, df in dataframes.items():
    print(f"\n{filename}:")
    print(df.head())
    # DataFrame.info() writes its summary to stdout and returns None, so it
    # must not be wrapped in print() (that would emit a stray "None").
    df.info()
    print(df.describe())

In [None]:
# Verify CSV files in the raw data folder
import os

raw_data_path = "../data/raw"

# List all files in the test and train subfolders
print("Files in raw data folder (test and train):")
for subfolder in ["test", "train"]:
    folder_path = os.path.join(raw_data_path, subfolder)
    print(f"\n{subfolder} folder:")
    if not os.path.isdir(folder_path):
        # os.listdir would raise FileNotFoundError here; report the gap and
        # keep checking the remaining subfolder instead of aborting the cell.
        print(f"(missing: {folder_path})")
        continue
    # os.listdir returns entries in arbitrary order; sort for a stable listing.
    for file in sorted(os.listdir(folder_path)):
        print(file)

Files in raw data folder:
Ecommerce Order Dataset
.DS_Store
.gitkeep


In [30]:
# Load DataFrames from test and train folders
import pandas as pd

# Relative paths to the folders holding the test split and the train split
test_folder, train_folder = "../data/raw/test", "../data/raw/train"

def load_csv_from_folder(folder_path):
    """Read every ``.csv`` file directly inside ``folder_path`` into a DataFrame.

    Args:
        folder_path: Directory to scan. Only its immediate entries are
            considered; subdirectories are not searched.

    Returns:
        dict mapping each CSV file name (including the ``.csv`` extension) to
        the DataFrame read from it, in alphabetical file-name order.

    Raises:
        FileNotFoundError: If ``folder_path`` does not exist (from ``os.listdir``).
    """
    # Imported locally so the function does not depend on some other cell
    # having executed ``import os`` first.
    import os

    dataframes = {}
    # os.listdir yields entries in arbitrary order; sorting makes both the
    # returned mapping and the "Loaded ..." log reproducible across runs.
    for file in sorted(os.listdir(folder_path)):
        if file.endswith('.csv'):
            file_path = os.path.join(folder_path, file)
            df = pd.read_csv(file_path)
            dataframes[file] = df
            print(f"Loaded {file}: {df.shape}")
    return dataframes

# Read both splits into {file name: DataFrame} mappings (test first, then train)
test_data = load_csv_from_folder(test_folder)
train_data = load_csv_from_folder(train_folder)

# Preview the leading rows of every table in the test split ...
for name in test_data.keys():
    df = test_data[name]
    print(f"Test DataFrame: {name}")
    display(df.head())

# ... and then of every table in the train split
for name in train_data.keys():
    df = train_data[name]
    print(f"Train DataFrame: {name}")
    display(df.head())

Loaded df_Customers.csv: (38279, 4)
Loaded df_Products.csv: (38279, 6)
Loaded df_Payments.csv: (38279, 5)
Loaded df_Orders.csv: (38279, 4)
Loaded df_OrderItems.csv: (38279, 5)
Loaded df_Orders.csv: (38279, 4)
Loaded df_OrderItems.csv: (38279, 5)
Loaded df_Customers.csv: (89316, 4)
Loaded df_Products.csv: (89316, 6)
Loaded df_Customers.csv: (89316, 4)
Loaded df_Products.csv: (89316, 6)
Loaded df_Payments.csv: (89316, 5)
Loaded df_Payments.csv: (89316, 5)
Loaded df_Orders.csv: (89316, 7)
Loaded df_OrderItems.csv: (89316, 5)
Test DataFrame: df_Customers.csv
Loaded df_Orders.csv: (89316, 7)
Loaded df_OrderItems.csv: (89316, 5)
Test DataFrame: df_Customers.csv


Unnamed: 0,customer_id,customer_zip_code_prefix,customer_city,customer_state
0,I74lXDOfoqsp,6020,goiania,GO
1,47TuLHF2s7X5,23020,viamao,RS
2,dQ0dqI8Qwlj8,75094,campinas,SP
3,iQCmWhNkIczb,89284,santana de parnaiba,SP
4,Dp2g6JH8tO5Z,39810,aripuana,MT


Test DataFrame: df_Products.csv


Unnamed: 0,product_id,product_category_name,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,1slxdgbgWFax,toys,50.0,16.0,5.0,11.0
1,77PgsiElQLeB,electronics,200.0,21.0,7.0,14.0
2,QVlD26X1y7NI,furniture_decor,1000.0,100.0,5.0,20.0
3,yWlFGkKYfrpa,toys,8950.0,40.0,30.0,40.0
4,h6MCbrwh5kiC,toys,2301.0,32.0,35.0,34.0


Test DataFrame: df_Payments.csv


Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
0,u6rPMRAYIGig,1,credit_card,2,155.77
1,ohY8f4FEbX19,1,credit_card,1,4.07
2,I28liQek73i2,1,wallet,1,381.59
3,bBG1T89mlY8W,1,credit_card,3,14.76
4,CYxJJSQS8Lbo,1,wallet,1,284.09


Test DataFrame: df_Orders.csv


Unnamed: 0,order_id,customer_id,order_purchase_timestamp,order_approved_at
0,u6rPMRAYIGig,I74lXDOfoqsp,2017-11-18 12:29:57,2017-11-18 12:46:08
1,ohY8f4FEbX19,47TuLHF2s7X5,2018-06-02 17:13:12,2018-06-02 20:12:23
2,I28liQek73i2,dQ0dqI8Qwlj8,2018-01-08 11:01:30,2018-01-09 07:24:03
3,bBG1T89mlY8W,iQCmWhNkIczb,2017-03-10 10:24:46,2017-03-10 10:24:46
4,CYxJJSQS8Lbo,Dp2g6JH8tO5Z,2017-12-02 10:04:07,2017-12-05 04:13:30


Test DataFrame: df_OrderItems.csv


Unnamed: 0,order_id,product_id,seller_id,price,shipping_charges
0,u6rPMRAYIGig,1slxdgbgWFax,3jwvL6ihC45G,24.1,20.9
1,ohY8f4FEbX19,77PgsiElQLeB,GlLj704QXlDB,42.89,12.28
2,I28liQek73i2,QVlD26X1y7NI,V3iKL8r9W9NR,50.21,67.11
3,bBG1T89mlY8W,yWlFGkKYfrpa,RNBdBKsXebna,89.1,62.05
4,CYxJJSQS8Lbo,h6MCbrwh5kiC,5Ja2lH0N2OZt,2139.99,9.41


Train DataFrame: df_Customers.csv


Unnamed: 0,customer_id,customer_zip_code_prefix,customer_city,customer_state
0,hCT0x9JiGXBQ,58125,varzea paulista,SP
1,PxA7fv9spyhx,3112,armacao dos buzios,RJ
2,g3nXeJkGI0Qw,4119,jandira,SP
3,EOEsCQ6QlpIg,18212,uberlandia,MG
4,mVz5LO2Vd6cL,88868,ilhabela,SP


Train DataFrame: df_Products.csv


Unnamed: 0,product_id,product_category_name,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,90K0C1fIyQUf,toys,491.0,19.0,12.0,16.0
1,qejhpMGGVcsl,watches_gifts,440.0,18.0,14.0,17.0
2,qUS5d2pEAyxJ,costruction_tools_garden,2200.0,16.0,16.0,16.0
3,639iGvMyv0De,toys,1450.0,68.0,3.0,48.0
4,1lycYGcsic2F,toys,300.0,17.0,4.0,12.0


Train DataFrame: df_Payments.csv


Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
0,Axfy13Hk4PIk,1,credit_card,1,259.14
1,v6px92oS8cLG,1,credit_card,8,382.39
2,Ulpf9skrhjfm,1,credit_card,4,249.25
3,bwJVWupf2keN,1,credit_card,2,27.79
4,Dd0QnrMk9Cj5,1,credit_card,1,76.15


Train DataFrame: df_Orders.csv


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_timestamp,order_estimated_delivery_date
0,Axfy13Hk4PIk,hCT0x9JiGXBQ,delivered,2017-10-22 18:57:54,2017-10-22 19:14:13,2017-10-26 22:19:52,2017-11-09
1,v6px92oS8cLG,PxA7fv9spyhx,delivered,2018-06-20 21:40:31,2018-06-20 22:20:20,2018-07-03 22:51:22,2018-07-24
2,Ulpf9skrhjfm,g3nXeJkGI0Qw,delivered,2018-02-16 16:19:31,2018-02-17 16:15:35,2018-02-27 01:29:50,2018-03-08
3,bwJVWupf2keN,EOEsCQ6QlpIg,delivered,2018-08-18 18:04:29,2018-08-18 18:15:16,2018-08-27 20:03:51,2018-09-19
4,Dd0QnrMk9Cj5,mVz5LO2Vd6cL,delivered,2017-12-22 16:44:04,2017-12-22 17:31:31,2018-01-05 19:22:49,2018-01-18


Train DataFrame: df_OrderItems.csv


Unnamed: 0,order_id,product_id,seller_id,price,shipping_charges
0,Axfy13Hk4PIk,90K0C1fIyQUf,ZWM05J9LcBSF,223.51,84.65
1,v6px92oS8cLG,qejhpMGGVcsl,IjlpYfhUbRQs,170.8,23.79
2,Ulpf9skrhjfm,qUS5d2pEAyxJ,77p2EYxcM9MD,64.4,17.38
3,bwJVWupf2keN,639iGvMyv0De,jWzS0ayv9TGf,264.5,30.72
4,Dd0QnrMk9Cj5,1lycYGcsic2F,l1pYW6GBnPMr,779.9,30.66


# Data Exploration and Analysis

In this section, we will explore and analyze the test and train datasets. The steps include:
1. Inspecting the data (head, info, describe).
2. Checking for missing values.
3. Checking for duplicates.
4. Visualizing distributions and relationships.
5. Preparing the data for modeling.

In [31]:
# Step 1: Inspect the Data
# For every table show the first rows, the column/dtype summary, and the
# descriptive statistics. DataFrame.info() prints its own report and returns
# None, so it is called directly; wrapping it in print() emitted a stray "None".
for name, df in train_data.items():
    print(f"Train DataFrame: {name}")
    print(df.head())
    df.info()
    print(df.describe())

for name, df in test_data.items():
    print(f"Test DataFrame: {name}")
    print(df.head())
    df.info()
    print(df.describe())

Train DataFrame: df_Customers.csv
    customer_id  customer_zip_code_prefix       customer_city customer_state
0  hCT0x9JiGXBQ                     58125     varzea paulista             SP
1  PxA7fv9spyhx                      3112  armacao dos buzios             RJ
2  g3nXeJkGI0Qw                      4119             jandira             SP
3  EOEsCQ6QlpIg                     18212          uberlandia             MG
4  mVz5LO2Vd6cL                     88868            ilhabela             SP
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89316 entries, 0 to 89315
Data columns (total 4 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   customer_id               89316 non-null  object
 1   customer_zip_code_prefix  89316 non-null  int64 
 2   customer_city             89316 non-null  object
 3   customer_state            89316 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.7+ MB
None
       customer_z

In [32]:
# Step 2: Check for Missing Values
# Report the per-column count of missing entries, train tables first.
for name in train_data:
    df = train_data[name]
    print(f"Missing values in Train DataFrame {name}:")
    missing_per_column = df.isna().sum()
    print(missing_per_column)

# Same report for the test tables.
for name in test_data:
    df = test_data[name]
    print(f"Missing values in Test DataFrame {name}:")
    missing_per_column = df.isna().sum()
    print(missing_per_column)

Missing values in Train DataFrame df_Customers.csv:
customer_id                 0
customer_zip_code_prefix    0
customer_city               0
customer_state              0
dtype: int64
Missing values in Train DataFrame df_Products.csv:
product_id                 0
product_category_name    308
product_weight_g          15
product_length_cm         15
product_height_cm         15
product_width_cm          15
dtype: int64
Missing values in Train DataFrame df_Payments.csv:
order_id                0
payment_sequential      0
payment_type            0
payment_installments    0
payment_value           0
dtype: int64
Missing values in Train DataFrame df_Orders.csv:
order_id                            0
customer_id                         0
order_status                        0
order_purchase_timestamp            0
order_approved_at                   9
order_delivered_timestamp        1889
order_estimated_delivery_date       0
dtype: int64
Missing values in Train DataFrame df_OrderItems.csv:
or

In [33]:
# Step 3: Check for Duplicates
# Count fully duplicated rows in each table, train split first.
for name in train_data:
    df = train_data[name]
    print(f"Duplicates in Train DataFrame {name}: {df.duplicated().sum()}")

# Same count for each table of the test split.
for name in test_data:
    df = test_data[name]
    print(f"Duplicates in Test DataFrame {name}: {df.duplicated().sum()}")

Duplicates in Train DataFrame df_Customers.csv: 0
Duplicates in Train DataFrame df_Products.csv: 61865
Duplicates in Train DataFrame df_Payments.csv: 0
Duplicates in Train DataFrame df_Orders.csv: 0
Duplicates in Train DataFrame df_OrderItems.csv: 0
Duplicates in Test DataFrame df_Customers.csv: 0
Duplicates in Test DataFrame df_Products.csv: 21675
Duplicates in Test DataFrame df_Payments.csv: 0
Duplicates in Test DataFrame df_Orders.csv: 0
Duplicates in Test DataFrame df_OrderItems.csv: 0


In [34]:
# Step 4: Visualize Distributions and Relationships
import matplotlib.pyplot as plt
import seaborn as sns

# Example: Visualize distributions of numerical columns
# One histogram (with a KDE overlay) per float64/int64 column of each train table.
for name, df in train_data.items():
    numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns
    for col in numerical_columns:
        # Use an explicit Figure/Axes pair rather than the implicit pyplot
        # state, so each plot is drawn on a known target.
        fig, ax = plt.subplots(figsize=(8, 4))
        sns.histplot(df[col], kde=True, ax=ax)
        ax.set_title(f"Distribution of {col} in Train DataFrame {name}")
        plt.show()
        # Release the figure: this loop creates one per column per table and
        # they would otherwise accumulate in memory on non-inline backends.
        plt.close(fig)

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
# Step 5: Prepare Data for Modeling
from sklearn.model_selection import train_test_split

# Example: Split train data into training and validation sets
TARGET_COLUMN = 'target_column'  # Replace 'target_column' with the actual target column name

# Keep one (X_train, X_val, y_train, y_val) tuple per table; the bare
# X_train/X_val/y_train/y_val names are rebound on every iteration and
# therefore only ever hold the split of the last matching table.
splits = {}
for name, df in train_data.items():
    if TARGET_COLUMN not in df.columns:
        continue
    X = df.drop(columns=[TARGET_COLUMN])
    y = df[TARGET_COLUMN]
    # Fixed random_state keeps the 80/20 split reproducible between runs.
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    splits[name] = (X_train, X_val, y_train, y_val)
    print(f"Split Train DataFrame {name} into training and validation sets.")

if not splits:
    # Make the no-op visible instead of finishing silently.
    print(f"No train DataFrame has a '{TARGET_COLUMN}' column; nothing was split.")