In [11]:
import pandas as pd


You are a Data Analyst on the **Walmart.com** Insights team investigating customer return patterns. The team aims to develop a predictive approach to understanding customer return behaviors across different time periods. Your goal is to leverage transaction data to create a comprehensive view of customer return likelihood.

In [12]:
# Load the data
customer_returns = pd.read_csv('customer_returns.csv')

# Display the dataset
print(customer_returns)


   order_id  order_date customer_id  return_flag  order_amount
0   ORD0001  2024-07-05     CUST001         True         120.5
1   ORD0002  2024-07-10     CUST002        False          75.0
2   ORD0003  2024-08-15     CUST001         True          90.0
3   ORD0004  2024/09/01     CUST003        False          45.0
4   ORD0005  2024-10-20     CUST004         True         200.0
5   ORD0006  2024-11-11     CUST002         True           NaN
6   ORD0007  2024-11-15     CUST005        False          60.0
7   ORD0008  2024-12-05     CUST006         True         150.0
8   ORD0009  2024-12-25     CUST007        False          85.0
9   ORD0010  2025-01-10     CUST001         True         130.0
10  ORD0011  2025-01-15     CUST008        False          50.0
11  ORD0012  2025-02-10     CUST009         True         110.0
12  ORD0013  2025-02-14     CUST010        False         100.0
13  ORD0014  2025-03-03     CUST005         True          77.5
14  ORD0015         NaN     CUST002        False       

In [13]:
# Ensure order_date is in datetime format
customer_returns['order_date'] = pd.to_datetime(customer_returns['order_date'], errors='coerce')

# Display the dataset
print(customer_returns)


   order_id order_date customer_id  return_flag  order_amount
0   ORD0001 2024-07-05     CUST001         True         120.5
1   ORD0002 2024-07-10     CUST002        False          75.0
2   ORD0003 2024-08-15     CUST001         True          90.0
3   ORD0004        NaT     CUST003        False          45.0
4   ORD0005 2024-10-20     CUST004         True         200.0
5   ORD0006 2024-11-11     CUST002         True           NaN
6   ORD0007 2024-11-15     CUST005        False          60.0
7   ORD0008 2024-12-05     CUST006         True         150.0
8   ORD0009 2024-12-25     CUST007        False          85.0
9   ORD0010 2025-01-10     CUST001         True         130.0
10  ORD0011 2025-01-15     CUST008        False          50.0
11  ORD0012 2025-02-10     CUST009         True         110.0
12  ORD0013 2025-02-14     CUST010        False         100.0
13  ORD0014 2025-03-03     CUST005         True          77.5
14  ORD0015        NaT     CUST002        False          65.0
15  ORD0

### Question 1 of 2

Identify and list all unique customer IDs who have made returns between July 1st 2024 and June 30th 2025. This will help us understand the base set of customers involved in returns during the specified period.

In [14]:
# Filter returns in the target window (inclusive)
start_date = pd.Timestamp('2024-07-01')
end_date = pd.Timestamp('2025-06-30')

# Ensure datetime already converted earlier
mask = (
    customer_returns['return_flag'].astype(bool) &
    customer_returns['order_date'].between(start_date, end_date, inclusive='both')
)

# Get unique customer IDs with returns in the specified window
unique_return_customers = (
    customer_returns.loc[mask, 'customer_id']
    .dropna()
    .drop_duplicates()
    .sort_values()
    .tolist()
)

# Display the results
print(f"Number of unique customers with returns in window: {len(unique_return_customers)}")
print("Customer IDs:")
print(unique_return_customers)


Number of unique customers with returns in window: 8
Customer IDs:
['CUST001', 'CUST002', 'CUST003', 'CUST004', 'CUST005', 'CUST006', 'CUST007', 'CUST009']


### Question 2 of 3

Convert the `order_date` column to a datetime format and create a MultiIndex with `customer_id` and `order_date`. Then, calculate the total number of returns per customer for each month. This will provide insights into monthly return patterns for each customer.

In [15]:
# `order_date` was already converted to datetime
# Create a MultiIndex (`customer_id`, `order_date`) for the full dataset (keeps original variable name)
multi_index_df = (
    customer_returns
    .set_index(['customer_id', 'order_date'])
    .sort_index()
)

print("MultiIndex created with levels:", multi_index_df.index.names)

# Filter to returned orders only
returned_orders = multi_index_df[multi_index_df['return_flag'].astype(bool)]

# Aggregate: count returns per customer per month (month start as the period label)
monthly_returns = (
    returned_orders
    .groupby([returned_orders.index.get_level_values('customer_id'),
              pd.Grouper(level='order_date', freq='MS')])
    .size()
    .rename('monthly_returns')
    .to_frame()
)

print("\nMonthly returns (long format):")
print(monthly_returns)

# Optional wide (pivot) view: customers as rows, months as columns
monthly_returns_wide = (
    monthly_returns
    .reset_index()
    .pivot(index='customer_id', columns='order_date', values='monthly_returns')
    .fillna(0)
    .astype(int)
    .sort_index()
)

print("\nMonthly returns (wide pivot):")
print(monthly_returns_wide.head())


MultiIndex created with levels: ['customer_id', 'order_date']

Monthly returns (long format):
                        monthly_returns
customer_id order_date                 
CUST001     2024-07-01                2
            2024-08-01                1
            2025-01-01                2
            2025-03-01                1
            2025-06-01                1
CUST002     2024-11-01                1
CUST003     2024-09-01                1
            2025-03-01                2
            2025-04-01                1
            2025-06-01                1
CUST004     2024-10-01                1
CUST005     2024-08-01                1
            2024-11-01                1
            2025-03-01                1
            2025-05-01                1
            2025-06-01                1
CUST006     2024-12-01                1
            2025-04-01                1
CUST007     2024-08-01                1
            2024-10-01                1
            2025-01-01    