In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# **0. Data Loading**

We will load the preprocessed dataset that was prepared earlier in the Jupyter Notebook.

In [8]:
# Location of the cleaned seller-side CSV, relative to the working directory
folder_path = os.path.join("dataset", "02_clean_dataset", "seller_side_clean.csv")

# Columns that pandas is asked to parse as timestamps while reading
timestamp_columns = [
    'order_purchase_timestamp',
    'order_approved_at',
    'shipping_limit_date',
    'order_delivered_carrier_date',
    'order_delivered_customer_date',
    'order_estimated_delivery_date',
]

# The "*_new" columns carry the dehashed ids; expose them under their plain names
dehashed_id_names = {
    "seller_id_new": "seller_id",
    "order_id_new": "order_id",
    "product_id_new": "product_id",
}

seller_df = (
    pd.read_csv(folder_path, parse_dates=timestamp_columns)
    .rename(columns=dehashed_id_names)
)

print("✅ Data loaded successfully!")

seller_df.head()

✅ Data loaded successfully!


Unnamed: 0,seller_id,order_id,order_item_id,product_id,product_category_name_english,price,order_purchase_timestamp,order_approved_at,shipping_limit_date,order_status,...,review_score,net_profit,reviewed,late_days_to_carrier,is_late_to_carrier,seller_zip_code_prefix,seller_city,seller_state,geolocation_lat,geolocation_lng
0,seller_513,order_id_85267,1,product_id_25865,cool_stuff,58.9,2017-09-13 08:59:02,2017-09-13 09:45:35.000,2017-09-19 09:45:35,delivered,...,5.0,10.602,True,0.0,False,27277,volta redonda,SP,-22.496953,-44.127492
1,seller_471,order_id_71853,1,product_id_27230,pet_shop,239.9,2017-04-26 10:53:06,2017-04-26 11:05:13.000,2017-05-03 11:05:13,delivered,...,4.0,43.182,True,1.0,True,3471,sao paulo,SP,-23.565096,-46.518565
2,seller_1824,order_id_6298,1,product_id_22624,furniture_decor,199.0,2018-01-14 14:33:31,2018-01-14 14:48:30.000,2018-01-18 14:48:30,delivered,...,5.0,35.82,True,-2.0,False,37564,borda da mata,MG,-22.262584,-46.171124
3,seller_2023,order_id_22550,1,product_id_15403,perfumery,12.99,2018-08-08 10:00:35,2018-08-08 10:10:18.000,2018-08-15 10:10:18,delivered,...,4.0,2.3382,True,-5.0,False,14403,franca,SP,-20.553624,-47.387359
4,seller_1597,order_id_5247,1,product_id_8862,garden_tools,199.9,2017-02-04 13:57:51,2017-02-04 14:10:13.000,2017-02-13 13:57:51,delivered,...,5.0,35.982,True,3.0,True,87900,loanda,PR,-22.929384,-53.135873


In [9]:
# Per-column overview: row count, missing values (absolute and relative),
# number of distinct values, and dtype
n_rows = seller_df.shape[0]
null_counts = seller_df.isna().sum()

pd.DataFrame({
    'total': n_rows,
    'null_count': null_counts,
    'null_pct': [f"{pct:.2f}%" for pct in (null_counts / n_rows * 100)],
    'unique_count': seller_df.nunique(),
    'dtype': seller_df.dtypes,
}).rename_axis('column')

Unnamed: 0_level_0,total,null_count,null_pct,unique_count,dtype
column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
seller_id,112644,0,0.00%,3095,object
order_id,112644,0,0.00%,98663,object
order_item_id,112644,0,0.00%,21,int64
product_id,112644,0,0.00%,32949,object
product_category_name_english,112644,0,0.00%,72,object
price,112644,0,0.00%,5968,float64
order_purchase_timestamp,112644,0,0.00%,98109,datetime64[ns]
order_approved_at,112644,0,0.00%,92960,object
shipping_limit_date,112644,0,0.00%,93799,datetime64[ns]
order_status,112644,0,0.00%,7,object


# **1. Introduction**

In our previous analysis of Olist's business ecosystem, we examined data from business, buyer, and seller perspectives. **Olist generates revenue primarily through sales commissions**, with an interesting customer dynamic: while buyers are numerous, they typically make only one purchase, whereas sellers—though fewer in number—generate higher per-entity profit through both sales commissions and platform subscription fees.

However, like many service-based businesses, sellers—whom we consider our primary customers in this context—may stop using Olist's services over time, either voluntarily or involuntarily. This phenomenon, known as *churn*, is often linked to their satisfaction with the platform.

Building and maintaining strong customer relationships is critical for business success. A key component of customer relationship management (CRM) is proactive churn management, which involves identifying customers at risk of leaving and implementing targeted retention strategies. **In this notebook, we will prepare a churn dataset for subsequent analysis and modeling in the next notebook**.


## 1.1 **Understanding Churn**

Churn is typically defined as the event where a customer stops using a company's product or service. For businesses that profit mainly from subscriptions, churn often refers to customers failing to renew their subscriptions after a certain period. However, the definition of churn varies depending on the business model and revenue structure.

For example:
- Churn can be defined as *recurring*, where a customer temporarily stops using the service but later returns (e.g., re-subscribing after a break).
- It can also be defined as the permanent end of a customer's engagement, such as the termination of a contract.
- In businesses where revenue is primarily derived from non-subscription sources (e.g., sales commissions, purchases), churn may be defined as a customer ceasing activity on the platform after a period of time.

### 1.1.1 **Defining Churn for Olist**

In Olist's case, revenue is primarily driven by sales commissions. Therefore, we define churn as a seller ceasing sales-related activity on the platform for an extended period. Based on our available data and information:
- Only order purchases provide information on sales activity
- When a seller delivers an order to Olist's logistics partner, the order status is automatically updated without the seller's intervention, and delivery statuses are tracked in real time.
- The key indicator of sales activity is a seller approving an order.

We assume churn can be recurring in this context. For instance:
- Sellers may become inactive simply due to a lack of buyer purchases on Olist stores on other platforms.
- Inactivity could also stem from temporary supply chain or logistics challenges that prevent sellers from fulfilling orders.

To quantify churn, we define it as a seller being inactive (no sales activity) in the quarter following their last active quarter. This approach aligns with research by Gattermann-Itschert and Thonemann (2022) and Mirkovic et al. (2022), who used a quarterly period to define churn in related, recurring B2B settings.

**Prolonged sales inactivity often signals underlying issues—whether related to platform satisfaction, business challenges, or competitive pressures**. We aim to create a framework that will help us identify at-risk sellers (especially high-value sellers who are hard to replace and contribute significantly to Olist's revenue). By detecting these at-risk sellers early, we can deploy personalized outreach programs, offer support/marketing resources, or implement retention incentives to address seller concerns and maintain these valuable business relationships.


### 1.1.2 **Why Quarterly Churn?**

We use quarterly churn instead of monthly churn for several reasons:
1. **Seasonality in E-commerce**: Monthly data is more susceptible to seasonal fluctuations, which are common in e-commerce. Quarterly data tends to smooth out these variations, providing a more stable measure of churn for prediction (Gold, *Fighting Churn with Data*).
2. **Alignment with Fiscal Periods**: Quarterly periods align with fiscal year reporting, making it easier to integrate churn analysis with broader business metrics and planning. 

### Sources:
- Gattermann-Itschert, T., & Thonemann, U. (2022). Proactive customer retention management in a non-contractual B2B setting based on churn prediction with random forests.  
- Mirkovic, M., Vučković, T., Stefanović, D., Anderla, A., & Gracanin, D. (2022). Customer churn prediction in B2B non-contractual business settings using invoice data. *Applied Sciences, 12*(10), 5001. https://doi.org/10.3390/app12105001.  
- Gold, C. S. (n.d.). *Fighting Churn with Data*.  


# 2. **Churn Label Preparation**


In this section, we'll begin preparing our dataset. Our first step is to convert the `order_approved_at` timestamps into `datetime64` format, create a quarterly period version of it, and initialize our churn dataset. Next, we will create  a snapshot of each active seller's behavior for each quarter, where each row represents an active seller in each quarter:

In [10]:
# order_approved_at has object dtype after loading, so cast it to datetime64 first
seller_df['order_approved_at'] = seller_df['order_approved_at'].astype('datetime64[ns]')

# Bucket every approval timestamp into a quarter.
# 'Q-DEC' means the year ends in December (Q1 ends in March, Q4 ends in December)
seller_df['order_approved_at_yearquarter'] = pd.PeriodIndex(seller_df['order_approved_at'], freq='Q-DEC')

# One row per (quarter, seller) pair in which the seller approved at least one order
churn_df = (
    seller_df[['order_approved_at_yearquarter', 'seller_id']]
    .drop_duplicates()
    .dropna()
    .rename(columns={"order_approved_at_yearquarter": "seller_active_quarter"})
)
churn_df

Unnamed: 0,seller_active_quarter,seller_id
0,2017Q3,seller_513
1,2017Q2,seller_471
2,2018Q1,seller_1824
3,2018Q3,seller_2023
4,2017Q1,seller_1597
...,...,...
112431,2017Q2,seller_687
112547,2017Q2,seller_1446
112572,2018Q2,seller_2082
112595,2017Q1,seller_1526


Our dataset contains records of **active sellers for each quarter**. Each seller-quarter pair is unique. 

Note that a single seller may appear multiple times in the dataset if they were active across different quarters. This temporal dimension is crucial for our churn analysis.

In [11]:
# All quarters in which seller_2682 was active
churn_df[churn_df['seller_id'] == 'seller_2682']

Unnamed: 0,seller_active_quarter,seller_id
63,2017Q1,seller_2682
23645,2017Q2,seller_2682
35805,2017Q3,seller_2682


In [12]:
# All quarters in which seller_20 was active
churn_df[churn_df['seller_id'] == 'seller_20']

Unnamed: 0,seller_active_quarter,seller_id
4076,2017Q4,seller_20
15467,2018Q1,seller_20


To identify whether a seller churned in the subsequent quarter, we need to check if they remained active in the subsequent quarter. 

Let's walk through this process with a concrete example. For `seller_20` during `2017Q4`, we want to determine if they continued to be active in `2018Q1` (the next quarter):

In [13]:
# Collect every (seller_id, quarter-as-string) pair in which a seller was active.
# Storing the pairs in a set makes the membership checks below cheap.
quarter_labels = churn_df['seller_active_quarter'].astype('str')
seller_quarter_set = {
    (seller, quarter) for seller, quarter in zip(churn_df['seller_id'], quarter_labels)
}

# True would mean seller_20 had no activity in 2018Q1
("seller_20", '2018Q1') not in seller_quarter_set

False

The result is `False`, which means `seller_20` remained active in `2018Q1` after being active in `2017Q4`. Therefore, this seller did **not churn** in the next quarter.

Let's examine another case: Was `seller_20` (who was active in `2018Q1`) still active in the following quarter (`2018Q2`)?


In [14]:
# True means seller_20 had no activity in 2018Q2
next_quarter_key = ("seller_20", '2018Q2')
next_quarter_key not in seller_quarter_set

True

The result is `True`, indicating that `seller_20` was **not active** in `2018Q2` despite being active in `2018Q1`. This means `seller_20` **churned** after `2018Q1`.

Using this logic, we can now systematically label the churn status for all sellers across all quarters:


In [15]:
# Identify the most recent quarter in our dataset
latest_quarter = churn_df['seller_active_quarter'].max()

# Calculate the observation quarter (next quarter after active quarter)
churn_df['observation_quarter'] = churn_df['seller_active_quarter'].apply(lambda x: x + 1)

# Create a set of (seller_id, seller_active_quarter) for quick lookup
# This represents all seller-quarter combinations where the seller was active
churn_df_set = set(zip(churn_df['seller_id'], churn_df['seller_active_quarter']))

# Determine churn status for each seller-quarter combination:
# - NaN: Cannot determine churn (observation quarter is beyond our data/incomplete)
# - True: Seller churned (not active in the observation quarter)
# - False: Seller did not churn (remained active in the observation quarter)
churn_df['is_churn'] = churn_df.apply(
    lambda row: np.nan if row['observation_quarter'] >= latest_quarter 
                else not (row['seller_id'], row['observation_quarter']) in churn_df_set, 
                axis=1
)

# Select relevant columns for our analysis
churn_df = churn_df[['seller_active_quarter', 'observation_quarter', 'seller_id', 'is_churn']]

churn_df

Unnamed: 0,seller_active_quarter,observation_quarter,seller_id,is_churn
0,2017Q3,2017Q4,seller_513,False
1,2017Q2,2017Q3,seller_471,False
2,2018Q1,2018Q2,seller_1824,False
3,2018Q3,2018Q4,seller_2023,
4,2017Q1,2017Q2,seller_1597,False
...,...,...,...,...
112431,2017Q2,2017Q3,seller_687,True
112547,2017Q2,2017Q3,seller_1446,False
112572,2018Q2,2018Q3,seller_2082,
112595,2017Q1,2017Q2,seller_1526,False


We can verify our automated labeling works by examining the churn status for `seller_20`, which should match our manual calculations from earlier:


In [16]:
# Labelled rows for seller_20, to compare with the manual checks
churn_df[churn_df['seller_id'] == 'seller_20']

Unnamed: 0,seller_active_quarter,observation_quarter,seller_id,is_churn
4076,2017Q4,2018Q1,seller_20,False
15467,2018Q1,2018Q2,seller_20,True


# 3. **Churn Feature Preparation**

After creating our churn labels for sellers by quarter, we now need to generate meaningful metrics for each observation. These metrics will help us understand seller behavior patterns that might correlate with churn.

First, we'll convert timestamp data in `seller_df` into quarterly periods. This standardization will allow us to aggregate metrics by quarter and join them later with our `churn_df` dataset.


In [17]:
# Derive a quarterly period column from each timestamp column, for aggregation and merging.
# 'Q-DEC' means the year ends in December (Q1 ends in March, Q4 ends in December)
quarter_sources = {
    'order_purchase_yearquarter': 'order_purchase_timestamp',
    'shipping_limit_yearquarter': 'shipping_limit_date',
    'order_delivered_carrier_yearquarter': 'order_delivered_carrier_date',
    'order_delivered_customer_yearquarter': 'order_delivered_customer_date',
    'order_estimated_delivery_yearquarter': 'order_estimated_delivery_date',
}
for quarter_col, timestamp_col in quarter_sources.items():
    seller_df[quarter_col] = pd.PeriodIndex(seller_df[timestamp_col], freq='Q-DEC')


Next, we'll also calculate performance metrics that could influence seller churn. 

We'll start with delivery performance by identifying late deliveries and order processing time in `seller_df`:
- **`is_late_to_customer`**: A binary indicator (`True`/`False`) to identify if an order was delivered to the customer after the estimated delivery date.

- **`time_to_approved`**: The time taken (in days) for an order to be approved after being placed.


In [18]:
# Flag rows whose actual customer delivery happened after the estimated delivery date
seller_df['is_late_to_customer'] = seller_df['order_delivered_customer_date'].gt(
    seller_df['order_estimated_delivery_date']
)
seller_df['is_late_to_customer']

0         False
1         False
2         False
3         False
4         False
          ...  
112639    False
112640    False
112641    False
112642    False
112643    False
Name: is_late_to_customer, Length: 112644, dtype: bool

In [19]:
# Elapsed time between the purchase and its approval, as a timedelta
seller_df['time_to_approved'] = seller_df['order_approved_at'].sub(seller_df['order_purchase_timestamp'])
seller_df['time_to_approved']

0        0 days 00:46:33
1        0 days 00:12:07
2        0 days 00:14:59
3        0 days 00:09:43
4        0 days 00:12:22
               ...      
112639   1 days 14:13:55
112640   2 days 18:05:02
112641   1 days 00:06:29
112642   0 days 01:01:33
112643   0 days 00:09:55
Name: time_to_approved, Length: 112644, dtype: timedelta64[ns]

## 3.1 **Metric Calculation Demonstration**

### **3.1.1 Example 1: Number of Orders per Quarter**
Suppose we want to calculate the number of orders a seller receives in a given quarter, regardless of whether the orders were approved.

We'll compute this by grouping `seller_df` by quarter and `seller_id`, calculating the number of unique `order_id`. However, we want to group by the purchase quarter rather than approve quarter.

This is because an order might be approved in a different quarter than when it was placed (e.g., an order placed at the end of Q1 might be approved in Q2).

In [20]:
# Aggregate the number of unique orders per seller per quarter
orders = seller_df.groupby(["order_purchase_yearquarter", "seller_id"]).agg(n_orders=("order_id", "nunique")).reset_index()
orders.head()

Unnamed: 0,order_purchase_yearquarter,seller_id,n_orders
0,2016Q4,seller_1023,5
1,2016Q4,seller_1030,1
2,2016Q4,seller_115,2
3,2016Q4,seller_1162,1
4,2016Q4,seller_1191,2


Now we can integrate this order volume metric into our churn dataset by matching on `seller_id` and the relevant quarter:


In [21]:
# Merge the orders metric into the churn dataset
# Left join ensures all sellers in churn_df are kept, even if they have no orders
churn_with_orders = churn_df.merge(orders, 
                                   left_on=["seller_active_quarter", "seller_id"], 
                                   right_on=["order_purchase_yearquarter", "seller_id"], 
                                   how="left")\
                                    .drop(columns="order_purchase_yearquarter")

# Fill missing order counts with 0 (for sellers with no orders in a quarter) and convert to integer
churn_with_orders["n_orders"] = churn_with_orders["n_orders"].fillna(0).astype('int64')

churn_with_orders

Unnamed: 0,seller_active_quarter,observation_quarter,seller_id,is_churn,n_orders
0,2017Q3,2017Q4,seller_513,False,65
1,2017Q2,2017Q3,seller_471,False,12
2,2018Q1,2018Q2,seller_1824,False,5
3,2018Q3,2018Q4,seller_2023,,3
4,2017Q1,2017Q2,seller_1597,False,5
...,...,...,...,...,...
8569,2017Q2,2017Q3,seller_687,True,1
8570,2017Q2,2017Q3,seller_1446,False,1
8571,2018Q2,2018Q3,seller_2082,,1
8572,2017Q1,2017Q2,seller_1526,False,1


### **3.1.2 Example 2: Sales per Quarter**
Now, suppose we want to calculate another important metric: total sales, i.e., the revenue a seller generates (before commission cut) in a given quarter. 

Unlike the number of orders, sales should be based on the quarter in which the order was **delivered to the customer** (`order_delivered_customer_yearquarter`). 

This is because revenue is typically recognized only after delivery, not at the time of purchase or approval.

In [22]:
# Aggregate total sales (sum of prices) per seller per quarter of delivery
sales = seller_df.groupby(["order_delivered_customer_yearquarter", "seller_id"]).agg(sales=("price", "sum")).reset_index()
sales.head()

Unnamed: 0,order_delivered_customer_yearquarter,seller_id,sales
0,2016Q4,seller_1023,1344.2
1,2016Q4,seller_1030,107.99
2,2016Q4,seller_115,199.39
3,2016Q4,seller_1162,19.9
4,2016Q4,seller_1191,133.8


Similar to the previous case, we merge the total sales metric into our churn dataset using a **left join**. 

This ensures all sellers in the churn dataset are retained, with sales set to 0 for quarters where no sales were recorded.

In [23]:
# Merge the sales metric into the churn dataset
churn_with_sales = churn_with_orders.merge(sales, 
                                           left_on=["seller_active_quarter", "seller_id"], 
                                           right_on=["order_delivered_customer_yearquarter", "seller_id"], 
                                           how="left")\
                                            .drop(columns="order_delivered_customer_yearquarter")

# Fill missing sales with 0 (for sellers with no sales in a quarter)
churn_with_sales["sales"] = churn_with_sales["sales"].fillna(0)
churn_with_sales

Unnamed: 0,seller_active_quarter,observation_quarter,seller_id,is_churn,n_orders,sales
0,2017Q3,2017Q4,seller_513,False,65,4666.70
1,2017Q2,2017Q3,seller_471,False,12,1083.86
2,2018Q1,2018Q2,seller_1824,False,5,1006.00
3,2018Q3,2018Q4,seller_2023,,3,114.77
4,2017Q1,2017Q2,seller_1597,False,5,342.80
...,...,...,...,...,...,...
8569,2017Q2,2017Q3,seller_687,True,1,25.99
8570,2017Q2,2017Q3,seller_1446,False,1,695.90
8571,2018Q2,2018Q3,seller_2082,,1,95.00
8572,2017Q1,2017Q2,seller_1526,False,1,0.00


### **3.1.3 Example 3: Number of Late Orders per Quarter**
Now, suppose we want to calculate another metric: the number of orders delivered late to customers in a given quarter. 

This metric is slightly more complex than the previous ones because a single order may include multiple items but has only one `order_delivered_customer_date` per seller, which may lead to duplicate `is_late_to_customer` entries in `seller_df`. 

Below is an example of an order with multiple items to show duplicate `is_late_to_customer` entries.

In [24]:
# Show the item rows of one multi-item order; the late flag repeats on every item
item_columns = ["seller_id", "order_id", "order_item_id", "product_id", "price",
                "order_delivered_customer_date", "is_late_to_customer"]
seller_df.loc[seller_df["order_id"] == "order_id_58847", item_columns]

Unnamed: 0,seller_id,order_id,order_item_id,product_id,price,order_delivered_customer_date,is_late_to_customer
106771,seller_2215,order_id_58847,1,product_id_8946,34.0,2017-08-10 15:53:17,False
106772,seller_2215,order_id_58847,2,product_id_23717,59.9,2017-08-10 15:53:17,False
106773,seller_2215,order_id_58847,3,product_id_16346,69.9,2017-08-10 15:53:17,False


In [25]:
# Compare the late-flag total before and after dropping repeated rows.
# The counts are computed outside the f-strings: nesting double quotes inside a
# double-quoted f-string only parses on Python 3.12+.
late_flag_rows = seller_df[["order_id", "seller_id", "product_id", "is_late_to_customer"]]
n_late_with_duplicates = late_flag_rows["is_late_to_customer"].sum()
# NOTE(review): product_id is part of the dedupe key, so an order containing several
# different products is still counted once per product — confirm this is intended.
n_late_without_duplicates = late_flag_rows.drop_duplicates()["is_late_to_customer"].sum()

print(f"Number of orders late to customer with duplicates: {n_late_with_duplicates}")
print(f"Number of orders late to customer without duplicates: {n_late_without_duplicates}")

Number of orders late to customer with duplicates: 8712
Number of orders late to customer without duplicates: 7995


If an order with multiple items is late, the `is_late_to_customer` flag will be repeated for each item, which could inflate the count of late orders if not handled properly.
To address this, we must:

1. Remove duplicates at the order level before counting late orders.

2. Use the quarter of delivery (`order_delivered_customer_yearquarter`) for aggregation, as lateness to customer is determined by the delivery date.

And so, we can adjust our grouping method in this way:

In [26]:
# Aggregate the number of unique late orders per seller per quarter of delivery
# Use drop_duplicates() to avoid counting the same late order multiple times due to multiple items
orders_late_to_customer = seller_df.groupby(["order_delivered_customer_yearquarter", "seller_id"])[["order_id", "is_late_to_customer"]]\
    .apply(lambda x: x.drop_duplicates()["is_late_to_customer"]\
    .sum())\
    .reset_index(name="n_orders_late_to_customer")

orders_late_to_customer.head()

Unnamed: 0,order_delivered_customer_yearquarter,seller_id,n_orders_late_to_customer
0,2016Q4,seller_1023,0
1,2016Q4,seller_1030,0
2,2016Q4,seller_115,0
3,2016Q4,seller_1162,0
4,2016Q4,seller_1191,0


As with the previous metrics, we merge the number of late orders into our churn dataset using a **left join**. This ensures all sellers in the churn dataset are retained, with the number of late orders set to 0 for quarters where no late orders were recorded.


In [27]:
# Merge the late orders metric into the churn dataset
# Left join ensures all sellers in churn_with_sales are kept, even if they have no late orders
churn_with_orders_late_to_customers = churn_with_sales.merge(orders_late_to_customer,
                                                             left_on=["seller_active_quarter", "seller_id"], 
                                                             right_on=["order_delivered_customer_yearquarter", "seller_id"], 
                                                             how="left")\
                                                             .drop(columns="order_delivered_customer_yearquarter")

# Fill missing late order counts with 0 (for sellers with no late orders in a quarter) and convert to integer
churn_with_orders_late_to_customers["n_orders_late_to_customer"] = churn_with_orders_late_to_customers["n_orders_late_to_customer"].fillna(0).astype('int64')
churn_with_orders_late_to_customers

Unnamed: 0,seller_active_quarter,observation_quarter,seller_id,is_churn,n_orders,sales,n_orders_late_to_customer
0,2017Q3,2017Q4,seller_513,False,65,4666.70,2
1,2017Q2,2017Q3,seller_471,False,12,1083.86,2
2,2018Q1,2018Q2,seller_1824,False,5,1006.00,0
3,2018Q3,2018Q4,seller_2023,,3,114.77,0
4,2017Q1,2017Q2,seller_1597,False,5,342.80,0
...,...,...,...,...,...,...,...
8569,2017Q2,2017Q3,seller_687,True,1,25.99,0
8570,2017Q2,2017Q3,seller_1446,False,1,695.90,1
8571,2018Q2,2018Q3,seller_2082,,1,95.00,0
8572,2017Q1,2017Q2,seller_1526,False,1,0.00,0


## **3.2 Generalizing Metric Calculations with Functions**
To make our workflow more efficient and reusable, we can generalize the steps for calculating and merging quarterly metrics into the churn dataset. Below, we define two functions:

1. **`merge_quarterly_metrics`**: A general function to calculate and merge metrics like the number of orders or total sales, where aggregation does not require handling duplicates (e.g., summing prices or counting unique orders).

2. **`merge_late_orders`**: A specialized function to calculate and merge metrics like the number of late orders, where duplicates at the order level must be removed before aggregation.

These functions will streamline the process of adding new metrics in the future and reduce repetitive code.

In [28]:
def merge_quarterly_metrics(churn_df, seller_df, date_col, metrics, how='left',
                            no_fill_outputs=('median_approve_time',),
                            keep_dtype_outputs=('sales', 'median_approve_time', 'median_review_score')):
    """
    Helper function to aggregate metrics per (quarter, seller) and merge them into churn_df

    Parameters:
    -----------
    churn_df: DataFrame with churn data; must contain 'seller_active_quarter' and 'seller_id'
    seller_df: DataFrame with seller order data; must contain 'seller_id' and
               the column f"{date_col}_yearquarter"
    date_col: Prefix of the quarter column to group by (f"{date_col}_yearquarter")
    metrics: List of tuples (output_name, column_name, aggregation_function)
    how: Merge method, default 'left'
    no_fill_outputs: Output names whose missing values are left as-is instead of
                     being replaced with 0
    keep_dtype_outputs: Output names that are zero-filled but NOT cast to int64

    Returns:
    --------
    New DataFrame: churn_df with one added column per metric. The caller's
    churn_df is not modified.

    Notes:
    ------
    Zero-filling only applies when the aggregation function is one of the strings
    'nunique', 'count', 'sum' or 'median'. Any such output that is not listed in
    keep_dtype_outputs is cast to int64, which truncates fractional values.
    """

    # Name of the quarter column in seller_df that corresponds to date_col
    yearquarter_col = f"{date_col}_yearquarter"

    # Named-aggregation spec: output column -> (source column, aggregation)
    agg_spec = {output_name: (col_name, agg_func) for output_name, col_name, agg_func in metrics}

    # Aggregate per (quarter, seller)
    agg_df = seller_df.groupby([yearquarter_col, 'seller_id']).agg(**agg_spec).reset_index()

    # Match the aggregated quarter against the seller's active quarter, then drop
    # the now redundant quarter column that came from agg_df
    merged = churn_df.merge(agg_df,
                            left_on=['seller_active_quarter', 'seller_id'],
                            right_on=[yearquarter_col, 'seller_id'],
                            how=how).drop(columns=[yearquarter_col])

    # Fill NaN values with 0 for count-like metrics and convert to int where appropriate
    zero_fillable = ('nunique', 'count', 'sum', 'median')
    for output_name, _, agg_func in metrics:
        if agg_func not in zero_fillable or output_name in no_fill_outputs:
            continue
        merged.loc[merged[output_name].isna(), output_name] = 0
        if output_name not in keep_dtype_outputs:
            merged[output_name] = merged[output_name].astype('int64')

    return merged

In [29]:
# Handle late orders (special case) with a helper function
def merge_late_orders(churn_df, seller_df, date_col, late_flag_col, output_name):
    """
    Helper function to count late orders per (quarter, seller) and merge the count into churn_df

    Parameters:
    -----------
    churn_df: DataFrame with churn data; must contain 'seller_active_quarter' and 'seller_id'
    seller_df: DataFrame with seller order data; must contain 'seller_id', 'order_id',
               late_flag_col and the column f"{date_col}_yearquarter"
    date_col: Prefix of the quarter column to group by (f"{date_col}_yearquarter")
    late_flag_col: Name of the True/False column marking a row as late
    output_name: Name of the count column added to the result

    Returns:
    --------
    New DataFrame: churn_df with output_name added as int64 (0 where no match was found).
    The caller's churn_df is not modified.
    """
    yearquarter_col = f"{date_col}_yearquarter"

    # An order with several items repeats its late flag on every item row.
    # Dropping repeated (quarter, seller, order, flag) rows first keeps the late count
    # from exceeding the number of orders. Deduplicating once up front gives the same
    # rows as deduplicating inside every group, without a Python call per group.
    unique_order_flags = seller_df[[yearquarter_col, 'seller_id', 'order_id', late_flag_col]].drop_duplicates()

    # Sum the late flags per (quarter, seller)
    late_orders = (unique_order_flags
                   .groupby([yearquarter_col, 'seller_id'])[late_flag_col]
                   .sum()
                   .reset_index(name=output_name))

    # Match the aggregated quarter against the seller's active quarter, then drop
    # the now redundant quarter column that came from late_orders
    merged = churn_df.merge(late_orders,
                            left_on=['seller_active_quarter', 'seller_id'],
                            right_on=[yearquarter_col, 'seller_id'],
                            how='left').drop(columns=[yearquarter_col])

    # Seller-quarters without any matching row count as 0 late orders
    merged.loc[merged[output_name].isna(), output_name] = 0
    merged[output_name] = merged[output_name].astype('int64')

    return merged

## **3.3 Feature Metric Calculation**
### **3.3.1 Seller Order Metrics**

Now that we have defined the helper functions `merge_quarterly_metrics` and `merge_late_orders`, we can use them to calculate a variety of quarterly metrics for our churn dataset.

Below is a list of some of the metrics we will calculate, along with their definitions:

- **`n_orders`**: Number of unique orders placed in a quarter.

- **`n_approved_orders`**: Number of unique orders approved in a quarter.

- **`n_delivered_carrier`**: Number of unique orders delivered to carriers in a quarter.

- **`n_orders_late_to_carrier`**: Number of unique orders delivered late to carriers in a quarter.

- **`n_delivered_customers`**: Number of unique orders delivered to customers in a quarter.

- **`sales`**: Total revenue generated by a seller in a quarter (before commission cuts).

- **`n_orders_late_to_customer`**: Number of unique orders delivered late to customers in a quarter.

- **`median_approve_time`**: Median time taken by a seller to approve a purchase in a quarter.

- **`median_review_score`**: Median review score received by a seller in a quarter (on a scale of 0 to 5).

These metrics will be added to the churn dataset using the appropriate function, depending on whether late order metrics need to be handled.


In [30]:
# Orders per quarter
churn_df = merge_quarterly_metrics(
    churn_df, 
    seller_df, 
    'order_purchase', 
    [('n_orders', 'order_id', 'nunique')]
)

# Approved orders per quarter
churn_df = merge_quarterly_metrics(
    churn_df, 
    seller_df, 
    'order_approved_at', 
    [('n_approved_orders', 'order_id', 'nunique')]
)

# Delivered to carrier per quarter
churn_df = merge_quarterly_metrics(
    churn_df, 
    seller_df, 
    'order_delivered_carrier', 
    [('n_delivered_carrier', 'order_id', 'nunique')]
)

# Late orders to carrier
churn_df = merge_late_orders(
    churn_df, 
    seller_df, 
    'order_delivered_carrier', 
    'is_late_to_carrier', 
    'n_orders_late_to_carrier'
)

# Sales and deliveries to customers per quarter
churn_df = merge_quarterly_metrics(
    churn_df, 
    seller_df, 
    'order_delivered_customer', 
    [
        ('n_delivered_customers', 'order_id', 'nunique'),
        ('sales', 'price', 'sum')
    ]
)

# Late orders to customer
churn_df = merge_late_orders(
    churn_df, 
    seller_df, 
    'order_delivered_customer', 
    'is_late_to_customer', 
    'n_orders_late_to_customer'
)

#  Median approval time
#  Note: This case actually has to be handled like n_orders_late_to_customer because of duplicates. Unfortunately, we defined merge_late_orders is only for orders.
#  To avoid confusion from using merge_late_orders for approval time,
#  we will use merge_quarterly_metrics, dropping the duplicates before aggregating and merging.
churn_df = merge_quarterly_metrics(
    churn_df, 
    seller_df.drop_duplicates(['seller_id', 'order_id', 'order_approved_at_yearquarter']),   # Possible to count duplicate entries for some orders that have multiple items.
    'order_approved_at',                                                                     # Note that all items in the same order have the same approval time
    [('median_approve_time', 'time_to_approved', 'median')]
)

# Median rating
churn_df = merge_quarterly_metrics(
    churn_df, 
    seller_df, 
    'order_delivered_customer', 
    [('median_review_score', 'review_score', 'median')] 
)


churn_df

Unnamed: 0,seller_active_quarter,observation_quarter,seller_id,is_churn,n_orders,n_approved_orders,n_delivered_carrier,n_orders_late_to_carrier,n_delivered_customers,sales,n_orders_late_to_customer,median_approve_time,median_review_score
0,2017Q3,2017Q4,seller_513,False,65,65,60,4,53,4666.70,2,0 days 00:17:02,5.0
1,2017Q2,2017Q3,seller_471,False,12,12,12,5,12,1083.86,2,0 days 00:15:06.500000,3.0
2,2018Q1,2018Q2,seller_1824,False,5,5,5,0,4,1006.00,0,0 days 00:20:49,4.5
3,2018Q3,2018Q4,seller_2023,,3,3,3,0,4,114.77,0,0 days 00:10:28,4.5
4,2017Q1,2017Q2,seller_1597,False,5,5,5,3,4,342.80,0,0 days 00:12:22,3.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8569,2017Q2,2017Q3,seller_687,True,1,1,1,0,1,25.99,0,0 days 00:10:16,4.0
8570,2017Q2,2017Q3,seller_1446,False,1,1,1,0,2,695.90,1,0 days 01:07:42,3.0
8571,2018Q2,2018Q3,seller_2082,,1,1,1,0,1,95.00,0,0 days 17:32:34,4.0
8572,2017Q1,2017Q2,seller_1526,False,1,1,0,0,0,0.00,0,0 days 00:15:28,0.0


### **3.3.2 Seller Activity Metrics**

In addition to order-related metrics, we need to calculate metrics that capture a seller's activity patterns over time. These metrics provide insights into a seller’s engagement and longevity, which are critical for understanding churn behavior. Below are the activity metrics we will calculate:

- **`tenure_months`**: The total number of months a seller has been active up to a given quarter. This measures the seller’s overall experience on the platform.

- **`n_months_active_quarter`**: The number of months a seller was active within a specific quarter (values range from 1 to 3). This indicates the seller’s activity intensity during the quarter.

- **`last_month_active_quarter`**: The last month within a quarter in which a seller was active (values range from 1 to 3, where 1 is the first month of the quarter, 2 is the second, and 3 is the third). This helps measure recency of activity within the quarter.

To calculate these metrics efficiently, we will again define two functions:

1. **`calculate_quarterly_tenure`**: Computes the tenure of each seller at the end of each quarter.

2. **`calculate_quarterly_months_active`**: Computes the number of active months and the last active month within each quarter.


In [31]:
# Cumulative count of active months per seller, reported per quarter
def calculate_quarterly_tenure(seller_df, churn_df):
    """
    Attach each seller's cumulative number of active months to churn_df

    A month counts as active when the seller has at least one row in
    seller_df whose order_approved_at falls in that month. For every
    seller-quarter the value is the running count of such months reached
    by the seller's last active month of that quarter.

    Parameters:
    -----------
    seller_df: DataFrame with seller activity data; must contain seller_id,
               order_approved_at (datetime) and order_approved_at_yearquarter
    churn_df: DataFrame with seller churn data by quarter; must contain
              seller_id and seller_active_quarter

    Returns:
    --------
    Copy of churn_df with a tenure_months column added (missing where a
    seller-quarter has no matching activity in seller_df)
    """
    month_key = 'order_approved_at_yearmonth'
    quarter_key = 'order_approved_at_yearquarter'

    # One row per (seller, calendar month), ordered chronologically per seller;
    # 'YYYY-MM' strings sort in calendar order
    active_months = seller_df[['seller_id', 'order_approved_at', quarter_key]].copy()
    active_months[month_key] = active_months['order_approved_at'].dt.strftime('%Y-%m')
    active_months = (
        active_months[['seller_id', month_key, quarter_key]]
        .drop_duplicates()
        .sort_values(['seller_id', month_key])
    )

    # Running 1-based index of the seller's active months
    active_months['tenure_months'] = active_months.groupby('seller_id').cumcount() + 1

    # A quarter can hold up to three active months; keep the highest running
    # count, i.e. the value reached at the seller's last active month in it
    tenure_by_quarter = (
        active_months
        .groupby([quarter_key, 'seller_id'])['tenure_months']
        .max()
        .reset_index()
        .rename(columns={quarter_key: 'seller_active_quarter'})
    )

    # Left merge keeps every churn_df row, matched or not
    return churn_df.merge(tenure_by_quarter, on=['seller_active_quarter', 'seller_id'], how='left')

In [32]:
def calculate_quarterly_months_active(seller_df, churn_df):
    """
    Calculate the last active month and the number of active months for a seller in each quarter

    Parameters:
    -----------
    seller_df: DataFrame with seller activity data; must contain seller_id,
               order_approved_at (datetime) and order_approved_at_yearquarter
    churn_df: DataFrame with seller churn data by quarter; must contain
              seller_id and seller_active_quarter

    Returns:
    --------
    Updated churn_df with last_month_active_quarter and n_months_active_quarter added
    (both are missing for seller-quarters with no matching activity in seller_df)
    """

    # Keep only the columns needed to derive per-quarter activity
    activity = seller_df[['seller_id', 'order_approved_at', 'order_approved_at_yearquarter']].copy()

    # Position of the approval month inside its calendar quarter:
    # first month -> 1, second month -> 2, third month -> 3
    activity['month_in_quarter'] = (activity['order_approved_at'].dt.month - 1) % 3 + 1

    # Per seller and quarter: the latest active month and how many distinct months were active.
    # The string aggregations use pandas' built-in group-wise implementations.
    quarterly = (
        activity
        .groupby(['seller_id', 'order_approved_at_yearquarter'])
        .agg(last_month_active_quarter=('month_in_quarter', 'max'),
             n_months_active_quarter=('month_in_quarter', 'nunique'))
        .reset_index()
        .rename(columns={'order_approved_at_yearquarter': 'seller_active_quarter'})
    )

    # Left merge keeps every churn_df row, matched or not
    return churn_df.merge(quarterly, on=['seller_active_quarter', 'seller_id'], how='left')

In [33]:
# Columns produced by the two activity-metric helpers called below
activity_metric_cols = ['tenure_months', 'last_month_active_quarter', 'n_months_active_quarter']

# Drop copies left by a previous run of this cell; otherwise the merges would
# create suffixed duplicates (e.g. tenure_months_x / tenure_months_y) on re-run.
# On a first run none of these columns exist and this is a no-op.
churn_df = churn_df.drop(columns=activity_metric_cols, errors='ignore')

# Calculate tenure
churn_df = calculate_quarterly_tenure(seller_df, churn_df)

# Calculate last month active and number of months active in a quarter
churn_df = calculate_quarterly_months_active(seller_df, churn_df)

churn_df

Unnamed: 0,seller_active_quarter,observation_quarter,seller_id,is_churn,n_orders,n_approved_orders,n_delivered_carrier,n_orders_late_to_carrier,n_delivered_customers,sales,n_orders_late_to_customer,median_approve_time,median_review_score,tenure_months,last_month_active_quarter,n_months_active_quarter
0,2017Q3,2017Q4,seller_513,False,65,65,60,4,53,4666.70,2,0 days 00:17:02,5.0,7,3,3
1,2017Q2,2017Q3,seller_471,False,12,12,12,5,12,1083.86,2,0 days 00:15:06.500000,3.0,3,3,3
2,2018Q1,2018Q2,seller_1824,False,5,5,5,0,4,1006.00,0,0 days 00:20:49,4.5,5,3,2
3,2018Q3,2018Q4,seller_2023,,3,3,3,0,4,114.77,0,0 days 00:10:28,4.5,6,2,1
4,2017Q1,2017Q2,seller_1597,False,5,5,5,3,4,342.80,0,0 days 00:12:22,3.5,4,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8569,2017Q2,2017Q3,seller_687,True,1,1,1,0,1,25.99,0,0 days 00:10:16,4.0,2,2,1
8570,2017Q2,2017Q3,seller_1446,False,1,1,1,0,2,695.90,1,0 days 01:07:42,3.0,3,3,1
8571,2018Q2,2018Q3,seller_2082,,1,1,1,0,1,95.00,0,0 days 17:32:34,4.0,3,2,1
8572,2017Q1,2017Q2,seller_1526,False,1,1,0,0,0,0.00,0,0 days 00:15:28,0.0,1,3,1


### **3.3.3 Finalizing Churn Dataset**

In [34]:
# Add geographic data.
# Keep exactly one (city, state) row per seller so the left merge can never
# multiply churn rows if a seller appears with more than one location.
seller_geo = seller_df[['seller_id', 'seller_city', 'seller_state']].drop_duplicates(subset='seller_id')

# Remove geo columns left by a previous run of this cell before merging again,
# otherwise pandas would add seller_city_x / seller_city_y duplicates.
churn_df = (
    churn_df
    .drop(columns=['seller_city', 'seller_state'], errors='ignore')
    .merge(seller_geo, on='seller_id', how='left')
)

# Convert median_approve_time to minutes.
# Only convert while the column is still a timedelta, so re-running the cell
# does not fail on an already-converted float column.
if pd.api.types.is_timedelta64_dtype(churn_df['median_approve_time']):
    churn_df['median_approve_time'] = churn_df['median_approve_time'].dt.total_seconds() / 60.0

`observation_quarter` and `seller_active_quarter` columns serve similar purposes, but we'll keep `seller_active_quarter` for specific reasons. **`seller_active_quarter` will be used as an index for out-of-time validation during modeling**. 
Out-of-time validation ensures that our model is tested on data from a different time period than the training data, simulating real-world prediction scenarios (see the modeling notebook for more details).

While using `seller_active_quarter` as an index, we'll also extract the `year` and `quarter` as numerical features from `seller_active_quarter` to capture seasonality patterns and long-term trends that might influence seller churn behavior. 

Our finalized churn dataset therefore looks as follows:

In [35]:
# Drop the label-window column, plus any year/quarter columns left by a previous
# run of this cell; errors='ignore' makes this a no-op for columns that are absent,
# so the cell can be re-executed without KeyError (drop) or ValueError (insert).
churn_df = churn_df.drop(columns=['observation_quarter', 'year', 'quarter'], errors='ignore')

# Numeric calendar features derived from the activity quarter.
# Both are inserted at position 1; inserting `quarter` first and `year` second
# yields the column order: seller_active_quarter, year, quarter, ...
churn_df.insert(loc=1, column='quarter', value=churn_df['seller_active_quarter'].dt.quarter)
churn_df.insert(loc=1, column='year', value=churn_df['seller_active_quarter'].dt.year)
churn_df

Unnamed: 0,seller_active_quarter,year,quarter,seller_id,is_churn,n_orders,n_approved_orders,n_delivered_carrier,n_orders_late_to_carrier,n_delivered_customers,sales,n_orders_late_to_customer,median_approve_time,median_review_score,tenure_months,last_month_active_quarter,n_months_active_quarter,seller_city,seller_state
0,2017Q3,2017,3,seller_513,False,65,65,60,4,53,4666.70,2,17.033333,5.0,7,3,3,volta redonda,SP
1,2017Q2,2017,2,seller_471,False,12,12,12,5,12,1083.86,2,15.108333,3.0,3,3,3,sao paulo,SP
2,2018Q1,2018,1,seller_1824,False,5,5,5,0,4,1006.00,0,20.816667,4.5,5,3,2,borda da mata,MG
3,2018Q3,2018,3,seller_2023,,3,3,3,0,4,114.77,0,10.466667,4.5,6,2,1,franca,SP
4,2017Q1,2017,1,seller_1597,False,5,5,5,3,4,342.80,0,12.366667,3.5,4,3,3,loanda,PR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8569,2017Q2,2017,2,seller_687,True,1,1,1,0,1,25.99,0,10.266667,4.0,2,2,1,sao paulo,SP
8570,2017Q2,2017,2,seller_1446,False,1,1,1,0,2,695.90,1,67.700000,3.0,3,3,1,cachoeirinha,RS
8571,2018Q2,2018,2,seller_2082,,1,1,1,0,1,95.00,0,1052.566667,4.0,3,2,1,porto alegre,RS
8572,2017Q1,2017,1,seller_1526,False,1,1,0,0,0,0.00,0,15.466667,0.0,1,3,1,sao paulo,SP


In [36]:
# Per-column data-quality summary of the finalized churn dataset
churn_n_rows = churn_df.shape[0]
churn_null_counts = churn_df.isna().sum()   # computed once, reused for the percentage

pd.DataFrame({
    'total': churn_n_rows,
    'null_count': churn_null_counts,
    'null_pct':   [f"{pct:.2f}%" for pct in (churn_null_counts / churn_n_rows * 100)],
    'unique_count': churn_df.nunique(),
    'dtype': churn_df.dtypes
}).rename_axis('column')

Unnamed: 0_level_0,total,null_count,null_pct,unique_count,dtype
column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
seller_active_quarter,8574,0,0.00%,8,period[Q-DEC]
year,8574,0,0.00%,3,int64
quarter,8574,0,0.00%,4,int64
seller_id,8574,0,0.00%,3095,object
is_churn,8574,3306,38.56%,2,object
n_orders,8574,0,0.00%,189,int64
n_approved_orders,8574,0,0.00%,192,int64
n_delivered_carrier,8574,0,0.00%,187,int64
n_orders_late_to_carrier,8574,0,0.00%,42,int64
n_delivered_customers,8574,0,0.00%,186,int64


# **4. Exporting Churn Dataset for Training, Testing, and Prediction**

In [37]:
# Output column -> (source column, aggregation) for the per-quarter overview:
# how many seller rows each quarter has and the mean of is_churn among them
quarterly_overview_spec = {
    'n_active_sellers': ('seller_id', pd.Series.count),
    'churn_rate': ('is_churn', 'mean'),
}

churn_df.groupby("seller_active_quarter").agg(**quarterly_overview_spec)

Unnamed: 0_level_0,n_active_sellers,churn_rate
seller_active_quarter,Unnamed: 1_level_1,Unnamed: 2_level_1
2016Q4,144,0.263889
2017Q1,657,0.25723
2017Q2,810,0.244444
2017Q3,1010,0.216832
2017Q4,1266,0.243286
2018Q1,1381,0.241854
2018Q2,1678,
2018Q3,1628,


**Note**: The data for `2018Q3` is incomplete because the dataset ends in `2018-09`, so `2018Q3` cannot be used for prediction. For the same reason, proper churn labels cannot be derived for `2018Q2`; however, since the `2018Q2` activity data itself is complete, we can use it for prediction. Our analysis therefore uses the period from `2017Q1` to `2018Q1` for training and testing the model, which is then used to predict churn for `2018Q2`. The 2016 data is excluded because very few sellers were active: Olist only started operations in 2015 and had not yet onboarded many sellers.

In [38]:
# Distinct sellers with at least one approved order per calendar month
# (bins labelled by month start); show only the nine most recent months
(
    seller_df
    .set_index("order_approved_at")
    .resample("MS")
    .agg(n_active_sellers=("seller_id", "nunique"))
    .sort_index()
    .tail(9)
)

Unnamed: 0_level_0,n_active_sellers
order_approved_at,Unnamed: 1_level_1
2018-01-01,969
2018-02-01,948
2018-03-01,1001
2018-04-01,1111
2018-05-01,1126
2018-06-01,1177
2018-07-01,1251
2018-08-01,1286
2018-09-01,1


In [39]:
# Training/testing window: seller-quarters from 2017Q1 through 2018Q1 (inclusive)
active_quarter = churn_df['seller_active_quarter']
in_train_test_window = (
    (active_quarter >= pd.Period('2017Q1', freq='Q'))
    & (active_quarter <= pd.Period('2018Q1', freq='Q'))
)

churn_df_train_test = churn_df[in_train_test_window]
churn_df_train_test

Unnamed: 0,seller_active_quarter,year,quarter,seller_id,is_churn,n_orders,n_approved_orders,n_delivered_carrier,n_orders_late_to_carrier,n_delivered_customers,sales,n_orders_late_to_customer,median_approve_time,median_review_score,tenure_months,last_month_active_quarter,n_months_active_quarter,seller_city,seller_state
0,2017Q3,2017,3,seller_513,False,65,65,60,4,53,4666.70,2,17.033333,5.0,7,3,3,volta redonda,SP
1,2017Q2,2017,2,seller_471,False,12,12,12,5,12,1083.86,2,15.108333,3.0,3,3,3,sao paulo,SP
2,2018Q1,2018,1,seller_1824,False,5,5,5,0,4,1006.00,0,20.816667,4.5,5,3,2,borda da mata,MG
4,2017Q1,2017,1,seller_1597,False,5,5,5,3,4,342.80,0,12.366667,3.5,4,3,3,loanda,PR
5,2017Q2,2017,2,seller_659,False,8,8,8,0,6,245.40,0,1241.908333,5.0,2,3,2,ribeirao preto,SP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8562,2017Q3,2017,3,seller_2972,False,1,1,1,0,1,469.00,0,14.783333,5.0,4,2,1,joinville,SC
8566,2017Q3,2017,3,seller_192,False,1,1,1,0,0,0.00,0,16.983333,0.0,1,3,1,mirandopolis,SP
8569,2017Q2,2017,2,seller_687,True,1,1,1,0,1,25.99,0,10.266667,4.0,2,2,1,sao paulo,SP
8570,2017Q2,2017,2,seller_1446,False,1,1,1,0,2,695.90,1,67.700000,3.0,3,3,1,cachoeirinha,RS


In [40]:
# Per-column data-quality summary of the training/testing subset
train_test_n_rows = churn_df_train_test.shape[0]
train_test_null_counts = churn_df_train_test.isna().sum()   # computed once, reused below

pd.DataFrame({
    'total': train_test_n_rows,
    'null_count': train_test_null_counts,
    'null_pct':   [f"{pct:.2f}%" for pct in (train_test_null_counts / train_test_n_rows * 100)],
    'unique_count': churn_df_train_test.nunique(),
    'dtype': churn_df_train_test.dtypes
}).rename_axis('column')

Unnamed: 0_level_0,total,null_count,null_pct,unique_count,dtype
column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
seller_active_quarter,5124,0,0.00%,5,period[Q-DEC]
year,5124,0,0.00%,2,int64
quarter,5124,0,0.00%,4,int64
seller_id,5124,0,0.00%,2158,object
is_churn,5124,0,0.00%,2,object
n_orders,5124,0,0.00%,167,int64
n_approved_orders,5124,0,0.00%,169,int64
n_delivered_carrier,5124,0,0.00%,163,int64
n_orders_late_to_carrier,5124,0,0.00%,37,int64
n_delivered_customers,5124,0,0.00%,157,int64


In [41]:
# Output location: <current working directory>/dataset/03_ML_dataset
# (the notebook must be run from the project root for this to resolve as intended)
base_dir = os.path.abspath(os.getcwd())
folder_path = os.path.join(base_dir, "dataset", "03_ML_dataset")

# Make sure the output folder exists; a no-op when it is already there
os.makedirs(folder_path, exist_ok=True)

# Write the labelled training/testing dataset without the DataFrame index
churn_df_train_test_path = os.path.join(folder_path, "churn_dataset.csv")
churn_df_train_test.to_csv(churn_df_train_test_path, index=False)

print("✅ CSV files saved successfully in:", folder_path)

✅ CSV files saved successfully in: /Users/glen/Documents/04. Final Project/00. Final/dataset/03_ML_dataset


In [42]:
# Prediction set: seller-quarters active in 2018Q2 only.
# The original filter used `<= '2018Q2' and >= '2018Q2'`, which is just equality;
# compare against an explicit quarterly Period instead.
churn_df_predict = churn_df[churn_df['seller_active_quarter'] == pd.Period('2018Q2', freq='Q')]
churn_df_predict

Unnamed: 0,seller_active_quarter,year,quarter,seller_id,is_churn,n_orders,n_approved_orders,n_delivered_carrier,n_orders_late_to_carrier,n_delivered_customers,sales,n_orders_late_to_customer,median_approve_time,median_review_score,tenure_months,last_month_active_quarter,n_months_active_quarter,seller_city,seller_state
15,2018Q2,2018,2,seller_1521,,21,21,20,2,21,3545.25,0,32.583333,4.0,8,3,3,guarulhos,SP
16,2018Q2,2018,2,seller_998,,57,57,59,2,58,13033.60,6,21.766667,5.0,16,3,3,campinas,SP
27,2018Q2,2018,2,seller_1781,,143,143,142,0,140,5994.07,14,42.566667,5.0,18,3,3,atibaia,SP
30,2018Q2,2018,2,seller_1548,,58,59,62,0,78,11719.90,5,31.900000,5.0,19,3,3,curitiba,PR
47,2018Q2,2018,2,seller_1042,,70,71,71,0,69,7009.00,2,29.350000,5.0,7,3,3,belo horizonte,MG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8528,2018Q2,2018,2,seller_2792,,1,1,1,0,1,189.00,0,29.366667,5.0,1,3,1,pedreira,SP
8548,2018Q2,2018,2,seller_3074,,1,1,1,1,1,94.68,0,14.866667,5.0,2,1,1,goiania,GO
8568,2018Q2,2018,2,seller_2507,,1,1,1,0,1,55.80,0,22.883333,5.0,1,1,1,sao paulo,SP
8571,2018Q2,2018,2,seller_2082,,1,1,1,0,1,95.00,0,1052.566667,4.0,3,2,1,porto alegre,RS


In [43]:
# Output location: <current working directory>/dataset/03_ML_dataset
# (the notebook must be run from the project root for this to resolve as intended)
base_dir = os.path.abspath(os.getcwd())
folder_path = os.path.join(base_dir, "dataset", "03_ML_dataset")

# Make sure the output folder exists; a no-op when it is already there
os.makedirs(folder_path, exist_ok=True)

# Write the unlabelled prediction dataset without the DataFrame index
churn_df_predict_path = os.path.join(folder_path, "churn_dataset_predict.csv")
churn_df_predict.to_csv(churn_df_predict_path, index=False)