In [1]:
import pandas as pd
import numpy as np

In [2]:
def boot_sample(your_data, unit_id, time, random_seed=None):
    """
    Draws units with replacement from all units in your_data.
    Returns a new bootstrap panel dataset with unique IDs for these units.

    Parameters:
    - your_data (DataFrame): Your panel data with specified columns.
    - unit_id (str): Column name for unit IDs.
    - time (str): Column name for time.
    - random_seed (int, optional): Seed for random number generation.
    
    Returns:
    - DataFrame: New data with bootstrapped samples and unique unit IDs.
    """
    if random_seed is not None:
        np.random.seed(random_seed)
    
    if unit_id not in your_data.columns or time not in your_data.columns:
        raise ValueError("Specified unit_id or time column not found in the data.")
    
    # Get the unique units
    IDs = pd.DataFrame({'ID': np.unique(your_data[unit_id])})
    
    # The number of unique units
    N = IDs.shape[0]
    
    # Sample with replacement
    index = np.random.randint(0, N, size=N)
    bs_ID = IDs.iloc[index]
    
    # Add a column with the bootstrap sample number
    bs_ID['bs'] = np.arange(N)
    
    # Full bootstrap sample
    bs_data = your_data.merge(bs_ID, how='inner', left_on=unit_id, right_on='ID')
    
    # Make 'bs' the new unit_id
    bs_data[unit_id] = bs_data['bs']
    
    # Drop unnecessary columns and sort by new unit_id and time
    bs_data = bs_data.drop(['bs', 'ID'], axis=1).sort_values([unit_id, time])
    
    return bs_data

In [3]:
# Assuming your panel data is in the DataFrame 'your_data'
# And you want to draw 100 treated units
your_data = pd.DataFrame({
    'unit_id': [1, 1, 2, 2, 3, 3, 4, 4], 
    'time': [1, 2, 1, 2, 1, 2, 1, 2],
    'treated': [1, 1, 0, 0, 1, 1, 0, 0],
    'outcome': [1, 2, 3, 4, 5, 6, 7, 8],
})

In [4]:
your_data

Unnamed: 0,unit_id,time,treated,outcome
0,1,1,1,1
1,1,2,1,2
2,2,1,0,3
3,2,2,0,4
4,3,1,1,5
5,3,2,1,6
6,4,1,0,7
7,4,2,0,8


In [5]:
new_data = boot_sample(your_data, 'unit_id', 'time')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bs_ID['bs'] = np.arange(N)


In [6]:
new_data

Unnamed: 0,unit_id,time,treated,outcome
2,0,1,1,5
4,0,2,1,6
0,1,1,1,1
1,1,2,1,2
3,2,1,1,5
5,2,2,1,6
6,3,1,0,7
7,3,2,0,8
