In [1]:
import pandas as pd
import numpy as np

In [2]:
# Small sample input: each row records the state (num_tramite) that a
# process (id_exp) had on a given date (fecha).
data = {
    'id_exp': [1, 1, 2, 2],
    'num_tramite': [1, 2, 1, 3],
    'fecha': ['2023-01-01', '2023-01-03', '2023-01-02', '2023-01-04']
}

# Build the frame in a single chain:
#   1. parse the 'fecha' strings into datetimes,
#   2. order the rows by process and then by date,
#   3. promote (id_exp, fecha) to a MultiIndex.
df = (
    pd.DataFrame(data)
    .assign(fecha=lambda frame: pd.to_datetime(frame['fecha']))
    .sort_values(by=['id_exp', 'fecha'])
    .set_index(['id_exp', 'fecha'])
)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,num_tramite
id_exp,fecha,Unnamed: 2_level_1
1,2023-01-01,1
1,2023-01-03,2
2,2023-01-02,1
2,2023-01-04,3


In [3]:

# Build the complete (id_exp, fecha) grid used for reindexing.

# Daily timeline from the earliest to the latest observed 'fecha'
# (pd.date_range defaults to daily frequency when only start/end are given).
fechas = df.index.get_level_values('fecha')
timeline = pd.date_range(fechas.min(), fechas.max())

# Use the id_exp values that actually occur in the rows instead of
# df.index.levels[0]: MultiIndex.levels is not pruned when rows are removed,
# so after any filtering of df it could still list ids with no data, and
# those would show up in the grid as all-NaN phantom processes.
exp_ids = df.index.get_level_values('id_exp').unique()

# Create a MultiIndex with all combinations of id_exp and timeline
multi_index = pd.MultiIndex.from_product(
    [exp_ids, timeline], names=['id_exp', 'fecha']
)
multi_index

MultiIndex([(1, '2023-01-01'),
            (1, '2023-01-02'),
            (1, '2023-01-03'),
            (1, '2023-01-04'),
            (2, '2023-01-01'),
            (2, '2023-01-02'),
            (2, '2023-01-03'),
            (2, '2023-01-04')],
           names=['id_exp', 'fecha'])

In [4]:

# Expand df onto the full (id_exp, fecha) grid; combinations that had no
# row in df come back as NaN.
df_reindexed = df.reindex(index=multi_index)
df_reindexed

# 1. Aligning Data to a New Index

# reindex allows you to change the index or column labels of a DataFrame or Series to match a new index or set of labels. 
# This is particularly useful for aligning data across datasets.

# import pandas as pd

# data = {'A': [1, 2, 3]}
# df = pd.DataFrame(data, index=['a', 'b', 'c'])

# # Aligning to a new index
# new_index = ['a', 'b', 'd', 'e']
# df_reindexed = df.reindex(new_index)

# print(df_reindexed)

#      A
# a  1.0
# b  2.0
# d  NaN
# e  NaN


# 2. Interpolating Data Over a Time Index
# In time-series data, reindex is often used to fill gaps or align data to a regular frequency.
# import pandas as pd
# date_range = pd.date_range('2023-01-01', '2023-01-05')
# data = {'value': [10, 15, 20]}
# df = pd.DataFrame(data, index=pd.to_datetime(['2023-01-01', '2023-01-03', '2023-01-05']))

# # Reindex to a daily frequency
# df_reindexed = df.reindex(date_range).interpolate()

# print(df_reindexed)

#            value
# 2023-01-01   10.0
# 2023-01-02   12.5
# 2023-01-03   15.0
# 2023-01-04   17.5
# 2023-01-05   20.0

Unnamed: 0_level_0,Unnamed: 1_level_0,num_tramite
id_exp,fecha,Unnamed: 2_level_1
1,2023-01-01,1.0
1,2023-01-02,
1,2023-01-03,2.0
1,2023-01-04,
2,2023-01-01,
2,2023-01-02,1.0
2,2023-01-03,
2,2023-01-04,3.0


In [6]:

# Carry each process's last known num_tramite forward over the NaN gaps.
# Grouping on index level 0 (id_exp) keeps the fill inside a single process,
# so a value never spills over from one id_exp into the next. Dates that
# come before a process's first record have nothing to carry and stay NaN.
tramite_by_exp = df_reindexed['num_tramite'].groupby(level=0)
df_reindexed['num_tramite'] = tramite_by_exp.ffill()
df_reindexed


Unnamed: 0_level_0,Unnamed: 1_level_0,num_tramite
id_exp,fecha,Unnamed: 2_level_1
1,2023-01-01,1.0
1,2023-01-02,1.0
1,2023-01-03,2.0
1,2023-01-04,2.0
2,2023-01-01,
2,2023-01-02,1.0
2,2023-01-03,1.0
2,2023-01-04,3.0


In [8]:
# Move id_exp and fecha out of the index and back into ordinary columns,
# which makes the grouping step that follows simpler to express.
df_reset = df_reindexed.reset_index(drop=False)
df_reset

Unnamed: 0,id_exp,fecha,num_tramite
0,1,2023-01-01,1.0
1,1,2023-01-02,1.0
2,1,2023-01-03,2.0
3,1,2023-01-04,2.0
4,2,2023-01-01,
5,2,2023-01-02,1.0
6,2,2023-01-03,1.0
7,2,2023-01-04,3.0


In [9]:
# Per date, count how many processes sit in each state (num_tramite).
# Rows whose num_tramite is still NaN are left out by groupby's default
# handling of missing keys.
state_counts = df_reset.groupby(['fecha', 'num_tramite']).size()

# Pivot the states into columns; (date, state) pairs with no process get 0.
result = state_counts.unstack(level='num_tramite', fill_value=0)

print(result)

num_tramite  1.0  2.0  3.0
fecha                     
2023-01-01     1    0    0
2023-01-02     2    0    0
2023-01-03     1    1    0
2023-01-04     0    1    1
