I want you to help me transform a dataframe to show process state evolution through time. The context is the following: the dataframe has 3 columns: 1. id_exp: the process id; 2. num_tramite: the state id; 3. fecha: the date of the state change. The final dataframe should contain, for each date (and there must be values for every day between the first and last date of the dataframe), the number of processes in each state, as the final objective is to get an area plot showing the evolution of state changes over time. Each process can be in only one state at a time. So a process starts in one state, stays some time in that state, then changes to another state, until it stops in a final state. Please analyze the problem step by step, consider the best way to achieve it efficiently (computationally), and explain each step of the calculation.

In [2]:
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Load the procedure records from the parquet export into a DataFrame.
df_tramitesX = pd.read_parquet('./data/tramita/tramites_autoconsumo.parquet')

In [4]:
# Display the loaded frame for a quick visual inspection.
df_tramitesX

Unnamed: 0,id_exp,dni,nif,fecha_alta_exp,fecha_registro_exp,codine_provincia,codine_municipio,codine,municipio,provincia,...,siaci,consejeria_proc,org_instructor_proc,es_telematica,desc_tramite,fecha_tramite,num_tramite,cod_procedimiento,orden_tramite,es_inicial
45875,1127895,1787973,,2022-02-09 09:07:17,2021-12-23,13,064,13064,Poblete,Ciudad Real,...,SLPT,Consejeria de desarrollo sostenible,Direccion general de transicion energetica,True,Registro de solicitud,2021-12-23 00:00:00,0,884.0,0,False
45876,1127895,1787973,,2022-02-09 09:07:17,2021-12-23,13,064,13064,Poblete,Ciudad Real,...,SLPT,Consejeria de desarrollo sostenible,Direccion general de transicion energetica,True,1-1-Presentación de la solicitud,2022-02-09 09:07:17,1,884.0,1,True
45877,1127895,1787973,,2022-02-09 09:07:17,2021-12-23,13,064,13064,Poblete,Ciudad Real,...,SLPT,Consejeria de desarrollo sostenible,Direccion general de transicion energetica,True,10-1-Requerimiento documentación,2022-10-14 09:44:23,10,884.0,2,False
45878,1127895,1787973,,2022-02-09 09:07:17,2021-12-23,13,064,13064,Poblete,Ciudad Real,...,SLPT,Consejeria de desarrollo sostenible,Direccion general de transicion energetica,True,30-1-Evaluación de Solicitud,2022-10-27 08:04:51,30,884.0,3,False
45879,1127895,1787973,,2022-02-09 09:07:17,2021-12-23,13,064,13064,Poblete,Ciudad Real,...,SLPT,Consejeria de desarrollo sostenible,Direccion general de transicion energetica,True,42-1-Propuesta Resolución Estimatoria de Conce...,2022-10-27 08:14:55,42,884.0,4,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9124961,1664841,1696284,,2023-11-03 09:13:13,2023-11-03,13,005,13005,Alcázar de San Juan,Ciudad Real,...,SLPT,Consejeria de desarrollo sostenible,Direccion general de transicion energetica,True,1-1-Presentación de la solicitud,2023-11-03 09:13:13,1,884.0,1,True
9125278,1665715,2096674,,2023-11-06 08:44:05,2023-11-03,13,053,13053,Manzanares,Ciudad Real,...,SLPT,Consejeria de desarrollo sostenible,Direccion general de transicion energetica,True,Registro de solicitud,2023-11-03 00:00:00,0,884.0,0,False
9125279,1665715,2096674,,2023-11-06 08:44:05,2023-11-03,13,053,13053,Manzanares,Ciudad Real,...,SLPT,Consejeria de desarrollo sostenible,Direccion general de transicion energetica,True,1-1-Presentación de la solicitud,2023-11-06 08:44:05,1,884.0,1,True
9125280,1665723,2096678,,2023-11-06 08:44:57,2023-11-03,19,024,19024,Alovera,Guadalajara,...,SLPT,Consejeria de desarrollo sostenible,Direccion general de transicion energetica,True,Registro de solicitud,2023-11-03 00:00:00,0,884.0,0,False


In [None]:
# Keep only the columns the state-evolution analysis reads: process id,
# state id and the timestamp of the state change.
columns_estados = ['id_exp', 'num_tramite', 'fecha_tramite']
df_tramitesX_estados = df_tramitesX.loc[:, columns_estados]

In [None]:
# Derive `fecha` (day of the state change) from the full timestamp and drop
# the original column.
# dt.normalize() truncates to midnight but keeps the column as datetime64,
# whereas .dt.date would yield an object column of Python date objects that
# makes the later sort / groupby / shift steps noticeably slower.
df_tramitesX_estados = (
    df_tramitesX_estados
    .assign(fecha=df_tramitesX_estados['fecha_tramite'].dt.normalize())
    .drop(columns='fecha_tramite')
)

In [None]:
# Preview the first rows of the reduced frame (id_exp, num_tramite, fecha).
df_tramitesX_estados.head()

# Número de procesos por estado/día con date_range

In [None]:
# Short working name for the reduced frame. This binds the same object (no
# copy); the next step rebinds `df` to a new, sorted frame.
df = df_tramitesX_estados

In [None]:
# Step 1: order the records by process and, within each process, by date, so
# that consecutive rows of one id_exp are consecutive state changes.
df = df.sort_values(["id_exp", "fecha"])

In [None]:
# Step 2: the end of a state is the date of the next state change of the same
# process. shift(-1) pulls the following row's `fecha` up by one position
# inside each id_exp group; the last row of each process gets a missing value.
df["end_date"] = df.groupby("id_exp")["fecha"].shift(periods=-1)
df.head()

## Qué trámites son finales y cuántos de cada tipo

In [None]:
# Which states are the last one of a process: rows without an end_date have
# no later state change. Count those rows per state id.
tramites_finales_num = (
    df.loc[df['end_date'].isna()]
    .groupby('num_tramite')
    .size()
)
tramites_finales_num

In [None]:
# Check the type of the groupby(...).size() result before reshaping it.
type(tramites_finales_num)

In [None]:
# Turn the per-state counts into a two-column table: the state id (taken from
# the index) and the number of processes that end in that state.
df_lista_tramitesfinales = tramites_finales_num.reset_index(name='count')
df_lista_tramitesfinales.head()

In [None]:
# Load the catalogue of state definitions and keep the rows whose procedure
# code is 884.
def_tramites = pd.read_parquet('./data/tramita/def_proc_tramite_unicos.parquet')
def_tramites_autoconsumo = def_tramites.loc[def_tramites['cod_proc'] == 884]
def_tramites_autoconsumo

In [None]:
# Attach the state name (`denom`) to the final-state counts. The inner join
# keeps only states present in both tables.
df_lectura = df_lista_tramitesfinales.merge(
    def_tramites_autoconsumo[['num_tramite', 'denom']],
    on='num_tramite',
    how='inner',
)
df_lectura

## quitar duplicados

In [None]:
# Show every row that shares (process, date, state) with at least one other
# row; keep=False marks all members of each duplicated group.
duplicates = df.loc[
    df.duplicated(['id_exp', 'fecha', 'num_tramite'], keep=False)
]
print(duplicates)

In [None]:
# Remove repeated (process, date, state) records, keeping the last occurrence
# of each group.
df = df.loc[
    ~df.duplicated(subset=['id_exp', 'fecha', 'num_tramite'], keep='last')
]

## expandir fechas

In [None]:
# Step 3: Expand Dates
# Every state record of every process becomes one output row per calendar day
# the process spent in that state:
#   * from `fecha` up to the day before `end_date` when a later state exists;
#   * from `fecha` up to the last date in the data (`max_date`) otherwise.
# The expansion is vectorised (repeat each record n_days times and add a
# per-record day offset) instead of looping with iterrows(), which builds one
# Python tuple per (record, day) pair and is very slow on large frames.
max_date = df['fecha'].max()

# Work on datetime64 copies with a clean 0..n-1 index so that the repeated
# index below identifies each source record unambiguously.
start_day = pd.to_datetime(df["fecha"]).reset_index(drop=True)
next_change = pd.to_datetime(df["end_date"]).reset_index(drop=True)

# Last day (inclusive) spent in the state; open-ended records run to max_date.
last_day = (next_change - pd.Timedelta(days=1)).fillna(pd.Timestamp(max_date))

# Number of output rows per record. A state that is replaced on the same day
# it started contributes 0 rows, exactly as an empty date_range would.
n_days = (
    (last_day - start_day).dt.days
    .add(1)
    .clip(lower=0)
    .fillna(0)
    .astype("int64")
)
repeats = n_days.to_numpy()

repeated_start = start_day.repeat(repeats)
# 0, 1, 2, ... within each source record (records share an index label).
day_offset = repeated_start.groupby(level=0).cumcount().to_numpy()

expanded_df = pd.DataFrame({
    "fecha": repeated_start.reset_index(drop=True)
    + pd.to_timedelta(day_offset, unit="D"),
    "num_tramite": df["num_tramite"].repeat(repeats).to_numpy(),
})
expanded_df.head()

In [None]:
# Sanity check: for a single date the count must equal the number of
# processes that were in any state on that day.
expanded_df.loc[expanded_df['fecha'] == '2021-12-22'].count()

In [None]:
# Sanity check: this is the first day of the dataframe, so every process
# present on it is expected to be in state 0 (the initial request state).
expanded_df.loc[
    (expanded_df['fecha'] == '2021-12-22') & (expanded_df['num_tramite'] == 0)
].count()

In [None]:
# Step 4: count how many expanded rows (i.e. processes) fall on each
# (date, state) pair; the result is a long table with a `count` column.
aggregated_df = (
    expanded_df
    .groupby(["fecha", "num_tramite"], as_index=False)
    .size()
    .rename(columns={"size": "count"})
)
aggregated_df.head()

In [None]:
# Step 5: reshape to wide form for plotting: one row per date, one column per
# state, cell = number of processes. (date, state) pairs that never occur are
# set to 0.
pivot_df = (
    aggregated_df
    .set_index(["fecha", "num_tramite"])["count"]
    .unstack()
    .fillna(0)
)
pivot_df.tail()

In [None]:
# Print the column dtypes, non-null counts and memory usage of the wide table.
pivot_df.info()

In [None]:
pivot_df.index
# 979 dates, from 2021-12-22 to 2024-08-26

In [None]:
# Build the full daily calendar spanned by the pivot table and list the days
# that have no row in it.
complete_dates = pd.date_range(
    pivot_df.index.min(), pivot_df.index.max(), freq='D'
)
missing_dates = complete_dates.difference(pivot_df.index)

# Report the outcome of the calendar check.
if missing_dates.empty:
    print("No missing dates found")
else:
    print(f"Found {len(missing_dates)} missing dates:")
    print(missing_dates)

In [None]:
# Step 6: put the table on a strict daily frequency so every calendar day
# between the first and last date has a row, then set any missing values to 0.
complete_pivot_df = pivot_df.asfreq('D').fillna(0)

In [None]:
# Inspect the index of the daily-frequency table.
complete_pivot_df.index

In [None]:
# Persist the daily per-state counts for reuse outside the notebook.
# NOTE(review): the column labels are the num_tramite values (integers, see the
# [0, 100] selection below); some pandas/parquet engine combinations only
# accept string column names - confirm this write succeeds in the target
# environment.
complete_pivot_df.to_parquet('./data/tramita/tramites_autoconsumo_ts_estados.parquet')

In [None]:
# State ids (column labels) to include in the chart.
column_states = [0, 100]
df_toplot = complete_pivot_df.loc[:, column_states]

In [None]:
# Step 7: stacked area chart of the number of processes per selected state
# over time, labelled through the Axes object returned by pandas.
ax = df_toplot.plot.area(stacked=True, figsize=(12, 6))
ax.set_title("Process State Evolution Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Number of Processes")
ax.legend(title="State ID")
plt.show()