In [1]:
import pandas as pd

In [53]:
def is_year_column(column_name):
    """Return True when *column_name* looks like a four-digit year label.

    A label qualifies only if its string form is exactly four ASCII digits
    (e.g. ``"2019"``). Signed, padded or underscore-separated numbers such as
    ``"-123"``, ``" 12 "`` or ``"1_00"`` are rejected even though ``int()``
    can parse them and they are four characters long.

    Args:
        column_name: A column label. Non-string labels (for example the
            integer ``2019``) are compared through their ``str()`` form
            instead of raising ``TypeError``.

    Returns:
        bool: True if the label is a four-digit year, otherwise False.
    """
    label = str(column_name)
    # Check characters explicitly: int() also accepts signs, surrounding
    # whitespace and underscores, none of which belong in a year header.
    return len(label) == 4 and all(ch in "0123456789" for ch in label)

def raw_dataframe_to_net_dataframe(path):
    """Convert a wide per-year table into a long table of net changes.

    Reads ``../descriptive_analysis/{path}.csv``. Columns before the first
    year-named column are treated as identifier columns; every column from
    the first year column onward is treated as a yearly value. For each row
    and year two quantities are computed:

    * ``Net Change from Prev Year``: value minus the previous year's value
      (NaN for the first year column).
    * ``Net Change from First Year``: value minus the first year's value.

    The result is melted to one row per (identifiers, Year), sorted by the
    identifier columns and ``Year``, and written to
    ``../descriptive_analysis_by_net_change/{path}.csv``.

    Args:
        path: File stem (no extension) shared by the input and output CSVs.

    Raises:
        ValueError: If the input table has no four-digit-year column.
    """
    # 'Unnamed: 0' is the saved index column; tolerate files written without it.
    df = pd.read_csv(f'../descriptive_analysis/{path}.csv').drop(
        columns='Unnamed: 0', errors='ignore'
    )

    # Find the first year column
    start_index = next(
        (i for i, col in enumerate(df.columns) if is_year_column(col)),
        None,
    )
    if start_index is None:
        raise ValueError(
            f"No year column found in '../descriptive_analysis/{path}.csv'"
        )

    id_vars = df.columns[:start_index].tolist()  # Columns before the year columns
    id_frame = df.iloc[:, :start_index]
    year_frame = df.iloc[:, start_index:]

    # Build the change tables as new frames rather than assigning back into
    # copies of df: the results contain NaN/float values that may not fit the
    # dtype of the original (possibly integer) year columns.
    change_from_prev = year_frame.diff(axis=1)
    change_from_first = year_frame.sub(year_frame.iloc[:, 0], axis=0)

    diff_prev_year = pd.concat([id_frame, change_from_prev], axis=1)
    diff_first_year = pd.concat([id_frame, change_from_first], axis=1)

    # Transform the data for the desired format
    df_melted_prev = diff_prev_year.melt(
        id_vars=id_vars, value_name="Net Change from Prev Year", var_name="Year"
    )
    df_melted_first = diff_first_year.melt(
        id_vars=id_vars, value_name="Net Change from First Year", var_name="Year"
    )

    # Both melts come from frames with identical rows and columns, so their
    # rows line up one-to-one. Combining by position avoids the duplicated
    # rows a key-based merge produces when identifier values repeat (or when
    # there are no identifier columns at all).
    df_merged = df_melted_prev.copy()
    df_merged["Net Change from First Year"] = df_melted_first["Net Change from First Year"]
    df_merged = df_merged.sort_values(by=id_vars + ['Year'])

    df_merged.to_csv(f'../descriptive_analysis_by_net_change/{path}.csv')


In [60]:
# Outcome measures and the groupings each one was summarised by; together
# they name the input tables as "<outcome>_by_<group>".
outcome_vars = [
    "elapass",
    "mathpass",
    "dropout",
]
descriptive_groups = [
    "year",
    "black",
    "hispanic",
    "black_hispanic",
    "schooltype",
    "income",
]

# Convert every outcome/group table to its net-change counterpart.
for outcome_var in outcome_vars:
    for descriptive_group in descriptive_groups:
        table_name = f"{outcome_var}_by_{descriptive_group}"
        raw_dataframe_to_net_dataframe(table_name)