In [1]:
import pandas as pd

## pre-2002 waves

In [2]:
# Load the pre-2002 income extract; its columns are relabelled in the next cell.
income_pre2002 = pd.read_csv('income_pre2002.csv')

In [3]:
# Yearly income columns 1978-1993, followed by the three later columns
# labelled 1995, 1997 and 1999.
income_list = [str(year) for year in range(1978, 1994)]
new_columns = ['code', 'birthyear', *income_list, '1995', '1997', '1999']
# Assigning .columns raises ValueError if the file does not have exactly
# len(new_columns) columns.
income_pre2002.columns = new_columns

In [4]:
# Convert two-digit birth years (e.g. 58) to four digits (1958).
# Only values still below 100 are shifted, so re-running this cell does not
# add 1900 a second time to years that were already converted.
income_pre2002.loc[income_pre2002['birthyear'] < 100, 'birthyear'] += 1900

## 2002 - 2020 waves

In [5]:
def _income_from_row(row, year):
    """Resolve a single income figure for one row of ``income_{year}.csv``.

    Columns are addressed by position. The rules below are checked from
    highest to lowest priority, which reproduces the "later rule overrides
    earlier rule" order of the original step-by-step implementation.
    """
    # 2001 file only: column 6 holds an amount reported in units of 10,000.
    # NOTE(review): assumes column 6 is the amount itself (any value >= 0 is a
    # valid answer) rather than a flag — confirm against the codebook.
    if year == 2001 and row.iloc[6] >= 0:
        return 10000 * row.iloc[6]

    # Reported as "more than 50k": use 75,000 (average of 50k and the 100k
    # truncation level, see
    # https://nlsinfo.org/content/cohorts/nlsy79/topical-guide/income/income).
    if row.iloc[5] == 1:
        return 75000

    # Reported as "less than 15,000": use 7,500 (average of 0 and 15,000).
    if row.iloc[4] == 0:
        return 7500

    # Reported as a range in columns 2 and 3: average both ends when both are
    # positive, otherwise fall back to whichever end is positive.
    first_bound, second_bound = row.iloc[2], row.iloc[3]
    if first_bound > 0 and second_bound > 0:
        return (first_bound + second_bound) / 2
    if first_bound > 0 and second_bound < 0:
        return first_bound
    if first_bound < 0 and second_bound > 0:
        return second_bound

    # Otherwise use column 1 directly; negative values are treated as missing.
    reported = row.iloc[1]
    return reported if reported >= 0 else pd.NA


def income_calc(year):
    """Read ``income_{year}.csv`` and add a column with the resolved income.

    Parameters
    ----------
    year : int
        Income year; selects the file ``income_{year}.csv`` and names the new
        column (``str(year)``).

    Returns
    -------
    pandas.DataFrame
        The frame as read from disk plus one extra column named ``str(year)``
        holding the resolved income, or ``pd.NA`` where no usable answer exists.

    Notes
    -----
    Fixes the 2001 "reported in 10k" rule, which previously multiplied the
    whole row by 10,000 instead of the value in column 6.
    """
    df = pd.read_csv(f'income_{year}.csv')
    # One row-wise pass instead of four; the helper encodes the precedence.
    df[f'{year}'] = df.apply(_income_from_row, axis=1, args=(year,))
    return df

In [7]:
# Build one single-column frame per income year (odd years 2001-2019),
# then line them up side by side.
dfs = []
for year in range(2001, 2020, 2):
    df_year = income_calc(year).loc[:, [str(year)]]
    dfs.append(df_year)

# Column-wise concatenation aligns the yearly frames on their row index.
result_df = pd.concat(dfs, axis="columns")

## Combine pre-2002 and 2002-2020 waves

In [8]:
# Put the pre-2002 columns and the 2001-2019 columns side by side.
# axis=1 aligns the two frames on their row index, so this assumes both
# sources list respondents in the same order — TODO confirm.
income = pd.concat([income_pre2002,result_df],axis=1)

In [9]:
# Preview the combined wide table.
income.head()

Unnamed: 0,code,birthyear,1978,1979,1980,1981,1982,1983,1984,1985,...,2001,2003,2005,2007,2009,2011,2013,2015,2017,2019
0,1,1958,4620,-5,5000,-5,-5,-5,-5,-5,...,,,,,,,,,,
1,2,1959,4000,5000,6000,10000,11000,11500,11000,14000,...,0.0,0.0,5500.0,5000.0,6000.0,19000.0,21000.0,23000.0,25000.0,30000.0
2,3,1961,-4,7000,-5,7000,0,0,1300,0,...,0.0,,0.0,30000.0,,35000.0,40000.0,29000.0,80000.0,90000.0
3,4,1962,-4,-5,0,1086,70,0,0,7000,...,,,,,,,,73000.0,0.0,2000.0
4,5,1959,2200,2000,3400,2300,2200,10500,-5,43119,...,,,,,,,,,,


## CPI adjust (2010 = 100)

In [10]:
# Load the US consumer price index table from the parent directory.
CPI = pd.read_csv('../US_CPI.csv')

In [11]:
# Display the CPI table.
CPI

Unnamed: 0,Country Name,1978,1979,1980,1981,1982,1983,1984,1985,1986,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,United States,29.915931,33.282811,37.792366,41.6981,44.254788,45.676445,47.640776,49.329949,50.266255,...,108.566932,108.695722,110.067009,112.411557,115.157303,117.244196,118.690502,124.266414,134.211206,139.735794


In [12]:
def inflation_adjuster(df, CPI):
    """Rescale the year columns of ``df`` by the matching CPI figures.

    A column is adjusted when its label is an all-digit string that is also a
    column of ``CPI``. Its values are divided by the first-row CPI value for
    that year and then by 0.01. All other columns are left as they are.

    The frame is modified in place and the very same object is returned, so
    the caller's original variable sees the adjusted values too.
    """
    for label in df.columns:
        # Skip identifier columns and years without a CPI figure.
        if not label.isdigit() or label not in CPI.columns:
            continue
        index_value = CPI[label].values[0]
        df[label] = df[label] / index_value / 0.01
    return df

In [13]:
# inflation_adjuster modifies `income` in place and returns that same object,
# so `adjusted_df` and `income` both refer to the adjusted data from here on.
# Re-running this cell would apply the adjustment a second time.
adjusted_df = inflation_adjuster(income, CPI)

Unnamed: 0,code,birthyear,1978,1979,1980,1981,1982,1983,1984,1985,...,2001,2003,2005,2007,2009,2011,2013,2015,2017,2019
0,1,1958,15443.276596,-15.02277,13230.185053,-11.990954,-11.298212,-10.946561,-10.495211,-10.13583,...,,,,,,,,,,
1,2,1959,13370.802248,15022.769548,15876.222063,23981.908164,24856.067355,25177.090863,23089.464142,28380.325382,...,0.0,0.0,6141.097931,5258.342782,6098.402607,18418.555382,19656.69141,21159.986407,22239.70613,25587.620668
2,3,1961,-13.370802,21031.877367,-13.230185,16787.335715,0.0,0.0,2728.754853,0.0,...,0.0,,0.0,31550.056689,,33928.917808,37441.316972,26679.982861,71167.059617,76762.862005
3,4,1962,-13.370802,-15.02277,0.0,2604.435227,158.174974,0.0,0.0,14190.162691,...,,,,,,,,67159.956856,0.0,1705.841378
4,5,1959,7353.941236,6009.107819,8996.525836,5515.838878,4971.213471,22987.778614,-10.495211,87409.37501,...,,,,,,,,,,


In [14]:
# List the column labels of the adjusted frame.
adjusted_df.columns

Index(['code', 'birthyear', '1978', '1979', '1980', '1981', '1982', '1983',
       '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992',
       '1993', '1995', '1997', '1999', '2001', '2003', '2005', '2007', '2009',
       '2011', '2013', '2015', '2017', '2019'],
      dtype='object')

## Pivot, calculate income by age

In [15]:
# Reshape to long format: one row per respondent and income year.
# Melt `adjusted_df` explicitly instead of `income`, so this cell no longer
# depends on inflation_adjuster having modified `income` in place, and take the
# year columns from the frame itself rather than from a hard-coded list.
long_income = pd.melt(
    adjusted_df,
    id_vars=['code', 'birthyear'],
    value_vars=[label for label in adjusted_df.columns if label.isdigit()],
    var_name='year',
    value_name='income',
)

In [19]:
# Preview the long-format table.
long_income.head()

Unnamed: 0,code,birthyear,year,income
0,1,1958,1978,15443.276596
1,2,1959,1978,13370.802248
2,3,1961,1978,-13.370802
3,4,1962,1978,-13.370802
4,5,1959,1978,7353.941236


In [20]:
# Force the income column to a numeric dtype; entries that cannot be parsed
# become NaN because of errors='coerce'.
long_income['income'] = pd.to_numeric(long_income['income'], errors='coerce')

In [21]:
# Keep strictly positive incomes only; everything else is set to missing.
# This blanks the negative values (apparently survey non-response codes such
# as -4 / -5) and also zero incomes.
# NOTE(review): confirm that dropping reported zero incomes is intended.
long_income['income'] = long_income['income'].where(long_income['income'] > 0, pd.NA)

In [22]:
# Age = income year minus birth year. The 'year' values are former column
# labels (strings), hence the numeric conversion.
long_income['age'] =  pd.to_numeric(long_income['year'])-long_income['birthyear']

In [25]:
# Back to wide format: one row per (code, birthyear), one ('income', age)
# column per age. pivot_table aggregates with the mean by default.
# NOTE(review): presumably each respondent has at most one value per age, so
# nothing is actually averaged — confirm.
wide_df = long_income.pivot_table(index=['code', 'birthyear'], columns=['age'], values=['income'])


In [26]:
# Remove the income columns for ages 14-17.
# errors='ignore' keeps the cell re-runnable: once the columns are gone (or if
# an age never appeared in the pivot) the drop is a no-op instead of a KeyError.
wide_df = wide_df.drop(
    columns=[('income', age) for age in range(14, 18)],
    errors='ignore',
)

In [30]:
# Write the income-by-age table to disk.
wide_df.to_csv('income_by_age_CPIadjusted.csv')

In [27]:
# Number of non-missing income observations available at each age.
non_nan_count = wide_df.notna().sum()

print(non_nan_count)

        age
income  18     5705
        19     6926
        20     8001
        21     9087
        22     9166
        23     9238
        24     9129
        25     9001
        26     8612
        27     8149
        28     8016
        29     7739
        30     6787
        31     6391
        32     5358
        33     4950
        34     4150
        35     4076
        36     3274
        37     3284
        38     3187
        39     3194
        40     3188
        41     3100
        42     3119
        43     3099
        44     3081
        45     3033
        46     3054
        47     2951
        48     2936
        49     2827
        50     2804
        51     2657
        52     2602
        53     2569
        54     2490
        55     2411
        56     2398
        57     1809
        58     1646
        59     1053
        60      915
        61      436
        62      404
dtype: int64
