# Overview

Based off this notebook: *IRS 990 e-File Data -- Excise Tax Project (5) -- Schedule J Part (II) -- Generate Collapsed Filing-Level DF (DONE, I BELIEVE).ipynb*

See this notebook for codebook: *IRS 990 e-File Data -- Excise Tax Project (5b) -- Schedule J Part (II) -- Create Codebook.ipynb*

Read in person-level DF: 
- *Schedule J Part II (PERSON-LEVEL DF) parsed.pkl.gz*

Replace missing values of *SJ_02_PC_COMP_TOTAL* with zeros

Collapse to one row per filing, keeping the person with the highest *SJ_02_PC_COMP_TOTAL*

Save DF with just the compensation variable: 
- *Highest Schedule J Salary per Filing (2025).pkl.gz* 

Save DF with all Schedule J Part II variables:
- *Schedule J Part II Variables for Highest Salary per Filing (N=743,685).pkl.gz*

# Load Packages and Set Working Directory

In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame
from pandas import Series

In [2]:
# Record the pandas version used for this run.
print(pd.__version__)

2.2.2


In [3]:
# Record the Python interpreter version used for this run.
from platform import python_version
print(python_version())

3.10.11


In [4]:
#http://pandas.pydata.org/pandas-docs/stable/options.html
# Display settings: show every column, and allow up to 2,500 characters per
# cell before the displayed value is truncated.
pd.set_option('display.max_columns', None)
pd.set_option('max_colwidth', 2500)

In [5]:
import warnings
# Silence these three warning categories for the rest of the session.
# NOTE(review): SettingWithCopyWarning is silenced as well, so chained-assignment
# mistakes will not be reported anywhere below.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)

#### Set working directory

In [6]:
# NOTE(review): machine-specific absolute path. Every relative file name read or
# written below is resolved against this directory.
cd "C:\\Users\\Gregory\\IRS 990 Control Variables\\"

C:\Users\Gregory\IRS 990 Control Variables


# Read PANDAS DF

In [7]:
%%time
import datetime
print ("Current date and time : ", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), '\n')
df = pd.read_pickle('Schedule J Part II (PERSON-LEVEL DF) parsed.pkl.gz', compression='gzip')
print('# of columns:', len(df.columns))
print('# of observations:', len(df))
df[:1]

Current date and time :  2025-06-27 14:07:25 

# of columns: 19
# of observations: 2972064
CPU times: total: 4.38 s
Wall time: 4.58 s


Unnamed: 0,URL,SJ_02_PC_NAME_OFF_TRST_KEYEMP,SJ_02_PC_NAME_OFF_TRST_KEYEMP_L1,SJ_02_PC_NAME_OFF_TRST_KEYEMP_L2,SJ_02_PC_TITLE,SJ_02_PC_COMP_BASE,SJ_02_PC_COMP_BASE_RELATED,SJ_02_PC_COMP_BONUS,SJ_02_PC_COMP_BONUS_RELATED,SJ_02_PC_COMP_OTHER,SJ_02_PC_COMP_OTHER_RELATED,SJ_02_PC_COMP_DEFERRED,SJ_02_PC_COMP_DEFERRED_RELATED,SJ_02_PC_NONTAXED_BENF,SJ_02_PC_NONTAXED_BENF_RELATED,SJ_02_PC_COMP_TOTAL,SJ_02_PC_COMP_TOTAL_RELATED,SJ_02_PC_COMP_DEF_PRIOR,SJ_02_PC_COMP_DEF_PRIOR_RELATED
0,https://s3.amazonaws.com/irs-form-990/201113139349301311_public.xml,THOMAS D TURNBULL,,,,,100712.0,,790.0,,1257.0,,54308.0,,62342.0,,219409.0,,


# Collapse

In [8]:
%%time
import datetime
print ("Current date and time : ", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), '\n')
df.describe().T

Current date and time :  2025-06-27 14:10:25 

CPU times: total: 2.81 s
Wall time: 3.59 s


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
SJ_02_PC_COMP_BASE,2807086.0,168112.806692,10453400.0,-590430.0,0.0,150101.0,214885.0,17511320000.0
SJ_02_PC_COMP_BASE_RELATED,2601343.0,121436.737988,223368.3,-253120.0,0.0,0.0,191139.0,14572980.0
SJ_02_PC_COMP_BONUS,2556043.0,23218.015209,126126.5,-202816.0,0.0,0.0,10000.0,40364000.0
SJ_02_PC_COMP_BONUS_RELATED,2508267.0,33167.334976,209066.6,-88138.0,0.0,0.0,0.0,40364000.0
SJ_02_PC_COMP_OTHER,2560643.0,14369.395366,176893.1,-557719.0,0.0,0.0,1710.0,183883400.0
SJ_02_PC_COMP_OTHER_RELATED,2523543.0,22578.062866,210985.3,-177457.0,0.0,0.0,388.0,44353820.0
SJ_02_PC_COMP_DEFERRED,2679030.0,15138.840088,61851.93,-30985150.0,0.0,5124.0,16283.0,19325730.0
SJ_02_PC_COMP_DEFERRED_RELATED,2566118.0,17091.247414,124656.0,-30985150.0,0.0,0.0,10006.0,47206740.0
SJ_02_PC_NONTAXED_BENF,2699243.0,11467.990804,15444.9,-91965.0,0.0,7413.0,19143.0,2451027.0
SJ_02_PC_NONTAXED_BENF_RELATED,2575018.0,7586.147843,19304.52,-7357184.0,0.0,0.0,12209.0,7206054.0


# Create DF with *row* with max value of *SJ_02_PC_COMP_TOTAL*
https://stackoverflow.com/questions/15705630/get-the-rows-which-have-the-max-value-in-groups-using-groupby

The code below keeps the full row for the highest-paid person in each filing, which makes it possible to retrieve, for example, that person's title. The previous run did not use this information, but it is now included.

In [9]:
# Number of person-level rows.
df.shape[0]

2972064

In [10]:
# Person-level row count vs. number of distinct filings (URL is the filing
# identifier). Counting directly on the frame avoids building a multi-million
# element Python list and set just to take their lengths.
print(len(df))
print(df['URL'].nunique(dropna=False))

2972064
743685


#### Person-level rows with $0 or missing values of *SJ_02_PC_COMP_TOTAL*

In [11]:
# df is still the person-level frame here (many rows per URL), so these are
# counts of person rows -- not of filings and not of per-filing maxima.
comp_total = df['SJ_02_PC_COMP_TOTAL']
print('# of person-level rows where SJ_02_PC_COMP_TOTAL = $0:', (comp_total == 0).sum())
print('# of person-level rows where SJ_02_PC_COMP_TOTAL > $0:', (comp_total > 0).sum())
print('# of person-level rows where SJ_02_PC_COMP_TOTAL < $0:', (comp_total < 0).sum())
print('# of person-level rows where SJ_02_PC_COMP_TOTAL is missing:', comp_total.isna().sum())

# of Schedule J filings where max value of SJ_02_PC_COMP_TOTAL = $0: 811145
# of Schedule J filings where max value of SJ_02_PC_COMP_TOTAL > $0: 2000942
# of Schedule J filings where max value of SJ_02_PC_COMP_TOTAL < $0: 71
# of Schedule J filings where max value of SJ_02_PC_COMP_TOTAL is missing: 159906


In [12]:
# Five random person rows whose reported total is exactly zero.
preview_cols = ['URL', 'SJ_02_PC_NAME_OFF_TRST_KEYEMP', 'SJ_02_PC_TITLE', 'SJ_02_PC_COMP_TOTAL']
df.loc[df['SJ_02_PC_COMP_TOTAL'] == 0, preview_cols].sample(5)

Unnamed: 0,URL,SJ_02_PC_NAME_OFF_TRST_KEYEMP,SJ_02_PC_TITLE,SJ_02_PC_COMP_TOTAL
39512,https://s3.amazonaws.com/irs-form-990/201121369349304257_public.xml,Yvette M Jones,,0.0
2325739,https://s3.amazonaws.com/irs-form-990/202323189349308787_public.xml,DENNIS ESCHETE,FORMER BOARD MEMBER,0.0
2806292,https://s3.amazonaws.com/irs-form-990/201803129349302875_public.xml,LYNETTE LADENBURG,CEO,0.0
178062,https://s3.amazonaws.com/irs-form-990/201232899349301203_public.xml,Jim Lewandowski,,0.0
2632510,https://s3.amazonaws.com/irs-form-990/202421319349302312_public.xml,RICK THOMPSON MD,INTERIM PRESIDENT,0.0


In [13]:
# Every person row reported on one example filing.
example_url = 'https://s3.amazonaws.com/irs-form-990/201223189349301257_public.xml'
df.loc[df['URL'] == example_url]

Unnamed: 0,URL,SJ_02_PC_NAME_OFF_TRST_KEYEMP,SJ_02_PC_NAME_OFF_TRST_KEYEMP_L1,SJ_02_PC_NAME_OFF_TRST_KEYEMP_L2,SJ_02_PC_TITLE,SJ_02_PC_COMP_BASE,SJ_02_PC_COMP_BASE_RELATED,SJ_02_PC_COMP_BONUS,SJ_02_PC_COMP_BONUS_RELATED,SJ_02_PC_COMP_OTHER,SJ_02_PC_COMP_OTHER_RELATED,SJ_02_PC_COMP_DEFERRED,SJ_02_PC_COMP_DEFERRED_RELATED,SJ_02_PC_NONTAXED_BENF,SJ_02_PC_NONTAXED_BENF_RELATED,SJ_02_PC_COMP_TOTAL,SJ_02_PC_COMP_TOTAL_RELATED,SJ_02_PC_COMP_DEF_PRIOR,SJ_02_PC_COMP_DEF_PRIOR_RELATED
117975,https://s3.amazonaws.com/irs-form-990/201223189349301257_public.xml,LAVERNE R JOSEPH,,,,0.0,317993.0,0.0,0.0,0.0,6600.0,0.0,27067.0,0.0,71648.0,0.0,423308.0,0.0,0.0
117976,https://s3.amazonaws.com/irs-form-990/201223189349301257_public.xml,ROBERT R AMBERG,,,,0.0,268740.0,0.0,0.0,0.0,5750.0,0.0,23953.0,0.0,9099.0,0.0,307542.0,0.0,0.0
117977,https://s3.amazonaws.com/irs-form-990/201223189349301257_public.xml,STUART J HARTMAN,,,,0.0,197802.0,0.0,0.0,0.0,5400.0,0.0,16548.0,0.0,23544.0,0.0,243294.0,0.0,0.0
117978,https://s3.amazonaws.com/irs-form-990/201223189349301257_public.xml,PETER OSCAR PEABODY,,,,0.0,176301.0,0.0,0.0,0.0,5400.0,0.0,5289.0,0.0,16596.0,0.0,203586.0,0.0,0.0
117979,https://s3.amazonaws.com/irs-form-990/201223189349301257_public.xml,FRANK ROSSELLO JR,,,,0.0,181834.0,0.0,0.0,0.0,2875.0,0.0,5369.0,0.0,7610.0,0.0,197688.0,0.0,0.0
117980,https://s3.amazonaws.com/irs-form-990/201223189349301257_public.xml,RICHARD T WASHINGTON,,,,0.0,162767.0,0.0,0.0,0.0,5175.0,0.0,14494.0,0.0,7926.0,0.0,190362.0,0.0,0.0
117981,https://s3.amazonaws.com/irs-form-990/201223189349301257_public.xml,MARGARITA GUZMAN,,,,0.0,158491.0,0.0,0.0,0.0,0.0,0.0,4755.0,0.0,7926.0,0.0,171172.0,0.0,0.0
117982,https://s3.amazonaws.com/irs-form-990/201223189349301257_public.xml,VINCENT B MAGNONE,,,,0.0,152436.0,0.0,0.0,0.0,0.0,0.0,4573.0,0.0,23544.0,0.0,180553.0,0.0,0.0
117983,https://s3.amazonaws.com/irs-form-990/201223189349301257_public.xml,NADA BATTAGLIA,,,,0.0,147591.0,0.0,0.0,0.0,1200.0,0.0,13283.0,0.0,15923.0,0.0,177997.0,0.0,0.0
117984,https://s3.amazonaws.com/irs-form-990/201223189349301257_public.xml,ANDERS PLETT,,,,0.0,166025.0,0.0,0.0,0.0,4800.0,0.0,0.0,0.0,23544.0,0.0,194369.0,0.0,0.0


In [14]:
# Five random person rows whose reported total is missing.
preview_cols = ['URL', 'SJ_02_PC_NAME_OFF_TRST_KEYEMP', 'SJ_02_PC_TITLE', 'SJ_02_PC_COMP_TOTAL']
df.loc[df['SJ_02_PC_COMP_TOTAL'].isnull(), preview_cols].sample(5)

Unnamed: 0,URL,SJ_02_PC_NAME_OFF_TRST_KEYEMP,SJ_02_PC_TITLE,SJ_02_PC_COMP_TOTAL
816315,https://s3.amazonaws.com/irs-form-990/201603089349301585_public.xml,ROBIN KELLER,ASST. SEC.,
2218283,https://s3.amazonaws.com/irs-form-990/202341459349301234_public.xml,WALTER SIMPKINS,PRESIDENT/CEO,
647664,https://s3.amazonaws.com/irs-form-990/201503149349303865_public.xml,Nigel Barnes,Board Member,
2426220,https://s3.amazonaws.com/irs-form-990/202432789349300048_public.xml,CRISTA HASSETT,SVP - EVENTS,
1396421,https://s3.amazonaws.com/irs-form-990/201943189349308359_public.xml,CHRISTOPHER LOMBARD,DIR OF FINANCE,


In [15]:
# Example filing with a missing SJ_02_PC_COMP_TOTAL on every person row.
example_url = 'https://s3.amazonaws.com/irs-form-990/201541949349300219_public.xml'
df.loc[df['URL'] == example_url]

Unnamed: 0,URL,SJ_02_PC_NAME_OFF_TRST_KEYEMP,SJ_02_PC_NAME_OFF_TRST_KEYEMP_L1,SJ_02_PC_NAME_OFF_TRST_KEYEMP_L2,SJ_02_PC_TITLE,SJ_02_PC_COMP_BASE,SJ_02_PC_COMP_BASE_RELATED,SJ_02_PC_COMP_BONUS,SJ_02_PC_COMP_BONUS_RELATED,SJ_02_PC_COMP_OTHER,SJ_02_PC_COMP_OTHER_RELATED,SJ_02_PC_COMP_DEFERRED,SJ_02_PC_COMP_DEFERRED_RELATED,SJ_02_PC_NONTAXED_BENF,SJ_02_PC_NONTAXED_BENF_RELATED,SJ_02_PC_COMP_TOTAL,SJ_02_PC_COMP_TOTAL_RELATED,SJ_02_PC_COMP_DEF_PRIOR,SJ_02_PC_COMP_DEF_PRIOR_RELATED
569924,https://s3.amazonaws.com/irs-form-990/201541949349300219_public.xml,STEPHEN VANDER SCHAAF,,,PRESIDENT/TREASURER,,391143.0,,,,,,23000.0,,14375.0,,428518.0,,
569925,https://s3.amazonaws.com/irs-form-990/201541949349300219_public.xml,BRADLEY FULLER,,,VP /SECRETARY,,195801.0,,,,,,,,58.0,,195859.0,,


#### Replace missing with zero

In [17]:
# Column dtypes and non-null counts, to see how much is missing per column.
df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2972064 entries, 0 to 2972063
Data columns (total 19 columns):
 #   Column                            Non-Null Count    Dtype  
---  ------                            --------------    -----  
 0   URL                               2972064 non-null  object 
 1   SJ_02_PC_NAME_OFF_TRST_KEYEMP     2809765 non-null  object 
 2   SJ_02_PC_NAME_OFF_TRST_KEYEMP_L1  161718 non-null   object 
 3   SJ_02_PC_NAME_OFF_TRST_KEYEMP_L2  249 non-null      object 
 4   SJ_02_PC_TITLE                    2578078 non-null  object 
 5   SJ_02_PC_COMP_BASE                2807086 non-null  float64
 6   SJ_02_PC_COMP_BASE_RELATED        2601343 non-null  float64
 7   SJ_02_PC_COMP_BONUS               2556043 non-null  float64
 8   SJ_02_PC_COMP_BONUS_RELATED       2508267 non-null  float64
 9   SJ_02_PC_COMP_OTHER               2560643 non-null  float64
 10  SJ_02_PC_COMP_OTHER_RELATED       2523543 non-null  float64
 11  SJ_02_PC_COMP_DEFERRED            267

In [18]:
# Spot-check five random rows (no seed is set, so the sample differs on every run).
df.sample(5)

Unnamed: 0,URL,SJ_02_PC_NAME_OFF_TRST_KEYEMP,SJ_02_PC_NAME_OFF_TRST_KEYEMP_L1,SJ_02_PC_NAME_OFF_TRST_KEYEMP_L2,SJ_02_PC_TITLE,SJ_02_PC_COMP_BASE,SJ_02_PC_COMP_BASE_RELATED,SJ_02_PC_COMP_BONUS,SJ_02_PC_COMP_BONUS_RELATED,SJ_02_PC_COMP_OTHER,SJ_02_PC_COMP_OTHER_RELATED,SJ_02_PC_COMP_DEFERRED,SJ_02_PC_COMP_DEFERRED_RELATED,SJ_02_PC_NONTAXED_BENF,SJ_02_PC_NONTAXED_BENF_RELATED,SJ_02_PC_COMP_TOTAL,SJ_02_PC_COMP_TOTAL_RELATED,SJ_02_PC_COMP_DEF_PRIOR,SJ_02_PC_COMP_DEF_PRIOR_RELATED
1483262,https://s3.amazonaws.com/irs-form-990/201933159349304123_public.xml,NANCY A GAVIN,,,ASST. SEC/TREAS,,136521.0,,139.0,,1085.0,,28409.0,,58067.0,,224221.0,,
148540,https://s3.amazonaws.com/irs-form-990/201221329349301752_public.xml,PHILLIP CLAY OUTGOING CHANCELLOR,,,,275307.0,0.0,0.0,0.0,31411.0,0.0,63945.0,0.0,26054.0,0.0,396717.0,0.0,,0.0
1797881,https://s3.amazonaws.com/irs-form-990/202221159349300742_public.xml,ANDREW COAMEY,,,SVP HOUSING DEV FACILITIES OPERATION,0.0,191635.0,0.0,20144.0,0.0,2744.0,0.0,2700.0,0.0,15105.0,0.0,232328.0,0.0,0.0
645828,https://s3.amazonaws.com/irs-form-990/201503439349300105_public.xml,PAUL BLANUSA,,,SUPERINTENDENT,131029.0,,15000.0,,6360.0,,5425.0,,19100.0,,176914.0,,,
1583609,https://s3.amazonaws.com/irs-form-990/202023219349301577_public.xml,GREG D BOSSART,,,SVP/CHIEF VETERINARY OFFCR,368391.0,0.0,121153.0,0.0,55250.0,0.0,3900.0,0.0,14831.0,0.0,563525.0,0.0,0.0,0.0


In [19]:
# Distribution of the total-compensation column before missing values are filled.
total_preview = df[['URL', 'SJ_02_PC_COMP_TOTAL']]
total_preview.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
SJ_02_PC_COMP_TOTAL,2812158.0,227403.648312,10448380.0,-24822971.0,0.0,181619.0,271297.0,17511360000.0


In [20]:
# df is still the person-level frame here (many rows per URL), so these are
# counts of person rows -- not of filings and not of per-filing maxima.
comp_total = df['SJ_02_PC_COMP_TOTAL']
print('# of person-level rows where SJ_02_PC_COMP_TOTAL = $0:', (comp_total == 0).sum())
print('# of person-level rows where SJ_02_PC_COMP_TOTAL > $0:', (comp_total > 0).sum())
print('# of person-level rows where SJ_02_PC_COMP_TOTAL < $0:', (comp_total < 0).sum())
print('# of person-level rows where SJ_02_PC_COMP_TOTAL is missing:', comp_total.isna().sum())

# of Schedule J filings where max value of SJ_02_PC_COMP_TOTAL = $0: 811145
# of Sch dule J filings where max value of SJ_02_PC_COMP_TOTAL > $0: 2000942
# of Schedule J filings where max value of SJ_02_PC_COMP_TOTAL < $0: 71
# of Schedule J filings where max value of SJ_02_PC_COMP_TOTAL is missing: 159906


In [21]:
# Treat a missing total as $0 so every row can be ranked within its filing.
# NOTE(review): rows whose pay is reported only in the *_RELATED columns also
# have a missing SJ_02_PC_COMP_TOTAL (see filing 201541949349300219 above), so
# they become 0 here -- confirm that is intended.
print(df['SJ_02_PC_COMP_TOTAL'].isna().sum())
df['SJ_02_PC_COMP_TOTAL'] = df['SJ_02_PC_COMP_TOTAL'].fillna(0)
print(df['SJ_02_PC_COMP_TOTAL'].isna().sum())

159906
0


In [22]:
# df is still the person-level frame here (many rows per URL), so these are
# counts of person rows -- not of filings and not of per-filing maxima.
comp_total = df['SJ_02_PC_COMP_TOTAL']
print('# of person-level rows where SJ_02_PC_COMP_TOTAL = $0:', (comp_total == 0).sum())
print('# of person-level rows where SJ_02_PC_COMP_TOTAL > $0:', (comp_total > 0).sum())
print('# of person-level rows where SJ_02_PC_COMP_TOTAL < $0:', (comp_total < 0).sum())
print('# of person-level rows where SJ_02_PC_COMP_TOTAL is missing:', comp_total.isna().sum())

# of Schedule J filings where max value of SJ_02_PC_COMP_TOTAL = $0: 971051
# of Schedule J filings where max value of SJ_02_PC_COMP_TOTAL > $0: 2000942
# of Schedule J filings where max value of SJ_02_PC_COMP_TOTAL < $0: 71
# of Schedule J filings where max value of SJ_02_PC_COMP_TOTAL is missing: 0


In [23]:
# Spot-check five random rows after the fill (no seed is set, so the sample differs on every run).
df.sample(5)

Unnamed: 0,URL,SJ_02_PC_NAME_OFF_TRST_KEYEMP,SJ_02_PC_NAME_OFF_TRST_KEYEMP_L1,SJ_02_PC_NAME_OFF_TRST_KEYEMP_L2,SJ_02_PC_TITLE,SJ_02_PC_COMP_BASE,SJ_02_PC_COMP_BASE_RELATED,SJ_02_PC_COMP_BONUS,SJ_02_PC_COMP_BONUS_RELATED,SJ_02_PC_COMP_OTHER,SJ_02_PC_COMP_OTHER_RELATED,SJ_02_PC_COMP_DEFERRED,SJ_02_PC_COMP_DEFERRED_RELATED,SJ_02_PC_NONTAXED_BENF,SJ_02_PC_NONTAXED_BENF_RELATED,SJ_02_PC_COMP_TOTAL,SJ_02_PC_COMP_TOTAL_RELATED,SJ_02_PC_COMP_DEF_PRIOR,SJ_02_PC_COMP_DEF_PRIOR_RELATED
724057,https://s3.amazonaws.com/irs-form-990/201631349349307828_public.xml,FRED KING,,,DIRECTOR,0.0,232914.0,0.0,0.0,0.0,0.0,0.0,14202.0,0.0,8604.0,0.0,255720.0,0.0,0.0
1966502,https://s3.amazonaws.com/irs-form-990/202131669349301968_public.xml,MICHAEL GUERRA,,,CDO,156448.0,0.0,24408.0,0.0,0.0,0.0,0.0,0.0,10365.0,0.0,191221.0,0.0,0.0,0.0
2823123,https://s3.amazonaws.com/irs-form-990/201932499349300208_public.xml,LINDA BEUSHAUSEN,,,CEO,127876.0,0.0,0.0,0.0,8286.0,0.0,9490.0,0.0,10992.0,0.0,156644.0,0.0,0.0,0.0
2165198,https://s3.amazonaws.com/irs-form-990/202311309349303801_public.xml,ROBIN THURSTON,,,MANAGING SR COUNSEL,195333.0,,0.0,,0.0,,7540.0,,0.0,,202873.0,,0.0,
1948073,https://s3.amazonaws.com/irs-form-990/202122889349301312_public.xml,STEVEN SILBERSTEIN,,,CEO,650872.0,0.0,340000.0,0.0,3810.0,0.0,24999.0,0.0,19078.0,0.0,1038759.0,0.0,0.0,0.0


#### Sort

In [24]:
# Order the person rows from highest to lowest total compensation; the row
# count is printed before and after to confirm nothing is dropped.
print(df.shape[0])
df = df.sort_values(by='SJ_02_PC_COMP_TOTAL', ascending=False)
print(df.shape[0])

2972064
2972064


In [25]:
# One example filing after sorting: its rows now appear in descending order of total.
example_url = 'https://s3.amazonaws.com/irs-form-990/201113139349301326_public.xml'
df.loc[df['URL'] == example_url, ['URL', 'SJ_02_PC_NAME_OFF_TRST_KEYEMP', 'SJ_02_PC_COMP_TOTAL']]

Unnamed: 0,URL,SJ_02_PC_NAME_OFF_TRST_KEYEMP,SJ_02_PC_COMP_TOTAL
29,https://s3.amazonaws.com/irs-form-990/201113139349301326_public.xml,Robert Wolterman,263223.0
30,https://s3.amazonaws.com/irs-form-990/201113139349301326_public.xml,Paolo Zambito,244812.0
24,https://s3.amazonaws.com/irs-form-990/201113139349301326_public.xml,Mark Eckert,211371.0
27,https://s3.amazonaws.com/irs-form-990/201113139349301326_public.xml,Eddy Ramirez,187315.0
25,https://s3.amazonaws.com/irs-form-990/201113139349301326_public.xml,Sylvia D Hartmann,175980.0
31,https://s3.amazonaws.com/irs-form-990/201113139349301326_public.xml,Bradley Goodson,160189.0
32,https://s3.amazonaws.com/irs-form-990/201113139349301326_public.xml,Suzanne Warren,150113.0
20,https://s3.amazonaws.com/irs-form-990/201113139349301326_public.xml,Scott J Posecai,0.0
19,https://s3.amazonaws.com/irs-form-990/201113139349301326_public.xml,Bobby C Brannon,0.0
18,https://s3.amazonaws.com/irs-form-990/201113139349301326_public.xml,F R Bobby Rodwig Jr MD,0.0


In [26]:
%%time
import datetime
print ("Current date and time : ", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), '\n')
dfa2 = df.sort_values('SJ_02_PC_COMP_TOTAL', ascending=False).drop_duplicates(['URL'],keep='first')
print(len(dfa2))
dfa2[:1]

Current date and time :  2025-06-27 14:11:38 

743685
CPU times: total: 1.89 s
Wall time: 1.99 s


Unnamed: 0,URL,SJ_02_PC_NAME_OFF_TRST_KEYEMP,SJ_02_PC_NAME_OFF_TRST_KEYEMP_L1,SJ_02_PC_NAME_OFF_TRST_KEYEMP_L2,SJ_02_PC_TITLE,SJ_02_PC_COMP_BASE,SJ_02_PC_COMP_BASE_RELATED,SJ_02_PC_COMP_BONUS,SJ_02_PC_COMP_BONUS_RELATED,SJ_02_PC_COMP_OTHER,SJ_02_PC_COMP_OTHER_RELATED,SJ_02_PC_COMP_DEFERRED,SJ_02_PC_COMP_DEFERRED_RELATED,SJ_02_PC_NONTAXED_BENF,SJ_02_PC_NONTAXED_BENF_RELATED,SJ_02_PC_COMP_TOTAL,SJ_02_PC_COMP_TOTAL_RELATED,SJ_02_PC_COMP_DEF_PRIOR,SJ_02_PC_COMP_DEF_PRIOR_RELATED
2756613,https://s3.amazonaws.com/irs-form-990/201610749349300436_public.xml,KATHY CAMPBELL,,,PRESIDENT/CEO,17511320000.0,,19080.0,,,,26011.0,,,,17511360000.0,,,


In [27]:
# The same example filing after collapsing: only its top row remains.
example_url = 'https://s3.amazonaws.com/irs-form-990/201113139349301326_public.xml'
dfa2.loc[dfa2['URL'] == example_url, ['URL', 'SJ_02_PC_NAME_OFF_TRST_KEYEMP', 'SJ_02_PC_COMP_TOTAL']]

Unnamed: 0,URL,SJ_02_PC_NAME_OFF_TRST_KEYEMP,SJ_02_PC_COMP_TOTAL
29,https://s3.amazonaws.com/irs-form-990/201113139349301326_public.xml,Robert Wolterman,263223.0


<br>Check data

In [28]:
# Number of distinct filings in the collapsed frame, counted directly on the
# column instead of through an intermediate Python list and set.
print(dfa2['URL'].nunique(dropna=False))

743685


In [29]:
# Filing holding the largest total in the data (about 1.75e10 in the recorded output).
outlier_url = 'https://s3.amazonaws.com/irs-form-990/201610749349300436_public.xml'
dfa2.loc[dfa2['URL'] == outlier_url]

Unnamed: 0,URL,SJ_02_PC_NAME_OFF_TRST_KEYEMP,SJ_02_PC_NAME_OFF_TRST_KEYEMP_L1,SJ_02_PC_NAME_OFF_TRST_KEYEMP_L2,SJ_02_PC_TITLE,SJ_02_PC_COMP_BASE,SJ_02_PC_COMP_BASE_RELATED,SJ_02_PC_COMP_BONUS,SJ_02_PC_COMP_BONUS_RELATED,SJ_02_PC_COMP_OTHER,SJ_02_PC_COMP_OTHER_RELATED,SJ_02_PC_COMP_DEFERRED,SJ_02_PC_COMP_DEFERRED_RELATED,SJ_02_PC_NONTAXED_BENF,SJ_02_PC_NONTAXED_BENF_RELATED,SJ_02_PC_COMP_TOTAL,SJ_02_PC_COMP_TOTAL_RELATED,SJ_02_PC_COMP_DEF_PRIOR,SJ_02_PC_COMP_DEF_PRIOR_RELATED
2756613,https://s3.amazonaws.com/irs-form-990/201610749349300436_public.xml,KATHY CAMPBELL,,,PRESIDENT/CEO,17511320000.0,,19080.0,,,,26011.0,,,,17511360000.0,,,


In [30]:
# Five random filings with the name, title and total of the retained person.
preview_cols = ['URL', 'SJ_02_PC_NAME_OFF_TRST_KEYEMP', 'SJ_02_PC_TITLE', 'SJ_02_PC_COMP_TOTAL']
dfa2.loc[:, preview_cols].sample(5)

Unnamed: 0,URL,SJ_02_PC_NAME_OFF_TRST_KEYEMP,SJ_02_PC_TITLE,SJ_02_PC_COMP_TOTAL
1312507,https://s3.amazonaws.com/irs-form-990/201912279349300601_public.xml,JUDITH WOLLACK,CEO,247413.0
1274403,https://s3.amazonaws.com/irs-form-990/201931359349304373_public.xml,GARY L WEITZMAN DVM MPH,PRESIDENT & CEO,269881.0
288479,https://s3.amazonaws.com/irs-form-990/201311349349307776_public.xml,KEVIN HONIGFORD,,0.0
2846268,https://s3.amazonaws.com/irs-form-990/202023179349302147_public.xml,WILLIAM MCLENNAN,EXECUTIVE DIRECTOR NONVOTI,188129.0
908805,https://s3.amazonaws.com/irs-form-990/201701329349301010_public.xml,RICHARD SEAGER,PRESIDENT/CEO,0.0


In [31]:
# dfa2 has one row per filing, so these are filing counts by sign of the per-filing maximum.
max_total = dfa2['SJ_02_PC_COMP_TOTAL']
print('# of Schedule J filings where max value of SJ_02_PC_COMP_TOTAL = $0:', max_total.eq(0).sum())
print('# of Schedule J filings where max value of SJ_02_PC_COMP_TOTAL > $0:', max_total.gt(0).sum())
print('# of Schedule J filings where max value of SJ_02_PC_COMP_TOTAL < $0:', max_total.lt(0).sum())
print('# of Schedule J filings where max value of SJ_02_PC_COMP_TOTAL is missing:', max_total.isnull().sum())

# of Schedule J filings where max value of SJ_02_PC_COMP_TOTAL = $0: 212140
# of Schedule J filings where max value of SJ_02_PC_COMP_TOTAL > $0: 531542
# of Schedule J filings where max value of SJ_02_PC_COMP_TOTAL < $0: 3
# of Schedule J filings where max value of SJ_02_PC_COMP_TOTAL is missing: 0


In [32]:
# Same example filing as before the fill: SJ_02_PC_COMP_TOTAL is now 0.0 on each row.
example_url = 'https://s3.amazonaws.com/irs-form-990/201541949349300219_public.xml'
df.loc[df['URL'] == example_url]

Unnamed: 0,URL,SJ_02_PC_NAME_OFF_TRST_KEYEMP,SJ_02_PC_NAME_OFF_TRST_KEYEMP_L1,SJ_02_PC_NAME_OFF_TRST_KEYEMP_L2,SJ_02_PC_TITLE,SJ_02_PC_COMP_BASE,SJ_02_PC_COMP_BASE_RELATED,SJ_02_PC_COMP_BONUS,SJ_02_PC_COMP_BONUS_RELATED,SJ_02_PC_COMP_OTHER,SJ_02_PC_COMP_OTHER_RELATED,SJ_02_PC_COMP_DEFERRED,SJ_02_PC_COMP_DEFERRED_RELATED,SJ_02_PC_NONTAXED_BENF,SJ_02_PC_NONTAXED_BENF_RELATED,SJ_02_PC_COMP_TOTAL,SJ_02_PC_COMP_TOTAL_RELATED,SJ_02_PC_COMP_DEF_PRIOR,SJ_02_PC_COMP_DEF_PRIOR_RELATED
569924,https://s3.amazonaws.com/irs-form-990/201541949349300219_public.xml,STEPHEN VANDER SCHAAF,,,PRESIDENT/TREASURER,,391143.0,,,,,,23000.0,,14375.0,0.0,428518.0,,
569925,https://s3.amazonaws.com/irs-form-990/201541949349300219_public.xml,BRADLEY FULLER,,,VP /SECRETARY,,195801.0,,,,,,,,58.0,0.0,195859.0,,


#### Save DF with just total compensation variable

In [33]:
%%time
import datetime
print ("Current date and time : ", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), '\n')
print(len(dfa2))
dfa2[['URL', 'SJ_02_PC_COMP_TOTAL']].to_pickle('Highest Schedule J Salary per Filing (2025).pkl.gz', compression='gzip')

Current date and time :  2025-06-27 14:12:09 

743685
CPU times: total: 11.2 s
Wall time: 12.8 s


#### Read in codebook
From *IRS 990 e-File Data -- Excise Tax Project (5b) -- Schedule J Part (II) -- Create Codebook.ipynb*

In [34]:
# Variable descriptions for the Schedule J Part II columns.
codebook = pd.read_pickle('Codebook - Schedule J (Part II).pkl')
codebook.head(2)

Unnamed: 0,variable_name_new,description,location_code,data_type_xsd
0,URL,Unique identifier for filing -- use for merging with filings-level datasets (also for seeing raw filing online),,
1,SJ_02_PC_NAME_OFF_TRST_KEYEMP,Name of officer - person,SCHED-J-PART-02-COL-A-(i),PersonNameType


#### Rename columns and save DF

In [35]:
# Column names before the rename.
print(list(dfa2.columns))

['URL', 'SJ_02_PC_NAME_OFF_TRST_KEYEMP', 'SJ_02_PC_NAME_OFF_TRST_KEYEMP_L1', 'SJ_02_PC_NAME_OFF_TRST_KEYEMP_L2', 'SJ_02_PC_TITLE', 'SJ_02_PC_COMP_BASE', 'SJ_02_PC_COMP_BASE_RELATED', 'SJ_02_PC_COMP_BONUS', 'SJ_02_PC_COMP_BONUS_RELATED', 'SJ_02_PC_COMP_OTHER', 'SJ_02_PC_COMP_OTHER_RELATED', 'SJ_02_PC_COMP_DEFERRED', 'SJ_02_PC_COMP_DEFERRED_RELATED', 'SJ_02_PC_NONTAXED_BENF', 'SJ_02_PC_NONTAXED_BENF_RELATED', 'SJ_02_PC_COMP_TOTAL', 'SJ_02_PC_COMP_TOTAL_RELATED', 'SJ_02_PC_COMP_DEF_PRIOR', 'SJ_02_PC_COMP_DEF_PRIOR_RELATED']


In [36]:
# First two rows (the two largest totals) before the rename.
dfa2.head(2)

Unnamed: 0,URL,SJ_02_PC_NAME_OFF_TRST_KEYEMP,SJ_02_PC_NAME_OFF_TRST_KEYEMP_L1,SJ_02_PC_NAME_OFF_TRST_KEYEMP_L2,SJ_02_PC_TITLE,SJ_02_PC_COMP_BASE,SJ_02_PC_COMP_BASE_RELATED,SJ_02_PC_COMP_BONUS,SJ_02_PC_COMP_BONUS_RELATED,SJ_02_PC_COMP_OTHER,SJ_02_PC_COMP_OTHER_RELATED,SJ_02_PC_COMP_DEFERRED,SJ_02_PC_COMP_DEFERRED_RELATED,SJ_02_PC_NONTAXED_BENF,SJ_02_PC_NONTAXED_BENF_RELATED,SJ_02_PC_COMP_TOTAL,SJ_02_PC_COMP_TOTAL_RELATED,SJ_02_PC_COMP_DEF_PRIOR,SJ_02_PC_COMP_DEF_PRIOR_RELATED
2756613,https://s3.amazonaws.com/irs-form-990/201610749349300436_public.xml,KATHY CAMPBELL,,,PRESIDENT/CEO,17511320000.0,,19080.0,,,,26011.0,,,,17511360000.0,,,
2447388,https://s3.amazonaws.com/irs-form-990/202422499349301302_public.xml,JARED CHORNEY,,,GENERAL MANAGER,216346.0,0.0,12500.0,0.0,183883428.0,0.0,0.0,0.0,5450.0,0.0,184117700.0,0.0,0.0,0.0


In [37]:
# Prefix every Schedule J column with 'max_comp_' to mark it as describing the
# highest-paid person in the filing; URL stays unprefixed as the merge key.
# The names are derived from the existing columns rather than assigned from a
# hand-typed positional list, so a change in column order or count cannot
# silently mislabel a column. The startswith guard makes re-running this cell a no-op.
dfa2 = dfa2.rename(
    columns=lambda col: col if col == 'URL' or col.startswith('max_comp_') else 'max_comp_' + col
)
dfa2[:2]

Unnamed: 0,URL,max_comp_SJ_02_PC_NAME_OFF_TRST_KEYEMP,max_comp_SJ_02_PC_NAME_OFF_TRST_KEYEMP_L1,max_comp_SJ_02_PC_NAME_OFF_TRST_KEYEMP_L2,max_comp_SJ_02_PC_TITLE,max_comp_SJ_02_PC_COMP_BASE,max_comp_SJ_02_PC_COMP_BASE_RELATED,max_comp_SJ_02_PC_COMP_BONUS,max_comp_SJ_02_PC_COMP_BONUS_RELATED,max_comp_SJ_02_PC_COMP_OTHER,max_comp_SJ_02_PC_COMP_OTHER_RELATED,max_comp_SJ_02_PC_COMP_DEFERRED,max_comp_SJ_02_PC_COMP_DEFERRED_RELATED,max_comp_SJ_02_PC_NONTAXED_BENF,max_comp_SJ_02_PC_NONTAXED_BENF_RELATED,max_comp_SJ_02_PC_COMP_TOTAL,max_comp_SJ_02_PC_COMP_TOTAL_RELATED,max_comp_SJ_02_PC_COMP_DEF_PRIOR,max_comp_SJ_02_PC_COMP_DEF_PRIOR_RELATED
2756613,https://s3.amazonaws.com/irs-form-990/201610749349300436_public.xml,KATHY CAMPBELL,,,PRESIDENT/CEO,17511320000.0,,19080.0,,,,26011.0,,,,17511360000.0,,,
2447388,https://s3.amazonaws.com/irs-form-990/202422499349301302_public.xml,JARED CHORNEY,,,GENERAL MANAGER,216346.0,0.0,12500.0,0.0,183883428.0,0.0,0.0,0.0,5450.0,0.0,184117700.0,0.0,0.0,0.0


In [38]:
# Spot-check five random filings after the rename (no seed is set, so the sample differs on every run).
dfa2.sample(5)

Unnamed: 0,URL,max_comp_SJ_02_PC_NAME_OFF_TRST_KEYEMP,max_comp_SJ_02_PC_NAME_OFF_TRST_KEYEMP_L1,max_comp_SJ_02_PC_NAME_OFF_TRST_KEYEMP_L2,max_comp_SJ_02_PC_TITLE,max_comp_SJ_02_PC_COMP_BASE,max_comp_SJ_02_PC_COMP_BASE_RELATED,max_comp_SJ_02_PC_COMP_BONUS,max_comp_SJ_02_PC_COMP_BONUS_RELATED,max_comp_SJ_02_PC_COMP_OTHER,max_comp_SJ_02_PC_COMP_OTHER_RELATED,max_comp_SJ_02_PC_COMP_DEFERRED,max_comp_SJ_02_PC_COMP_DEFERRED_RELATED,max_comp_SJ_02_PC_NONTAXED_BENF,max_comp_SJ_02_PC_NONTAXED_BENF_RELATED,max_comp_SJ_02_PC_COMP_TOTAL,max_comp_SJ_02_PC_COMP_TOTAL_RELATED,max_comp_SJ_02_PC_COMP_DEF_PRIOR,max_comp_SJ_02_PC_COMP_DEF_PRIOR_RELATED
1747539,https://s3.amazonaws.com/irs-form-990/202201369349307130_public.xml,JEFFREY DUNN,,,PRESIDENT/CEO (THRU 12/31/2020),796494.0,0.0,285780.0,0.0,7953.0,0.0,28500.0,0.0,33082.0,0.0,1151809.0,0.0,0.0,0.0
1447368,https://s3.amazonaws.com/irs-form-990/201923169349303832_public.xml,JOHN VALONE,,,BUSINESS MANAGER/PRESIDENT,167753.0,0.0,0.0,0.0,4931.0,0.0,41458.0,0.0,32964.0,0.0,247106.0,0.0,0.0,0.0
1299581,https://s3.amazonaws.com/irs-form-990/201901939349301705_public.xml,SEAN WHITELEY-ROSS,,,SECRETARY/TREASURER,0.0,243536.0,0.0,43061.0,0.0,0.0,0.0,14391.0,0.0,9781.0,0.0,310769.0,0.0,0.0
1696052,https://s3.amazonaws.com/irs-form-990/202001189349300750_public.xml,ROBERT S REYNOLDS,,,,173675.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,173675.0,0.0,0.0,0.0
1788452,https://s3.amazonaws.com/irs-form-990/202212279349304671_public.xml,J Todd Harbour,,,General Manager & COO,342069.0,,64000.0,,2700.0,,9750.0,,,,418519.0,,,


In [39]:
# Show floats with one decimal place instead of scientific notation.
pd.set_option('display.float_format', lambda value: '{:.1f}'.format(value))

In [40]:
# Summary statistics for the numeric columns of the collapsed frame, one row per variable.
dfa2.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
max_comp_SJ_02_PC_COMP_BASE,698701.0,213545.6,20950527.3,-316657.0,4949.0,163900.0,240000.0,17511315603.0
max_comp_SJ_02_PC_COMP_BASE_RELATED,605657.0,93799.0,174779.3,-11468.0,0.0,0.0,163912.0,12403566.0
max_comp_SJ_02_PC_COMP_BONUS,592999.0,29953.3,205088.4,-68450.0,0.0,0.0,7850.0,40364000.0
max_comp_SJ_02_PC_COMP_BONUS_RELATED,573685.0,16346.0,185914.2,-52964.0,0.0,0.0,0.0,40364000.0
max_comp_SJ_02_PC_COMP_OTHER,588830.0,22527.6,337758.3,-557719.0,0.0,0.0,758.0,183883428.0
max_comp_SJ_02_PC_COMP_OTHER_RELATED,575745.0,11074.3,137731.3,-44000.0,0.0,0.0,0.0,29659381.0
max_comp_SJ_02_PC_COMP_DEFERRED,632483.0,18644.6,95400.0,-2508768.0,0.0,4134.0,16200.0,19325730.0
max_comp_SJ_02_PC_COMP_DEFERRED_RELATED,589886.0,10341.6,91454.9,-832814.0,0.0,0.0,5137.8,47206737.0
max_comp_SJ_02_PC_NONTAXED_BENF,639124.0,12255.4,18980.7,-39446.0,0.0,7245.0,19170.0,1809430.0
max_comp_SJ_02_PC_NONTAXED_BENF_RELATED,593584.0,6327.7,17866.0,-1520001.0,0.0,0.0,8373.0,7206054.0


#### Save DF

In [41]:
# Final check before saving: person-level rows, collapsed rows, and distinct
# filings in the person-level frame (the last two should match). The distinct
# count is taken on the column instead of through a Python list and set.
print(len(df))
print(len(dfa2))
print(df['URL'].nunique(dropna=False))

2972064
743685
743685


In [42]:
# Last look at two random filings before saving (no seed is set, so the sample differs on every run).
dfa2.sample(2)

Unnamed: 0,URL,max_comp_SJ_02_PC_NAME_OFF_TRST_KEYEMP,max_comp_SJ_02_PC_NAME_OFF_TRST_KEYEMP_L1,max_comp_SJ_02_PC_NAME_OFF_TRST_KEYEMP_L2,max_comp_SJ_02_PC_TITLE,max_comp_SJ_02_PC_COMP_BASE,max_comp_SJ_02_PC_COMP_BASE_RELATED,max_comp_SJ_02_PC_COMP_BONUS,max_comp_SJ_02_PC_COMP_BONUS_RELATED,max_comp_SJ_02_PC_COMP_OTHER,max_comp_SJ_02_PC_COMP_OTHER_RELATED,max_comp_SJ_02_PC_COMP_DEFERRED,max_comp_SJ_02_PC_COMP_DEFERRED_RELATED,max_comp_SJ_02_PC_NONTAXED_BENF,max_comp_SJ_02_PC_NONTAXED_BENF_RELATED,max_comp_SJ_02_PC_COMP_TOTAL,max_comp_SJ_02_PC_COMP_TOTAL_RELATED,max_comp_SJ_02_PC_COMP_DEF_PRIOR,max_comp_SJ_02_PC_COMP_DEF_PRIOR_RELATED
321750,https://s3.amazonaws.com/irs-form-990/201342269349300314_public.xml,DANIEL W HOLBERT,,,,0.0,139536.0,0.0,23158.0,0.0,10443.0,0.0,5496.0,0.0,8553.0,0.0,187186.0,0.0,0.0
2451763,https://s3.amazonaws.com/irs-form-990/202442579349301034_public.xml,MOLLY SWAIN,,,FRMR INTERIM EXECUTIVE DIRECTOR,0.0,149995.0,0.0,18799.0,0.0,3855.0,0.0,18119.0,0.0,11445.0,0.0,202213.0,0.0,0.0


In [43]:
%%time
import datetime
print ("Current date and time : ", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), '\n')
dfa2.to_pickle('Schedule J Part II Variables for Highest Salary per Filing (N=743,685).pkl.gz', compression='gzip')

Current date and time :  2025-06-27 14:13:08 

CPU times: total: 52.9 s
Wall time: 54.9 s
