# Overview
Create combined codebook and save files:
- *Codebook - Schedule J (Part I).pkl*
- *Codebook - Schedule J (Part I).xlsx*

In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame
from pandas import Series

In [2]:
print(pd.__version__)

1.4.3


In [3]:
from platform import python_version
print(python_version())

3.8.13


In [4]:
# http://pandas.pydata.org/pandas-docs/stable/options.html
pd.set_option('display.max_columns', None)
# http://pandas.pydata.org/pandas-docs/stable/options.html
pd.set_option('display.max_colwidth', 500)

In [5]:
pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [6]:
pd.options.display.float_format = '{:,.2f}'.format

#### Set working directory

In [7]:
cd "C:\\Users\\Gregory\\IRS 990 Control Variables\\"

C:\Users\Gregory\IRS 990 Control Variables


# Read in Control Variables

In [8]:
%%time
import datetime
print ("Current date and time : ", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), '\n')
df = pd.read_pickle('Excise Tax Project - Schedule J (Part I) - parsed.pkl.gz', compression='gzip')
print('# of columns:', len(df.columns))
print('# of observations:', len(df))
df[:1]

Current date and time :  2023-01-09 14:13:00 

# of columns: 29
# of observations: 497701
Wall time: 1.03 s


Unnamed: 0,OrganizationName,URL,EIN,SJ_01_PC_BOARD_APPROVAL,SJ_01_PC_CLUB_FEES,SJ_01_PC_COMPANION_TRAVEL,SJ_01_PC_COMPENSATION_COMMITTEE,SJ_01_PC_COMPENSATION_SURVEY,SJ_01_PC_CONSULTANT,SJ_01_PC_CONTINGENT_NET_OWN,SJ_01_PC_CONTINGENT_NET_RELATED,SJ_01_PC_CONTINGENT_REV_OWN,SJ_01_PC_CONTINGENT_REV_RELATED,SJ_01_PC_CONTRACT,SJ_01_PC_CONTRACT_EXCEPTION,SJ_01_PC_DISCRETIONARY_ACCOUNT,SJ_01_PC_EQUITY_BASED_COMP,SJ_01_PC_FIRST_CLASS_TRAVEL,SJ_01_PC_HOME_OFFICE_SUBSIDY,SJ_01_PC_HOUSING_ALLOWANCE,SJ_01_PC_INDEMNIFICATION,SJ_01_PC_NON_FIXED_PAYMENTS,SJ_01_PC_OTHER_ORGS_990,SJ_01_PC_PERSONAL_SERVICES,SJ_01_PC_REBUTTABLE_PRESUMPTION,SJ_01_PC_SEVERANCE,SJ_01_PC_SUBSTANTIATION_REQUIRED,SJ_01_PC_SUPPLEMENTAL_RETIREMENT,SJ_01_PC_WRITTEN_POLICY
0,TORRINGTON VOA ELDERLY HOUSING INC BELL PARK TOWER,https://s3.amazonaws.com/irs-form-990/201113139349301311_public.xml,581805618,1.0,,,1.0,1.0,,0.0,0.0,0.0,0.0,,0.0,,0.0,,,,,0.0,,,0.0,0.0,,1.0,


# Read in Concordance File

In [9]:
%%time
concordance = pd.read_excel('concordance - Schedule J (Part I).xlsx')
print('# of columns:', len(concordance.columns))
print('# of observations:', len(concordance))
concordance[:1]

# of columns: 15
# of observations: 52
Wall time: 2.4 s


Unnamed: 0,xpath,variable_name_new,# of Characters (newly named),variable name notes,PARSING NOTES,OTHER NOTES,description,location_code,part,data_type_xsd,fill_null,BINARIZE,MongoDB_Name,sub_key,sub_sub_key
0,/Return/ReturnData/IRS990ScheduleJ/ClubDuesOrFees,SJ_01_PC_CLUB_FEES,,,,,Club dues or fees,SCHED-J-PART-01-LINE-1a,PART-01,CheckboxType,,binarize,ClubDuesOrFees,,


In [10]:
def agg_funcs(x):
    names = {
        'description': x['description'].head(1).values[0],
        'location_code': x['location_code'].head(1).values[0],        
        'data_type_xsd': x['data_type_xsd'].head(1).values[0],
        #'python_data_type': x['python_data_type'].head(1).values[0],
        #'PARSING NOTES': x['PARSING NOTES'].head(1).values[0],       
        #'OTHER NOTES': x['OTHER NOTES'].head(1).values[0],       
    }
    #THE FOLLOWING SHORTCUT WORKS BUT CHANGES THE ORDER OF THE COLUMNS
    #return pd.Series(names, index = list(names.keys()))
    return pd.Series(names, index=['description', 'location_code', 'data_type_xsd', #'python_data_type', 
                                  #'PARSING NOTES', 'OTHER NOTES'
                                  ])
new_variables_df = concordance.groupby(['variable_name_new']).apply(agg_funcs)
new_variables_df = new_variables_df.reset_index()
print('# of variables:', len(new_variables_df))
new_variables_df[:]

# of variables: 26


Unnamed: 0,variable_name_new,description,location_code,data_type_xsd
0,SJ_01_PC_BOARD_APPROVAL,Board or committee approval,SCHED-J-PART-01-LINE-3,CheckboxType
1,SJ_01_PC_CLUB_FEES,Club dues or fees,SCHED-J-PART-01-LINE-1a,CheckboxType
2,SJ_01_PC_COMPANION_TRAVEL,Travel for companions,SCHED-J-PART-01-LINE-1a,CheckboxType
3,SJ_01_PC_COMPENSATION_COMMITTEE,Compensation committee,SCHED-J-PART-01-LINE-3,CheckboxType
4,SJ_01_PC_COMPENSATION_SURVEY,Compensation survey,SCHED-J-PART-01-LINE-3,CheckboxType
5,SJ_01_PC_CONSULTANT,Independent consultant,SCHED-J-PART-01-LINE-3,CheckboxType
6,SJ_01_PC_CONTINGENT_NET_OWN,Compensation based on net earnings of filing org?,SCHED-J-PART-01-LINE-6a,BooleanType
7,SJ_01_PC_CONTINGENT_NET_RELATED,Compensation based on net earnings of related orgs?,SCHED-J-PART-01-LINE-6b,BooleanType
8,SJ_01_PC_CONTINGENT_REV_OWN,Compensation based on revenue of filing org?,SCHED-J-PART-01-LINE-5a,BooleanType
9,SJ_01_PC_CONTINGENT_REV_RELATED,Compensation based on revenue of related orgs?,SCHED-J-PART-01-LINE-5b,BooleanType


In [11]:
concordance[concordance['description'].isnull()]

Unnamed: 0,xpath,variable_name_new,# of Characters (newly named),variable name notes,PARSING NOTES,OTHER NOTES,description,location_code,part,data_type_xsd,fill_null,BINARIZE,MongoDB_Name,sub_key,sub_sub_key


In [12]:
new_variables_df[new_variables_df['description'].isnull()]

Unnamed: 0,variable_name_new,description,location_code,data_type_xsd


In [13]:
for index, row in new_variables_df.iterrows():
    #if pd.isnull(row['python_data_type']):
    #    print('Missing *python_data_type*', row['variable_name_new'], row['python_data_type'], '\n')
    if pd.isnull(row['location_code']):
        print('Missing *location_code*',  row['variable_name_new'], '.....', row['location_code'], row['description'], '\n')         
    elif pd.isnull(row['data_type_xsd']):
        print('Missing *data_type_xsd*', row['variable_name_new'], row['data_type_xsd'], '\n')
    elif pd.isnull(row['description']):
        print('Missing *description*', row['variable_name_new'], '.....', row['description'])         

In [14]:
set(new_variables_df['variable_name_new'].tolist()) - set(df.columns.tolist())

set()

In [15]:
set(df.columns.tolist()) - set(new_variables_df['variable_name_new'].tolist())

{'EIN', 'OrganizationName', 'URL'}

In [16]:
df[:1]

Unnamed: 0,OrganizationName,URL,EIN,SJ_01_PC_BOARD_APPROVAL,SJ_01_PC_CLUB_FEES,SJ_01_PC_COMPANION_TRAVEL,SJ_01_PC_COMPENSATION_COMMITTEE,SJ_01_PC_COMPENSATION_SURVEY,SJ_01_PC_CONSULTANT,SJ_01_PC_CONTINGENT_NET_OWN,SJ_01_PC_CONTINGENT_NET_RELATED,SJ_01_PC_CONTINGENT_REV_OWN,SJ_01_PC_CONTINGENT_REV_RELATED,SJ_01_PC_CONTRACT,SJ_01_PC_CONTRACT_EXCEPTION,SJ_01_PC_DISCRETIONARY_ACCOUNT,SJ_01_PC_EQUITY_BASED_COMP,SJ_01_PC_FIRST_CLASS_TRAVEL,SJ_01_PC_HOME_OFFICE_SUBSIDY,SJ_01_PC_HOUSING_ALLOWANCE,SJ_01_PC_INDEMNIFICATION,SJ_01_PC_NON_FIXED_PAYMENTS,SJ_01_PC_OTHER_ORGS_990,SJ_01_PC_PERSONAL_SERVICES,SJ_01_PC_REBUTTABLE_PRESUMPTION,SJ_01_PC_SEVERANCE,SJ_01_PC_SUBSTANTIATION_REQUIRED,SJ_01_PC_SUPPLEMENTAL_RETIREMENT,SJ_01_PC_WRITTEN_POLICY
0,TORRINGTON VOA ELDERLY HOUSING INC BELL PARK TOWER,https://s3.amazonaws.com/irs-form-990/201113139349301311_public.xml,581805618,1.0,,,1.0,1.0,,0.0,0.0,0.0,0.0,,0.0,,0.0,,,,,0.0,,,0.0,0.0,,1.0,


In [17]:
print(len(df))
#df['501c3'].value_counts()

497701


In [18]:
no_conc = ['EIN',
 'OrganizationName',
 'URL']
vals = ['EIN', 
        'Organization Name',  
        'Unique identifier for filing -- use for merging with filings-level datasets (also for seeing raw filing online)']

#list(zip(lst, lst2)),
no_concordance_df = pd.DataFrame(list(zip(no_conc,vals)), columns =['variable_name_new', 'description'])
no_concordance_df

Unnamed: 0,variable_name_new,description
0,EIN,EIN
1,OrganizationName,Organization Name
2,URL,Unique identifier for filing -- use for merging with filings-level datasets (also for seeing raw filing online)


In [19]:
print(len(new_variables_df))
print(len(new_variables_df.append(no_concordance_df)))
new_variables_df = new_variables_df.append(no_concordance_df)
print(len(new_variables_df))

26
29
29


  print(len(new_variables_df.append(no_concordance_df)))
  new_variables_df = new_variables_df.append(no_concordance_df)


In [20]:
new_variables_df[new_variables_df['variable_name_new'].isin(no_conc)]

Unnamed: 0,variable_name_new,description,location_code,data_type_xsd
0,EIN,EIN,,
1,OrganizationName,Organization Name,,
2,URL,Unique identifier for filing -- use for merging with filings-level datasets (also for seeing raw filing online),,


### Inspect

In [21]:
new_variables_df[new_variables_df['description'].isnull()]

Unnamed: 0,variable_name_new,description,location_code,data_type_xsd


In [22]:
set(new_variables_df['variable_name_new'].tolist()) - set(df.columns.tolist())

set()

In [23]:
set(df.columns.tolist()) - set(new_variables_df['variable_name_new'].tolist())

set()

In [24]:
df[:1]

Unnamed: 0,OrganizationName,URL,EIN,SJ_01_PC_BOARD_APPROVAL,SJ_01_PC_CLUB_FEES,SJ_01_PC_COMPANION_TRAVEL,SJ_01_PC_COMPENSATION_COMMITTEE,SJ_01_PC_COMPENSATION_SURVEY,SJ_01_PC_CONSULTANT,SJ_01_PC_CONTINGENT_NET_OWN,SJ_01_PC_CONTINGENT_NET_RELATED,SJ_01_PC_CONTINGENT_REV_OWN,SJ_01_PC_CONTINGENT_REV_RELATED,SJ_01_PC_CONTRACT,SJ_01_PC_CONTRACT_EXCEPTION,SJ_01_PC_DISCRETIONARY_ACCOUNT,SJ_01_PC_EQUITY_BASED_COMP,SJ_01_PC_FIRST_CLASS_TRAVEL,SJ_01_PC_HOME_OFFICE_SUBSIDY,SJ_01_PC_HOUSING_ALLOWANCE,SJ_01_PC_INDEMNIFICATION,SJ_01_PC_NON_FIXED_PAYMENTS,SJ_01_PC_OTHER_ORGS_990,SJ_01_PC_PERSONAL_SERVICES,SJ_01_PC_REBUTTABLE_PRESUMPTION,SJ_01_PC_SEVERANCE,SJ_01_PC_SUBSTANTIATION_REQUIRED,SJ_01_PC_SUPPLEMENTAL_RETIREMENT,SJ_01_PC_WRITTEN_POLICY
0,TORRINGTON VOA ELDERLY HOUSING INC BELL PARK TOWER,https://s3.amazonaws.com/irs-form-990/201113139349301311_public.xml,581805618,1.0,,,1.0,1.0,,0.0,0.0,0.0,0.0,,0.0,,0.0,,,,,0.0,,,0.0,0.0,,1.0,


In [25]:
new_variables_df[:1]

Unnamed: 0,variable_name_new,description,location_code,data_type_xsd
0,SJ_01_PC_BOARD_APPROVAL,Board or committee approval,SCHED-J-PART-01-LINE-3,CheckboxType


#### Reset Index

In [26]:
new_variables_df = new_variables_df.set_index('variable_name_new')
new_variables_df[:5]

Unnamed: 0_level_0,description,location_code,data_type_xsd
variable_name_new,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SJ_01_PC_BOARD_APPROVAL,Board or committee approval,SCHED-J-PART-01-LINE-3,CheckboxType
SJ_01_PC_CLUB_FEES,Club dues or fees,SCHED-J-PART-01-LINE-1a,CheckboxType
SJ_01_PC_COMPANION_TRAVEL,Travel for companions,SCHED-J-PART-01-LINE-1a,CheckboxType
SJ_01_PC_COMPENSATION_COMMITTEE,Compensation committee,SCHED-J-PART-01-LINE-3,CheckboxType
SJ_01_PC_COMPENSATION_SURVEY,Compensation survey,SCHED-J-PART-01-LINE-3,CheckboxType


In [27]:
len(new_variables_df)

29

In [28]:
len(df.columns.tolist())

29

In [29]:
print(len(new_variables_df.reindex(df.columns.tolist())))
new_variables_df = new_variables_df.reindex(df.columns.tolist())
print(len(new_variables_df))

29
29


In [30]:
print(len(new_variables_df.reindex(df.columns.tolist())))
new_variables_df = new_variables_df.reset_index()
print(len(new_variables_df))
new_variables_df[:1]

29
29


Unnamed: 0,variable_name_new,description,location_code,data_type_xsd
0,OrganizationName,Organization Name,,


In [31]:
new_variables_df

Unnamed: 0,variable_name_new,description,location_code,data_type_xsd
0,OrganizationName,Organization Name,,
1,URL,Unique identifier for filing -- use for merging with filings-level datasets (also for seeing raw filing online),,
2,EIN,EIN,,
3,SJ_01_PC_BOARD_APPROVAL,Board or committee approval,SCHED-J-PART-01-LINE-3,CheckboxType
4,SJ_01_PC_CLUB_FEES,Club dues or fees,SCHED-J-PART-01-LINE-1a,CheckboxType
5,SJ_01_PC_COMPANION_TRAVEL,Travel for companions,SCHED-J-PART-01-LINE-1a,CheckboxType
6,SJ_01_PC_COMPENSATION_COMMITTEE,Compensation committee,SCHED-J-PART-01-LINE-3,CheckboxType
7,SJ_01_PC_COMPENSATION_SURVEY,Compensation survey,SCHED-J-PART-01-LINE-3,CheckboxType
8,SJ_01_PC_CONSULTANT,Independent consultant,SCHED-J-PART-01-LINE-3,CheckboxType
9,SJ_01_PC_CONTINGENT_NET_OWN,Compensation based on net earnings of filing org?,SCHED-J-PART-01-LINE-6a,BooleanType


In [32]:
set(new_variables_df['variable_name_new'].tolist()) - set(df.columns.tolist())

set()

In [33]:
set(df.columns.tolist()) - set(new_variables_df['variable_name_new'].tolist()) 

set()

In [34]:
print(len(new_variables_df))
new_variables_df[:1]

29


Unnamed: 0,variable_name_new,description,location_code,data_type_xsd
0,OrganizationName,Organization Name,,


#### Save DF

In [37]:
%%time
import datetime
print ("Current date and time : ", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), '\n')
new_variables_df.to_pickle('Codebook - Schedule J (Part I).pkl')

In [35]:
%%time
import datetime
print ("Current date and time : ", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), '\n')
new_variables_df.to_excel('Codebook - Schedule J (Part I) v2.xlsx', index=False)

Current date and time :  2023-01-09 14:14:06 

Wall time: 708 ms
