# Overview
Build a combined codebook (one row per variable in the Schedule J Part II person-level data) and save it as:
- *Codebook - Schedule J (Part II).pkl*
- *Codebook - Schedule J (Part II).xlsx*

In [8]:
import datetime
from platform import python_version

import numpy as np
import pandas as pd
from pandas import DataFrame
from pandas import Series

In [9]:
# Print the installed pandas version.
print(pd.__version__)

1.3.4


In [10]:
# Print the Python interpreter version (python_version is imported in the import cell).
print(python_version())

3.8.12


In [11]:
# Display settings (reference: http://pandas.pydata.org/pandas-docs/stable/options.html):
# render every column, and up to 500 characters of each cell, when a frame is shown.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 500

In [12]:
# Format displayed floats with two decimals.
# NOTE(review): the next cell assigns display.float_format again ('{:,.2f}'.format),
# so this formatter is superseded as soon as that cell runs.
pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [13]:
# Show floats with thousands separators and two decimals, e.g. 1,234.50.
pd.set_option('display.float_format', '{:,.2f}'.format)

#### Set working directory

In [14]:
# Change the kernel's working directory; the file names used in later cells are relative to it.
# NOTE(review): hard-coded, machine-specific absolute path -- the notebook will not run
# elsewhere without editing this line.
cd "C:\\Users\\Gregory\\IRS 990 Control Variables\\"

C:\Users\Gregory\IRS 990 Control Variables


# Read in Schedule J Part II Data (person-level)

In [15]:
%%time
import datetime
print ("Current date and time : ", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), '\n')
df = pd.read_pickle('Excise Tax Project - Schedule J Part II (PERSON-LEVEL DF) parsed.pkl.gz', compression='gzip')
print('# of columns:', len(df.columns))
print('# of observations:', len(df))
df[:1]

Current date and time :  2022-02-24 16:23:57 

# of columns: 19
# of observations: 1914476
Wall time: 2.99 s


Unnamed: 0,URL,SJ_02_PC_NAME_OFF_TRST_KEYEMP,SJ_02_PC_NAME_OFF_TRST_KEYEMP_L1,SJ_02_PC_NAME_OFF_TRST_KEYEMP_L2,SJ_02_PC_TITLE,SJ_02_PC_COMP_BASE,SJ_02_PC_COMP_BASE_RELATED,SJ_02_PC_COMP_BONUS,SJ_02_PC_COMP_BONUS_RELATED,SJ_02_PC_COMP_OTHER,SJ_02_PC_COMP_OTHER_RELATED,SJ_02_PC_COMP_DEFERRED,SJ_02_PC_COMP_DEFERRED_RELATED,SJ_02_PC_NONTAXED_BENF,SJ_02_PC_NONTAXED_BENF_RELATED,SJ_02_PC_COMP_TOTAL,SJ_02_PC_COMP_TOTAL_RELATED,SJ_02_PC_COMP_DEF_PRIOR,SJ_02_PC_COMP_DEF_PRIOR_RELATED
0,https://s3.amazonaws.com/irs-form-990/201113139349301311_public.xml,THOMAS D TURNBULL,,,,,100712.0,,790.0,,1257.0,,54308.0,,62342.0,,219409.0,,


# Read in Concordance File

In [16]:
%%time
import datetime
print ("Current date and time : ", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), '\n')
concordance = pd.read_excel('concordance - Schedule J Part II (VERIFIED).xlsx')
print('# of columns:', len(concordance.columns))
print('# of observations:', len(concordance))
concordance[:1]

Current date and time :  2022-02-24 16:24:03 

# of columns: 17
# of observations: 38
Wall time: 433 ms


Unnamed: 0,xpath,variable_name_new,# of Characters (newly named),variable name notes,PARSING NOTES,OTHER NOTES,description,location_code,part,data_type_xsd,MongoDB_Name,sub_key,sub_sub_key,xpath_top_full,xpath_top,xpath_top_len,xpath_second
0,/Return/ReturnData/IRS990ScheduleJ/Form990ScheduleJPartII/NamePerson,SJ_02_PC_NAME_OFF_TRST_KEYEMP,,,,,Name of officer - person,SCHED-J-PART-02-COL-A-(i),PART-02,PersonNameType,NamePerson,NamePerson,,Form990ScheduleJPartII/NamePerson,Form990ScheduleJPartII,2,


In [17]:
def agg_funcs(x):
    """Reduce one group of concordance rows to the metadata on its first row.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows of one group; must contain the columns 'description',
        'location_code' and 'data_type_xsd'.

    Returns
    -------
    pandas.Series
        The first row's value for each of those three columns, indexed by
        column name in that fixed order.
    """
    # The order of this list fixes the column order of the grouped result.
    # ('python_data_type', 'PARSING NOTES' and 'OTHER NOTES' used to be carried
    # as well; add them here to bring them back.)
    kept_columns = ['description', 'location_code', 'data_type_xsd']
    first_values = {column: x[column].values[0] for column in kept_columns}
    return pd.Series(first_values, index=kept_columns)
# Build the codebook: one row per renamed variable, carrying the metadata that
# agg_funcs takes from the first concordance row of each variable.
new_variables_df = (
    concordance
    .groupby(['variable_name_new'])
    .apply(agg_funcs)
    .reset_index()
)
print('# of variables:', len(new_variables_df))
new_variables_df[:]

# of variables: 18


Unnamed: 0,variable_name_new,description,location_code,data_type_xsd
0,SJ_02_PC_COMP_BASE,Base compensation ($) from filing organization,SCHED-J-PART-02-COL-B(i)-(i),USAmountType
1,SJ_02_PC_COMP_BASE_RELATED,Compensation based on related organizations?,SCHED-J-PART-02-COL-B(i)-(ii),USAmountType
2,SJ_02_PC_COMP_BONUS,Bonus and incentive compensation ($) from filing organization,SCHED-J-PART-02-COL-B(ii)-(i),USAmountType
3,SJ_02_PC_COMP_BONUS_RELATED,Bonus and incentive compensation ($) from related organizations,SCHED-J-PART-02-COL-B(ii)-(ii),USAmountType
4,SJ_02_PC_COMP_DEFERRED,Deferred compensation ($) from filing organization,SCHED-J-PART-02-COL-C-(i),USAmountType
5,SJ_02_PC_COMP_DEFERRED_RELATED,Deferred compensation ($) from related organizations,SCHED-J-PART-02-COL-C-(ii),USAmountType
6,SJ_02_PC_COMP_DEF_PRIOR,Comp reported prior 990 - from filing org,SCHED-J-PART-02-COL-F-(i),USAmountType
7,SJ_02_PC_COMP_DEF_PRIOR_RELATED,Comp reported prior 990 - from related orgs,SCHED-J-PART-02-COL-F-(ii),USAmountType
8,SJ_02_PC_COMP_OTHER,Other compensation ($) from filing organization,SCHED-J-PART-02-COL-B(iii)-(i),USAmountType
9,SJ_02_PC_COMP_OTHER_RELATED,Other compensation ($) from realted organizations,SCHED-J-PART-02-COL-B(iii)-(ii),USAmountType


In [18]:
# Concordance rows that have no description.
concordance.loc[concordance['description'].isna()]

Unnamed: 0,xpath,variable_name_new,# of Characters (newly named),variable name notes,PARSING NOTES,OTHER NOTES,description,location_code,part,data_type_xsd,MongoDB_Name,sub_key,sub_sub_key,xpath_top_full,xpath_top,xpath_top_len,xpath_second


In [19]:
# Codebook rows that have no description.
new_variables_df.loc[new_variables_df['description'].isna()]

Unnamed: 0,variable_name_new,description,location_code,data_type_xsd


In [20]:
# Report codebook variables with missing metadata. Fields are checked in the order
# location_code, data_type_xsd, description, and only the FIRST missing field of a
# variable is printed -- later fields of that same variable are not examined.
# (A similar check on python_data_type was previously disabled; that column is not built.)
for row_label, variable in new_variables_df.iterrows():
    if pd.isnull(variable['location_code']):
        print('Missing *location_code*',  variable['variable_name_new'], '.....', variable['location_code'], variable['description'], '\n')
        continue
    if pd.isnull(variable['data_type_xsd']):
        print('Missing *data_type_xsd*', variable['variable_name_new'], variable['data_type_xsd'], '\n')
        continue
    if pd.isnull(variable['description']):
        print('Missing *description*', variable['variable_name_new'], '.....', variable['description'])

In [21]:
# Codebook variables that are not columns of df.
set(new_variables_df['variable_name_new']).difference(df.columns)

set()

In [22]:
# df columns that have no codebook entry yet.
set(df.columns).difference(new_variables_df['variable_name_new'])

{'URL'}

In [23]:
# Number of observations in df.
print(df.shape[0])

1914476


In [24]:
# Variables that are columns of df but have no concordance entry, with hand-written
# descriptions; the two lists are paired position by position.
no_conc = ['URL']
vals = ['Unique identifier for filing -- use for merging with filings-level datasets (also for seeing raw filing online)']

no_concordance_df = pd.DataFrame({'variable_name_new': no_conc, 'description': vals})
no_concordance_df

Unnamed: 0,variable_name_new,description
0,URL,Unique identifier for filing -- use for merging with filings-level datasets (also for seeing raw filing online)


In [25]:
# Add the hand-described variables (those without a concordance entry) to the codebook.
# pd.concat is used because DataFrame.append is deprecated since pandas 1.4 and removed
# in pandas 2.0. Rows whose variable is already in the codebook are skipped, so running
# this cell a second time no longer duplicates them.
# The three prints show the length before, the length of the combined frame, and the
# length after assignment.
already_present = no_concordance_df['variable_name_new'].isin(new_variables_df['variable_name_new'])
combined_df = pd.concat([new_variables_df, no_concordance_df[~already_present]])
print(len(new_variables_df))
print(len(combined_df))
new_variables_df = combined_df
print(len(new_variables_df))

18
19
19


In [26]:
# Show the codebook rows for the variables listed in no_conc.
new_variables_df.loc[lambda frame: frame['variable_name_new'].isin(no_conc)]

Unnamed: 0,variable_name_new,description,location_code,data_type_xsd
0,URL,Unique identifier for filing -- use for merging with filings-level datasets (also for seeing raw filing online),,


### Inspect

In [27]:
# Codebook rows that have no description.
new_variables_df.loc[new_variables_df['description'].isna()]

Unnamed: 0,variable_name_new,description,location_code,data_type_xsd


In [28]:
# Codebook variables that are not columns of df.
set(new_variables_df['variable_name_new']).difference(df.columns)

set()

In [29]:
# df columns that have no codebook entry.
set(df.columns).difference(new_variables_df['variable_name_new'])

set()

In [30]:
# Preview the first row of df.
df.head(1)

Unnamed: 0,URL,SJ_02_PC_NAME_OFF_TRST_KEYEMP,SJ_02_PC_NAME_OFF_TRST_KEYEMP_L1,SJ_02_PC_NAME_OFF_TRST_KEYEMP_L2,SJ_02_PC_TITLE,SJ_02_PC_COMP_BASE,SJ_02_PC_COMP_BASE_RELATED,SJ_02_PC_COMP_BONUS,SJ_02_PC_COMP_BONUS_RELATED,SJ_02_PC_COMP_OTHER,SJ_02_PC_COMP_OTHER_RELATED,SJ_02_PC_COMP_DEFERRED,SJ_02_PC_COMP_DEFERRED_RELATED,SJ_02_PC_NONTAXED_BENF,SJ_02_PC_NONTAXED_BENF_RELATED,SJ_02_PC_COMP_TOTAL,SJ_02_PC_COMP_TOTAL_RELATED,SJ_02_PC_COMP_DEF_PRIOR,SJ_02_PC_COMP_DEF_PRIOR_RELATED
0,https://s3.amazonaws.com/irs-form-990/201113139349301311_public.xml,THOMAS D TURNBULL,,,,,100712.0,,790.0,,1257.0,,54308.0,,62342.0,,219409.0,,


In [31]:
# Preview the first row of the codebook.
new_variables_df.head(1)

Unnamed: 0,variable_name_new,description,location_code,data_type_xsd
0,SJ_02_PC_COMP_BASE,Base compensation ($) from filing organization,SCHED-J-PART-02-COL-B(i)-(i),USAmountType


#### Reorder codebook rows to match the DF column order

In [32]:
# Index the codebook by variable name so its rows can be looked up and reordered by name.
new_variables_df = new_variables_df.set_index('variable_name_new')
new_variables_df.head()

Unnamed: 0_level_0,description,location_code,data_type_xsd
variable_name_new,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SJ_02_PC_COMP_BASE,Base compensation ($) from filing organization,SCHED-J-PART-02-COL-B(i)-(i),USAmountType
SJ_02_PC_COMP_BASE_RELATED,Compensation based on related organizations?,SCHED-J-PART-02-COL-B(i)-(ii),USAmountType
SJ_02_PC_COMP_BONUS,Bonus and incentive compensation ($) from filing organization,SCHED-J-PART-02-COL-B(ii)-(i),USAmountType
SJ_02_PC_COMP_BONUS_RELATED,Bonus and incentive compensation ($) from related organizations,SCHED-J-PART-02-COL-B(ii)-(ii),USAmountType
SJ_02_PC_COMP_DEFERRED,Deferred compensation ($) from filing organization,SCHED-J-PART-02-COL-C-(i),USAmountType


In [33]:
# Number of codebook rows.
new_variables_df.shape[0]

19

In [34]:
# Number of columns in df.
df.shape[1]

19

In [35]:
# Put the codebook rows in the same order as the columns of df.
# The reordered length is printed before and after the assignment.
column_order = df.columns.tolist()
reordered_df = new_variables_df.reindex(column_order)
print(len(reordered_df))
new_variables_df = reordered_df
print(len(new_variables_df))

19
19


In [36]:
# Turn the variable-name index back into an ordinary column.
# The first print now measures reset_index(), the operation this cell actually applies;
# it previously measured a reindex(...) call copied from the preceding cell.
print(len(new_variables_df.reset_index()))
new_variables_df = new_variables_df.reset_index()
print(len(new_variables_df))
new_variables_df[:1]

19
19


Unnamed: 0,variable_name_new,description,location_code,data_type_xsd
0,URL,Unique identifier for filing -- use for merging with filings-level datasets (also for seeing raw filing online),,


In [37]:
# Display the whole codebook.
new_variables_df

Unnamed: 0,variable_name_new,description,location_code,data_type_xsd
0,URL,Unique identifier for filing -- use for merging with filings-level datasets (also for seeing raw filing online),,
1,SJ_02_PC_NAME_OFF_TRST_KEYEMP,Name of officer - person,SCHED-J-PART-02-COL-A-(i),PersonNameType
2,SJ_02_PC_NAME_OFF_TRST_KEYEMP_L1,Name Business - BusinessNameLine1,SCHED-J-PART-02-COL-A-(i),BusinessNameLine1Type
3,SJ_02_PC_NAME_OFF_TRST_KEYEMP_L2,Name Business - BusinessNameLine2,SCHED-J-PART-02-COL-A-(ii),BusinessNameLine2Type
4,SJ_02_PC_TITLE,Form990 Schedule JPart II - Title,SCHED-J-PART-02-COL-A-(ii),LineExplanationType
5,SJ_02_PC_COMP_BASE,Base compensation ($) from filing organization,SCHED-J-PART-02-COL-B(i)-(i),USAmountType
6,SJ_02_PC_COMP_BASE_RELATED,Compensation based on related organizations?,SCHED-J-PART-02-COL-B(i)-(ii),USAmountType
7,SJ_02_PC_COMP_BONUS,Bonus and incentive compensation ($) from filing organization,SCHED-J-PART-02-COL-B(ii)-(i),USAmountType
8,SJ_02_PC_COMP_BONUS_RELATED,Bonus and incentive compensation ($) from related organizations,SCHED-J-PART-02-COL-B(ii)-(ii),USAmountType
9,SJ_02_PC_COMP_OTHER,Other compensation ($) from filing organization,SCHED-J-PART-02-COL-B(iii)-(i),USAmountType


In [38]:
# Codebook variables that are not columns of df.
set(new_variables_df['variable_name_new']).difference(df.columns)

set()

In [39]:
# df columns that have no codebook entry.
set(df.columns).difference(new_variables_df['variable_name_new'])

set()

In [40]:
# Row count and first row of the codebook.
print(new_variables_df.shape[0])
new_variables_df.head(1)

19


Unnamed: 0,variable_name_new,description,location_code,data_type_xsd
0,URL,Unique identifier for filing -- use for merging with filings-level datasets (also for seeing raw filing online),,


#### Save DF

In [41]:
%%time
import datetime
print ("Current date and time : ", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), '\n')
new_variables_df.to_pickle('Codebook - Schedule J (Part II).pkl')

Current date and time :  2022-02-24 16:24:28 

Wall time: 2.99 ms


In [37]:
%%time
import datetime
print ("Current date and time : ", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), '\n')
new_variables_df.to_excel('Codebook - Schedule J (Part II).xlsx', index=False)