In [251]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick

from sklearn.preprocessing import MinMaxScaler, RobustScaler, Normalizer, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

import os
import conda

import locale
from datetime import datetime

locale.setlocale(locale.LC_ALL, 'en_US.UTF8')

'en_US.UTF8'

In [252]:
pd.set_option('mode.chained_assignment','raise')
pd.options.display.max_rows = 4000
pd.options.display.max_columns = 4000

# About NIH awards data

NIH data was imported from [FY 2019 RePORTER Project Data](https://exporter.nih.gov/CSVs/final/RePORTER_PRJ_C_FY2019.zip) and the report listing page [exporter.nih.gov](https://exporter.nih.gov/ExPORTER_Catalog.aspx?sid=5&index=0). An [explanation of the fields](https://exporter.nih.gov/about.aspx) is helpful for analysis.

# Project structure

```bash session
├── NIH-awards.ipynb       # this jupyter notebook
├── data
│   ├── nih                # holds all exporter csv files
│   └── zips               # us-zip-code-latitude-and-longitude.csv
├── out
│   ├── csv                # any csv output
│   ├── json               # any json output
│   └── models             # any saved models
└── utils
    ├── get_csvs           # script to download all the exporter csv files (requires curl)
    └── reporter_files.txt # list of exporter files to get
```

# Import the data

In [253]:
# Run the get_csvs utility to download files into the `data` from exporter.nih.gov
# The file ./utils/reporter_files.txt contains a list of all files to get

# date_downloaded = !date -Is
# ! ./utils/get_csvs
# date_downloaded

In [265]:
files = !find ./data/exporter/ -name "*RePORTER_PRJ*" -exec echo "{}" \;
files = list(set(files) - set(['./data/exporter/RePORTER_PRJ_C_FY2018.csv', './data/exporter/RePORTER_PRJ_C_FY2017.csv', './data/exporter/RePORTER_PRJ_C_FY2016.csv']))
files.sort()
len(files)

34

In [266]:
all_cols = !head -n 1 './data/exporter/RePORTER_PRJ_C_FY2019.csv'
all_cols = all_cols.s.split(',')

In [267]:
# ignore these columns
ignore = ['ARRA_FUNDED', 'FOA_NUMBER', 'ED_INST_TYPE', 'ORG_FIPS', 'ORG_DUNS', 'ORG_IPF_CODE', 'CFDA_CODE']

In [268]:
cols = list(set(all_cols) - set(ignore))

In [316]:
def create_dataframe(name):
    '''
    Return a DataFrame with an added column `fromfile` to denote
    which file the data came from
    '''
    head, tail = os.path.split(name)
    _df = pd.read_csv(name, dtype={ 'ORG_ZIPCODE': 'str'}, usecols=cols, parse_dates=["AWARD_NOTICE_DATE","BUDGET_START","BUDGET_END"], encoding="ISO-8859-1")
    _df['fromfile'] = tail
    return _df

In [317]:
%%time
date_imported = !date -Is
v = [create_dataframe(name) for name in files]
date_imported

CPU times: user 4.54 s, sys: 588 ms, total: 5.13 s
Wall time: 6.72 s


['2020-05-26T18:50:21+00:00']

### Concat all DataFrames

In [402]:
%%time
df = pd.concat(v,ignore_index=True)

CPU times: user 419 ms, sys: 2.86 ms, total: 422 ms
Wall time: 430 ms


### Convert types

In [403]:
df["PROJECT_START"] = pd.to_datetime(df["PROJECT_START"], errors='coerce')
df["PROJECT_END"] = pd.to_datetime(df["PROJECT_END"], errors='coerce')
df["BUDGET_START"] = pd.to_datetime(df["BUDGET_START"], errors='coerce')
df["BUDGET_END"] = pd.to_datetime(df["BUDGET_END"], errors='coerce')
df['PROJECT_TERMS'] = df['PROJECT_TERMS'].astype(pd.StringDtype())
df['PI_NAMEs'] = df['PI_NAMEs'].astype(pd.StringDtype())

# Drop rows

Some rows will not be very useful for analysis if they:

- ~do not have associated start/end metrics~ Initially, we dropped these columns, under closer examination, their total costs were significant (over 6B). So, we will keep them, but will be unable to plot them in any time-series format.
- do not have associated costs
- are duplicates

In [404]:
before = len(df)
before

109784

In [405]:
# Keep only the rows with at least 2 non-NA values.
# thresh = 2
# df.dropna(subset=['AWARD_NOTICE_DATE', 'BUDGET_START', 'BUDGET_END'], thresh=thresh, how='all', inplace=True)

In [406]:
# num_dropped = before - len(df)
# dropped_pct = "{:.3%}".format(num_dropped / before)
# display(HTML(f'<h3>Dropped {num_dropped} out of {before} rows or <strong>{dropped_pct}</strong></h3>'))

In [407]:
# Keep only the rows with at least 1 non-NA values.
thresh = 1
df.dropna(subset=['DIRECT_COST_AMT', 'INDIRECT_COST_AMT', 'TOTAL_COST', 'TOTAL_COST_SUB_PROJECT'], thresh=thresh, how='all', inplace=True)

In [408]:
len(df)

106685

In [409]:
num_dropped = before - len(df)
dropped_pct = "{:.3%}".format(num_dropped / before)
display(HTML(f'<h3>Dropped {num_dropped} out of {before} rows or <strong>{dropped_pct}</strong></h3>'))

In [410]:
# Drop duplicate applications
# Duplicate application ids typically have the same application type, dates and cost amounts

print(f'Dropping {len(df.loc[df.duplicated("APPLICATION_ID")])} awards with duplicate application id')
df.drop_duplicates(subset=['APPLICATION_ID'], keep='last', inplace=True)

Dropping 670 awards with duplicate application id


In [411]:
from IPython.display import HTML

In [412]:
num_dropped = before - len(df)
dropped_pct = "{:.3%}".format(num_dropped / before)
display(HTML(f'<h3>Dropped {num_dropped} out of {before} rows or <strong>{dropped_pct}</strong></h3>'))

# Create a new dataframe limited to awards whose Administering Institute or Center is an NIH Agency and whose organization country is US.

Since the vast majority of awards in this dataset are administered through NIH agencies to organizations whose office is located in the US, lets restrict our analysis to a subset. 

In [413]:
import pprint
pp = pprint.PrettyPrinter(indent=4)

In [414]:
nih_agencies = [{"code":"AA","name":"NIH National Institute on Alcohol Abuse and Alcoholism (NIAAA)"},{"code":"OD","name":"Office of the Director, NIH"},{"code":"AG","name":"NIH National Institute on Aging (NIA)"},{"code":"AI","name":"NIH National Institute of Allergy and Infectious Diseases (NIAID)"},{"code":"AR","name":"NIH National Institute of Arthritis and Musculoskeletal and Skin Diseases (NIAMS)"},{"code":"AT","name":"NIH National Center for Complementary and Integrative Health (NCCIH)"},{"code":"CA","name":"NIH National Cancer Institute (NCI)"},{"code":"DA","name":"NIH National Institute on Drug Abuse (NIDA)"},{"code":"DC","name":"NIH National Institute on Deafness and Other Communication Disorders (NIDCD)"},{"code":"DE","name":"NIH National Institute of Dental & Craniofacial Research (NIDCR)"},{"code":"DK","name":"NIH National Institute of Diabetes and Digestive and Kidney Diseases (NIDDK)"},{"code":"EB","name":"NIH National Institute of Biomedical Imaging and Bioengineering (NIBIB)"},{"code":"ES","name":"NIH National Institute of Environmental Health Sciences (NIEHS)"},{"code":"EY","name":"NIH National Eye Institute (NEI)"},{"code":"GM","name":"NIH National Institute of General Medical Sciences (NIGMS)"},{"code":"IHS","name":"Indian Health Service"},{"code":"HD","name":"NIH Eunice Kennedy Shriver National Institute of Child Health and Human Development (NICHD)"},{"code":"HG","name":"NIH National Human Genome Research Institute (NHGRI)"},{"code":"HL","name":"NIH National Heart, Lung and Blood Institute (NHLBI)"},{"code":"LM","name":"NIH National Library of Medicine (NLM)"},{"code":"MD","name":"NIH National Institute on Minority Health and Health Disparities (NIMHD)"},{"code":"MH","name":"NIH National Institute of Mental Health (NIMH)"},{"code":"NR","name":"NIH National Institute of Nursing Research (NINR)"},{"code":"NS","name":"NIH National Institute of Neurological Disorders and Stroke (NINDS)"},{"code":"RM","name":"NIH Roadmap"},{"code":"RR","name":"National Center for Research Resources (NCRR) (dissolved 12/2011)"},{"code":"TR","name":"NIH National Center for Advancing translational Sciences (NCATS)"},{"code":"TW","name":"NIH Fogarty International Center (FIC)"}]
codes_list = [obj['code'] for obj in nih_agencies]
is_nih = df['ADMINISTERING_IC'].isin(codes_list)
is_us = df['ORG_COUNTRY'] == 'UNITED STATES'

In [415]:
nih_awards_pct = "{:.1%}".format(len(df.loc[is_nih]) / len(df))
display(HTML(f'<h3>{nih_awards_pct} of awards list an NIH agency as `ADMINISTERING_IC`</h3>'))

In [416]:
us_awards_pct = "{:.1%}".format(len(df.loc[is_us]) / len(df))
display(HTML(f'<h3>{us_awards_pct} of awards list the US as `ORG_COUNTRY`</h3><p>The NIH defines ORG_COUNTRY as <em>The country in which the business office of the grantee organization or contractor is located.  Note that this may be different from the research performance site</em>.<p>'))

In [417]:
df_us = df.loc[(is_nih) & (is_us)]

In [438]:
len(df)

106015

In [439]:
len(df_us)

100175

# Clean up some badly formatted zip codes

To properly map coordinates to zipcodes, zips must be in the correct format. The NIH dataset often has 

In [418]:
us_zips_non_dig = df_us['ORG_ZIPCODE'].str.contains('\D', na=False, regex=True)

In [419]:
df_us = df_us.copy()

In [420]:
df_us.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100175 entries, 0 to 109783
Data columns (total 40 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   APPLICATION_ID          100175 non-null  int64         
 1   ACTIVITY                100175 non-null  object        
 2   ADMINISTERING_IC        100175 non-null  object        
 3   APPLICATION_TYPE        98875 non-null   float64       
 4   AWARD_NOTICE_DATE       98848 non-null   object        
 5   BUDGET_START            98874 non-null   datetime64[ns]
 6   BUDGET_END              98809 non-null   datetime64[ns]
 7   CORE_PROJECT_NUM        98875 non-null   object        
 8   FULL_PROJECT_NUM        100175 non-null  object        
 9   FUNDING_ICs             100175 non-null  object        
 10  FUNDING_MECHANISM       100175 non-null  object        
 11  FY                      100175 non-null  int64         
 12  IC_NAME                 100175

In [421]:
df_us.loc[df_us['APPLICATION_ID'].isin(['9125677', '8837925', '9330009', '9837979', '9543896', '9994542']), 'ORG_ZIPCODE'] = '49418'
df_us.loc[df_us['APPLICATION_ID'].isin(['9686592', '9765856', '9806287', '9724114', '10044151', '10044353', '9916136', '9980531']), 'ORG_ZIPCODE'] = '98052'
df_us.loc[df_us['APPLICATION_ID'].isin(['9677029', '9804996', '9929994', '10055828']), 'ORG_ZIPCODE'] = '20170'
df_us.loc[df_us['APPLICATION_ID'].isin(['9805650', '10030922']), 'ORG_ZIPCODE'] = '80303'
df_us.loc[df_us['APPLICATION_ID'].isin(['9810864']), 'ORG_ZIPCODE'] = '21228'

In [422]:
len(df_us.loc[(df_us['ORG_ZIPCODE'].str.contains('\D', na=False, regex=True)) & (df_us['ORG_COUNTRY'] == 'UNITED STATES')]) == 0

True

### Zip code leading zeroes should be in place

Pad zeros on zips < 5 and > 9 and just take the first 5 chars

In [423]:
is_len_less_five = df_us['ORG_ZIPCODE'].str.len() < 5
print(f'DataFrame contained {len(df_us.loc[is_len_less_five])} rows with zipcodes < 5 digits')
df_us.loc[is_len_less_five, 'ORG_ZIPCODE'] = [s.rjust(5, '0')[:5] for idx, s in df_us.loc[is_len_less_five, 'ORG_ZIPCODE'].items()]
is_len_less_five = df_us['ORG_ZIPCODE'].str.len() < 5
df_us.loc[is_len_less_five].size == 0

DataFrame contained 75 rows with zipcodes < 5 digits


True

In [424]:
is_len_gt_five = df_us['ORG_ZIPCODE'].str.len() > 5
is_len_lt_nine = df_us['ORG_ZIPCODE'].str.len() < 9
print(f'DataFrame contained {len(df_us.loc[is_len_gt_five & is_len_lt_nine])} rows with zipcodes > 5 and < 9 digits')
df_us.loc[(is_len_gt_five), 'ORG_ZIPCODE'] = [s.rjust(9, '0')[:5] for idx, s in df_us.loc[(is_len_gt_five), 'ORG_ZIPCODE'].items()]
is_len_gt_five = df_us['ORG_ZIPCODE'].str.len() > 5
df_us.loc[(is_len_gt_five)].size == 0

DataFrame contained 10199 rows with zipcodes > 5 and < 9 digits


True

## Import ZIP to GEO data

The "[US Zip Code Latitude and Longitude](https://public.opendatasoft.com/explore/dataset/us-zip-code-latitude-and-longitude/information/)" by [CivicSpace Labs]() is licensed under [Creative Commons Attribution-ShareAlike](https://creativecommons.org/licenses/by-sa/2.0/). Copyright 2004 CivicSpace Labs.

In [425]:
%%time
date_zip_imported = !date -Is
us_zip_code_latitude_and_longitude = './data/zips/us-zip-code-latitude-and-longitude.csv'
column_names=["Zip","City","State","Latitude","Longitude","Timezone","Daylight savings time flag","geopoint"]
zip_to_latlong = pd.read_csv(us_zip_code_latitude_and_longitude, sep=';', dtype={'Zip': 'str'}, header=0, names=column_names, encoding="ISO-8859-1")
date_zip_imported

CPU times: user 513 ms, sys: 89.9 ms, total: 602 ms
Wall time: 674 ms


['2020-05-26T19:18:19+00:00']

### Add some missing values to `zip_to_latlong`

We can eventually just save this to an updated file

In [426]:
zip_cols = zip_to_latlong.columns
listOfSeries = [pd.Series(['94158', None, None, 37.77244949, -122.39166260, None, None, None], index=zip_cols),
                pd.Series(['92617', None, None, 33.63830185, -117.84275055, None, None, None], index=zip_cols),
                pd.Series(['10065', None, None, 40.76429569, -73.96246150, None, None, None], index=zip_cols),
                pd.Series(['18902', None, None, 40.37361908, -75.06803894, None, None, None], index=zip_cols),
                pd.Series(['62712', None, None, 39.759095, -89.581855, None, None, None], index=zip_cols),
                pd.Series(['27268', None, None, 35.971691, -79.995012, None, None, None], index=zip_cols),
                pd.Series(['95757', None, None, 38.388294, -121.438706, None, None, None], index=zip_cols),
                pd.Series(['92011', None, None, 33.104738, -117.294838, None, None, None], index=zip_cols),
                pd.Series(['28035', None, None, 35.500264, -80.844537, None, None, None], index=zip_cols),
                pd.Series(['48193', None, None, 42.176885, -83.176072, None, None, None], index=zip_cols),
                pd.Series(['60491', None, None, 41.608073, -87.964632, None, None, None], index=zip_cols),
                pd.Series(['85142', None, None, 33.197122, -111.638108, None, None, None], index=zip_cols),
                pd.Series(['85209', None, None, 33.396080, -111.650097, None, None, None], index=zip_cols),
                pd.Series(['85755', None, None, 32.463827, -110.982601, None, None, None], index=zip_cols),
                pd.Series(['60642', None, None, 41.899644, -87.657551, None, None, None], index=zip_cols),
                pd.Series(['80113', None, None, 39.652369, -104.976232, None, None, None], index=zip_cols),
                pd.Series(['62711', None, None, 39.792576, -89.662249, None, None, None], index=zip_cols),
                pd.Series(['92010', None, None, 33.160609, -117.293587, None, None, None], index=zip_cols),
                pd.Series(['84096', None, None, 40.473425, -112.069600, None, None, None], index=zip_cols),
                pd.Series(['96913', None, None, 13.4686, 144.7989, None, None, None], index=zip_cols)]

In [427]:
zip_df = zip_to_latlong.append(listOfSeries, ignore_index=True)

In [428]:
df_us = df_us.join(zip_df.loc[:, ['Zip', 'Latitude', 'Longitude']].set_index('Zip'), on='ORG_ZIPCODE')

In [429]:
len(df_us.loc[(pd.isna(df_us['Latitude'])) & (pd.isna(df_us['ORG_ZIPCODE'])), ['ORG_ZIPCODE', 'APPLICATION_ID']]) < 10

True

### Fix some mispellings of cities

In [430]:
df_us.loc[df_us['ORG_CITY'] == 'san  francisco', 'ORG_CITY'] = 'san francisco'
df_us.loc[df_us['ORG_CITY'] == 'winston salem', 'ORG_CITY'] = 'winston-salem'
df_us.loc[df_us['ORG_CITY'] == 'st. louis', 'ORG_CITY'] = 'saint louis'
df_us.loc[df_us['ORG_CITY'] == 'st. paul', 'ORG_CITY'] = 'saint paul'
df_us.loc[df_us['ORG_CITY'] == 'res triangle', 'ORG_CITY'] = 'research triangle park'

# Export Data

In [431]:
%%time
date_exported = !date -Is
OUT = f'./out/csv/post_processed_{date_exported[0]}.csv.gzip'
df_us.to_csv(OUT, index=False, compression='gzip')

CPU times: user 32.8 s, sys: 552 ms, total: 33.3 s
Wall time: 35.7 s


In [432]:
df_us.head()

Unnamed: 0,APPLICATION_ID,ACTIVITY,ADMINISTERING_IC,APPLICATION_TYPE,AWARD_NOTICE_DATE,BUDGET_START,BUDGET_END,CORE_PROJECT_NUM,FULL_PROJECT_NUM,FUNDING_ICs,FUNDING_MECHANISM,FY,IC_NAME,NIH_SPENDING_CATS,ORG_CITY,ORG_COUNTRY,ORG_DEPT,ORG_DISTRICT,ORG_NAME,ORG_STATE,ORG_ZIPCODE,PHR,PI_IDS,PI_NAMEs,PROGRAM_OFFICER_NAME,PROJECT_START,PROJECT_END,PROJECT_TERMS,PROJECT_TITLE,SERIAL_NUMBER,STUDY_SECTION,STUDY_SECTION_NAME,SUBPROJECT_ID,SUFFIX,SUPPORT_YEAR,DIRECT_COST_AMT,INDIRECT_COST_AMT,TOTAL_COST,TOTAL_COST_SUB_PROJECT,fromfile,Latitude,Longitude
0,9787536,R01,GM,5.0,8/30/2019,2019-09-01,2020-08-31,R01GM127366,5R01GM127366-02,NIGMS:302774\,Non-SBIR/STTR RPGs,2019,NATIONAL INSTITUTE OF GENERAL MEDICAL SCIENCES,,CHICAGO,UNITED STATES,BIOLOGY,1.0,UNIVERSITY OF CHICAGO,IL,60637,NARRATIVE (Public Relevance Statement) This pr...,7601785;,"SCHMIDT-OTT, URS C;","HOODBHOY, TANYA",2018-09-18,2020-08-31,Affect; Animal Model; Animals; Anterior; Bindi...,Genetic variation and function of body axis de...,127366.0,DEV2,Development - 2 Study Section,,,2.0,190000.0,112774.0,302774.0,,RePORTER_PRJ_C_FY2019.csv,41.779384,-87.60544
1,9712520,R21,MH,1.0,4/1/2019,2019-04-01,2020-02-29,R21MH119253,1R21MH119253-01,NIMH:195625\,Non-SBIR/STTR RPGs,2019,NATIONAL INSTITUTE OF MENTAL HEALTH,,AUSTIN,UNITED STATES,PSYCHOLOGY,10.0,"UNIVERSITY OF TEXAS, AUSTIN",TX,78759,Public Health Relevance Temporally controlled ...,7031347;,"MORIKAWA, HITOSHI ;","NADLER, LAURIE S",2019-04-01,2021-02-28,Action Potentials; addiction; Address; ADORA2A...,Dopamine Timing-Dependent Plasticity in Reward...,119253.0,LAM,Neurobiology of Learning and Memory Study Sect...,,,1.0,125000.0,70625.0,195625.0,,RePORTER_PRJ_C_FY2019.csv,30.406169,-97.75743
2,9571523,T32,GM,2.0,6/14/2019,2019-07-01,2020-06-30,T32GM008268,2T32GM008268-31,NIGMS:426323\,"TRAINING, INSTITUTIONAL",2019,NATIONAL INSTITUTE OF GENERAL MEDICAL SCIENCES,,SEATTLE,UNITED STATES,BIOCHEMISTRY,7.0,UNIVERSITY OF WASHINGTON,WA,98195,Project Narrative The Molecular Biophysics Tr...,1879130; 7744794 (contact);,"KLEVIT, RACHEL E; ZHENG, NING (contact);","FLICKER, PAULA F",1988-09-30,2024-06-30,Biophysics; Molecular; Training Programs;,Molecular Biophysics Training Program,8268.0,TWD,NIGMS Initial Review Group,,,31.0,402870.0,23453.0,426323.0,,RePORTER_PRJ_C_FY2019.csv,47.432251,-121.803388
4,9790251,R01,GM,1.0,8/30/2019,2019-09-01,2020-08-31,R01GM136654,1R01GM136654-01,OD:666743\,Non-SBIR/STTR RPGs,2019,NATIONAL INSTITUTE OF GENERAL MEDICAL SCIENCES,,NEW YORK,UNITED STATES,INTERNAL MEDICINE/MEDICINE,12.0,ROCKEFELLER UNIVERSITY,NY,10065,Narrative The proposal will develop a new tech...,1862554;,"CHAIT, BRIAN T;","SMITH, WARD",2019-09-01,2024-08-31,Address; applied biomedical research; Area; at...,Development of Next Generation Mass Spectromet...,136654.0,ZRG1,Special Emphasis Panel,,,1.0,395819.0,270924.0,666743.0,,RePORTER_PRJ_C_FY2019.csv,40.764296,-73.962462
5,9787544,U01,HG,5.0,7/23/2019,2019-07-01,2020-06-30,U01HG010248,5U01HG010248-02,NHGRI:322003\,Non-SBIR/STTR RPGs,2019,NATIONAL HUMAN GENOME RESEARCH INSTITUTE,,NEW YORK,UNITED STATES,INTERNAL MEDICINE/MEDICINE,13.0,ICAHN SCHOOL OF MEDICINE AT MOUNT SINAI,NY,10029,Our stakeholder-engaged team is experienced in...,6724273;,"HOROWITZ, CAROL R.;","VOLPI, SIMONA",2018-09-18,2023-06-30,Address; Adult; Advocate; Affect; African; APO...,GeNYC: Genomic Implementation Research in the ...,10248.0,ZHG1,Special Emphasis Panel,,,2.0,205982.0,116021.0,322003.0,,RePORTER_PRJ_C_FY2019.csv,40.791586,-73.94575


In [433]:
OUT

'./out/csv/post_processed_2020-05-26T19:18:25+00:00.csv.gzip'

### Create a dtype dictionary for read_csv

Issue: Pandas gives warning: `DtypeWarning: Columns (13) have mixed types.Specify dtype option on import`. Pandas is trying to infer too many dtypes and running out of memory. 

[Solution on stackoverflow]() is to create a dataframe from the `dtypes` property and then change the type with `astype` to str (e.g., `dtype('int64') -> 'int64'`)

In [434]:
all_cols = set(df_us.columns)

In [435]:
dates = set(["AWARD_NOTICE_DATE","BUDGET_START","BUDGET_END", "PROJECT_START", "PROJECT_END"])

In [436]:
cols = list(all_cols - dates)

In [437]:
df_us.loc[:, cols].dtypes.apply(lambda x: x.name).to_dict()

{'ORG_CITY': 'object',
 'DIRECT_COST_AMT': 'float64',
 'fromfile': 'object',
 'ACTIVITY': 'object',
 'Latitude': 'float64',
 'CORE_PROJECT_NUM': 'object',
 'FUNDING_MECHANISM': 'object',
 'PI_IDS': 'object',
 'Longitude': 'float64',
 'ORG_STATE': 'object',
 'ADMINISTERING_IC': 'object',
 'STUDY_SECTION_NAME': 'object',
 'SERIAL_NUMBER': 'float64',
 'INDIRECT_COST_AMT': 'float64',
 'PHR': 'object',
 'TOTAL_COST': 'float64',
 'PROGRAM_OFFICER_NAME': 'object',
 'ORG_COUNTRY': 'object',
 'ORG_DEPT': 'object',
 'SUBPROJECT_ID': 'float64',
 'TOTAL_COST_SUB_PROJECT': 'float64',
 'IC_NAME': 'object',
 'ORG_ZIPCODE': 'object',
 'SUFFIX': 'object',
 'FULL_PROJECT_NUM': 'object',
 'NIH_SPENDING_CATS': 'object',
 'PROJECT_TITLE': 'object',
 'ORG_DISTRICT': 'float64',
 'APPLICATION_TYPE': 'float64',
 'PI_NAMEs': 'string',
 'APPLICATION_ID': 'int64',
 'FY': 'int64',
 'SUPPORT_YEAR': 'float64',
 'PROJECT_TERMS': 'string',
 'FUNDING_ICs': 'object',
 'STUDY_SECTION': 'object',
 'ORG_NAME': 'object'}