In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick

from sklearn.preprocessing import MinMaxScaler, RobustScaler, Normalizer, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

import os
import conda

import locale
from datetime import datetime

# Set the process-wide locale for all categories to US English (UTF-8).
locale.setlocale(locale.LC_ALL, 'en_US.UTF8')

'en_US.UTF8'

In [2]:
# Turn chained assignment into a hard error instead of a warning.
pd.options.mode.chained_assignment = 'raise'
# Raise the display limits so wide/long frames are shown without truncation.
pd.set_option('display.max_rows', 4000)
pd.set_option('display.max_columns', 4000)

# About NIH awards data

NIH data was imported from [FY 2019 RePORTER Project Data](https://exporter.nih.gov/CSVs/final/RePORTER_PRJ_C_FY2019.zip) and the report listing page [exporter.nih.gov](https://exporter.nih.gov/ExPORTER_Catalog.aspx?sid=5&index=0). An [explanation of the fields](https://exporter.nih.gov/about.aspx) is helpful for analysis.

# Project structure

```bash session
├── NIH-awards.ipynb       # this jupyter notebook
├── data
│   ├── exporter           # holds all exporter csv files
│   └── zips               # us-zip-code-latitude-and-longitude.csv
├── out
│   ├── csv                # any csv output
│   ├── json               # any json output
│   └── models             # any saved models
└── utils
    ├── get_csvs           # script to download all the exporter csv files (requires curl)
    └── reporter_files.txt # list of exporter files to get
```

# Import the data

In [3]:
# Run the get_csvs utility to download files from exporter.nih.gov into the `data` directory
# The file ./utils/reporter_files.txt contains a list of all files to get

# date_downloaded = !date -Is
# ! ./utils/get_csvs
# date_downloaded

In [212]:
files = !find ./data/exporter/ -name "*RePORTER_PRJ*" -exec echo "{}" \;
files = list(set(files) - set(['./data/exporter/RePORTER_PRJ_C_FY2017.csv', './data/exporter/RePORTER_PRJ_C_FY2016.csv']))
files.sort()
len(files)

35

In [213]:
all_cols = !head -n 1 './data/exporter/RePORTER_PRJ_C_FY2019.csv'
all_cols = all_cols.s.split(',')

In [214]:
# Columns to leave out when reading the ExPORTER files.
ignore = [
    'ARRA_FUNDED',
    'FOA_NUMBER',
    'ED_INST_TYPE',
    'ORG_FIPS',
    'ORG_DUNS',
    'ORG_IPF_CODE',
    'CFDA_CODE',
]

In [215]:
# Columns to read: everything in the header except the ignored ones (order is not significant).
cols = list(set(all_cols).difference(ignore))

In [216]:
def create_dataframe(name, usecols=None):
    '''
    Read one ExPORTER project CSV into a DataFrame and add a column
    `fromfile` denoting which file the data came from.

    Parameters
    ----------
    name : str
        Path of the CSV file to read.
    usecols : list-like, optional
        Columns to read. When omitted, the notebook-level `cols` list is
        looked up at call time (the original behaviour). Passing it
        explicitly avoids depending on that global, which a later cell of
        this notebook rebinds to a different list.

    Returns
    -------
    pandas.DataFrame
        The selected columns of the file plus `fromfile`, which holds the
        file's base name.
    '''
    if usecols is None:
        usecols = cols
    _df = pd.read_csv(
        name,
        # Read zip codes as text so leading zeros are not lost.
        dtype={'ORG_ZIPCODE': 'str'},
        usecols=usecols,
        parse_dates=["AWARD_NOTICE_DATE", "BUDGET_START", "BUDGET_END"],
        encoding="ISO-8859-1",
    )
    _df['fromfile'] = os.path.basename(name)
    return _df

In [217]:
%%time
date_imported = !date -Is
v = [create_dataframe(name) for name in files]
date_imported

CPU times: user 9.15 s, sys: 2.57 s, total: 11.7 s
Wall time: 14.9 s


['2020-05-21T14:11:17+00:00']

### Concat all DataFrames

In [218]:
%%time
df = pd.concat(v,ignore_index=True)

CPU times: user 2.1 s, sys: 142 ms, total: 2.24 s
Wall time: 2.26 s


### Convert types

In [219]:
# Coerce the project/budget date columns to datetimes; unparseable values become NaT.
for date_col in ("PROJECT_START", "PROJECT_END", "BUDGET_START", "BUDGET_END"):
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

# Store the free-text columns with pandas' dedicated string dtype.
for text_col in ('PROJECT_TERMS', 'PI_NAMEs'):
    df[text_col] = df[text_col].astype(pd.StringDtype())

# Drop rows

Some rows will not be very useful for analysis if they:

- do not have associated start/end metrics
- do not have associated costs
- are duplicates

In [220]:
# Row count before any rows are dropped.
before = df.shape[0]

In [221]:
# Keep only the rows with at least 2 non-NA values among the award/budget date columns.
# `thresh` alone expresses this rule; pandas >= 2.0 raises a TypeError when `how` and
# `thresh` are both passed, and older versions ignored `how` whenever `thresh` was set.
thresh = 2
df.dropna(subset=['AWARD_NOTICE_DATE', 'BUDGET_START', 'BUDGET_END'], thresh=thresh, inplace=True)

In [222]:
# Keep only the rows with at least 1 non-NA value among the cost columns.
# `thresh` alone expresses this rule; pandas >= 2.0 raises a TypeError when `how` and
# `thresh` are both passed, and older versions ignored `how` whenever `thresh` was set.
thresh = 1
df.dropna(subset=['DIRECT_COST_AMT', 'INDIRECT_COST_AMT', 'TOTAL_COST', 'TOTAL_COST_SUB_PROJECT'], thresh=thresh, inplace=True)

In [223]:
# Drop duplicate applications
# Duplicate application ids typically have the same application type, dates and cost amounts

# Every repeat of an APPLICATION_ID after its first occurrence is counted as a duplicate.
n_duplicate_ids = int(df.duplicated("APPLICATION_ID").sum())
print(f'Dropping {n_duplicate_ids} awards with duplicate application id')
# Of each set of rows sharing an APPLICATION_ID, the last one is kept.
df.drop_duplicates(subset=['APPLICATION_ID'], keep='last', inplace=True)

Dropping 681 awards with duplicate application id


In [224]:
from IPython.display import HTML

In [225]:
# Report how many rows the filters above removed, as a count and a percentage.
num_dropped = before - len(df)
dropped_pct = format(num_dropped / before, '.3%')
display(HTML(f'<h3>Dropped {num_dropped} out of {before} rows or <strong>{dropped_pct}</strong></h3>'))

# Create a new dataframe limited to awards whose Administering Institute or Center is an NIH Agency and whose organization country is US.

Since the vast majority of awards in this dataset are administered through NIH agencies to organizations whose office is located in the US, let's restrict our analysis to that subset.

In [226]:
import pprint
pp = pprint.PrettyPrinter(indent=4)

In [227]:
# Administering Institute/Center codes accepted by the `is_nih` filter, with display names.
# One entry per line; the "NIH Roadmap" name was previously broken across two physical
# lines inside its string literal, which is a syntax error.
nih_agencies = [
    {"code": "AA", "name": "NIH National Institute on Alcohol Abuse and Alcoholism (NIAAA)"},
    {"code": "OD", "name": "Office of the Director, NIH"},
    {"code": "AG", "name": "NIH National Institute on Aging (NIA)"},
    {"code": "AI", "name": "NIH National Institute of Allergy and Infectious Diseases (NIAID)"},
    {"code": "AR", "name": "NIH National Institute of Arthritis and Musculoskeletal and Skin Diseases (NIAMS)"},
    {"code": "AT", "name": "NIH National Center for Complementary and Integrative Health (NCCIH)"},
    {"code": "CA", "name": "NIH National Cancer Institute (NCI)"},
    {"code": "DA", "name": "NIH National Institute on Drug Abuse (NIDA)"},
    {"code": "DC", "name": "NIH National Institute on Deafness and Other Communication Disorders (NIDCD)"},
    {"code": "DE", "name": "NIH National Institute of Dental & Craniofacial Research (NIDCR)"},
    {"code": "DK", "name": "NIH National Institute of Diabetes and Digestive and Kidney Diseases (NIDDK)"},
    {"code": "EB", "name": "NIH National Institute of Biomedical Imaging and Bioengineering (NIBIB)"},
    {"code": "ES", "name": "NIH National Institute of Environmental Health Sciences (NIEHS)"},
    {"code": "EY", "name": "NIH National Eye Institute (NEI)"},
    {"code": "GM", "name": "NIH National Institute of General Medical Sciences (NIGMS)"},
    {"code": "IHS", "name": "Indian Health Service"},
    {"code": "HD", "name": "NIH Eunice Kennedy Shriver National Institute of Child Health and Human Development (NICHD)"},
    {"code": "HG", "name": "NIH National Human Genome Research Institute (NHGRI)"},
    {"code": "HL", "name": "NIH National Heart, Lung and Blood Institute (NHLBI)"},
    {"code": "LM", "name": "NIH National Library of Medicine (NLM)"},
    {"code": "MD", "name": "NIH National Institute on Minority Health and Health Disparities (NIMHD)"},
    {"code": "MH", "name": "NIH National Institute of Mental Health (NIMH)"},
    {"code": "NR", "name": "NIH National Institute of Nursing Research (NINR)"},
    {"code": "NS", "name": "NIH National Institute of Neurological Disorders and Stroke (NINDS)"},
    {"code": "RM", "name": "NIH Roadmap"},
    {"code": "RR", "name": "National Center for Research Resources (NCRR) (dissolved 12/2011)"},
    {"code": "TR", "name": "NIH National Center for Advancing translational Sciences (NCATS)"},
    {"code": "TW", "name": "NIH Fogarty International Center (FIC)"},
]
# Just the codes, in the same order as `nih_agencies`.
codes_list = [obj['code'] for obj in nih_agencies]
# Boolean row masks over `df`: administering IC is one of the listed codes /
# organization country is the United States.
is_nih = df['ADMINISTERING_IC'].isin(codes_list)
is_us = df['ORG_COUNTRY'].eq('UNITED STATES')

In [228]:
# Share of all remaining awards whose administering IC is in the NIH code list.
nih_awards_pct = format(int(is_nih.sum()) / len(df), '.1%')
display(HTML(f'<h3>{nih_awards_pct} of awards list an NIH agency as `ADMINISTERING_IC`</h3>'))

In [229]:
# Share of all remaining awards whose organization country is the United States.
us_awards_pct = "{:.1%}".format(len(df.loc[is_us]) / len(df))
# The paragraph is now closed with </p>; it previously ended with a second opening <p>.
display(HTML(f'<h3>{us_awards_pct} of awards list the US as `ORG_COUNTRY`</h3><p>The NIH defines ORG_COUNTRY as <em>The country in which the business office of the grantee organization or contractor is located.  Note that this may be different from the research performance site</em>.</p>'))

In [230]:
# Restrict to awards that satisfy both masks: NIH-administered and US-based organization.
nih_and_us = is_nih & is_us
df_us = df.loc[nih_and_us]

In [231]:
# Summarize the filtered frame: entries, columns, non-null counts and dtypes.
df_us.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 171171 entries, 0 to 190504
Data columns (total 40 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   APPLICATION_ID          171171 non-null  int64         
 1   ACTIVITY                171171 non-null  object        
 2   ADMINISTERING_IC        171171 non-null  object        
 3   APPLICATION_TYPE        171171 non-null  float64       
 4   AWARD_NOTICE_DATE       171142 non-null  datetime64[ns]
 5   BUDGET_START            171171 non-null  datetime64[ns]
 6   BUDGET_END              171102 non-null  datetime64[ns]
 7   CORE_PROJECT_NUM        171171 non-null  object        
 8   FULL_PROJECT_NUM        171171 non-null  object        
 9   FUNDING_ICs             171171 non-null  object        
 10  FUNDING_MECHANISM       171171 non-null  object        
 11  FY                      171171 non-null  int64         
 12  IC_NAME                 171171

# Clean up some badly formatted zip codes

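To properly map coordinates to zipcodes, zips must be in the correct format. The NIH dataset often has zip codes that contain non-digit characters, have lost their leading zeros, or are given in the nine-digit ZIP+4 form; the cells below handle each case.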

In [232]:
# Flag zip codes containing any non-digit character (missing zips count as False).
# The pattern is a raw string: '\D' in a plain string is an invalid escape sequence.
us_zips_non_dig = df_us['ORG_ZIPCODE'].str.contains(r'\D', na=False, regex=True)

In [233]:
# Work on an independent copy so the assignments below do not target a slice of `df`.
df_us = df_us.copy(deep=True)

In [234]:
# Manual zip code corrections: corrected zip -> application ids that should carry it.
# The ids are integers because APPLICATION_ID is an int64 column. Matching it against
# string ids depends on implicit type coercion inside `isin`, which not every pandas
# version performs; without it the corrections silently match nothing.
zip_corrections = {
    '49418': [9125677, 8837925, 9330009, 9837979, 9543896, 9994542],
    '98052': [9686592, 9765856, 9806287, 9724114, 10044151, 10044353, 9916136, 9980531],
    '20170': [9677029, 9804996, 9929994, 10055828],
    '80303': [9805650, 10030922],
    '21228': [9810864],
}
for corrected_zip, application_ids in zip_corrections.items():
    df_us.loc[df_us['APPLICATION_ID'].isin(application_ids), 'ORG_ZIPCODE'] = corrected_zip

In [235]:
# Check that no US row still has a zip code containing a non-digit character.
# The pattern is a raw string: '\D' in a plain string is an invalid escape sequence.
has_non_digit = df_us['ORG_ZIPCODE'].str.contains(r'\D', na=False, regex=True)
len(df_us.loc[has_non_digit & (df_us['ORG_COUNTRY'] == 'UNITED STATES')]) == 0

True

### Zip code leading zeroes should be in place

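Zips shorter than 5 digits are left-padded with zeros to 5 digits. Zips longer than 5 digits are left-padded with zeros to 9 digits and then cut to their first 5 characters.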

In [236]:
# Left-pad zips shorter than five characters with zeros (missing zips are not selected).
is_len_less_five = df_us['ORG_ZIPCODE'].str.len() < 5
print(f'DataFrame contained {int(is_len_less_five.sum())} rows with zipcodes < 5 digits')
padded_short = df_us.loc[is_len_less_five, 'ORG_ZIPCODE'].str.rjust(5, '0').str[:5]
df_us.loc[is_len_less_five, 'ORG_ZIPCODE'] = padded_short
# Re-test: no zip shorter than five characters should remain.
is_len_less_five = df_us['ORG_ZIPCODE'].str.len() < 5
not is_len_less_five.any()

DataFrame contained 73 rows with zipcodes < 5 digits


True

In [237]:
zip_lengths = df_us['ORG_ZIPCODE'].str.len()
is_len_gt_five = zip_lengths > 5
is_len_lt_nine = zip_lengths < 9
print(f'DataFrame contained {int((is_len_gt_five & is_len_lt_nine).sum())} rows with zipcodes > 5 and < 9 digits')
# Every zip longer than five characters (not only the 6-8 character ones counted above)
# is left-padded with zeros to nine characters and then cut to its first five.
padded_long = df_us.loc[is_len_gt_five, 'ORG_ZIPCODE'].str.rjust(9, '0').str[:5]
df_us.loc[is_len_gt_five, 'ORG_ZIPCODE'] = padded_long
# Re-test: no zip longer than five characters should remain.
is_len_gt_five = df_us['ORG_ZIPCODE'].str.len() > 5
not is_len_gt_five.any()

DataFrame contained 10125 rows with zipcodes > 5 and < 9 digits


True

## Import ZIP to GEO data

The zip/geo data was [downloaded from ODS](https://public.opendatasoft.com/explore/dataset/us-zip-code-latitude-and-longitude/export/?dataChart=eyJxdWVyaWVzIjpbeyJjb25maWciOnsiZGF0YXNldCI6InVzLXppcC1jb2RlLWxhdGl0dWRlLWFuZC1sb25naXR1ZGUiLCJvcHRpb25zIjp7fX0sImNoYXJ0cyI6W3siYWxpZ25Nb250aCI6dHJ1ZSwidHlwZSI6ImNvbHVtbiIsImZ1bmMiOiJBVkciLCJ5QXhpcyI6ImxhdGl0dWRlIiwic2NpZW50aWZpY0Rpc3BsYXkiOnRydWUsImNvbG9yIjoiI0ZGNTE1QSJ9XSwieEF4aXMiOiJzdGF0ZSIsIm1heHBvaW50cyI6NTAsInNvcnQiOiIifV0sInRpbWVzY2FsZSI6IiIsImRpc3BsYXlMZWdlbmQiOnRydWUsImFsaWduTW9udGgiOnRydWV9&location=3,43.19717,-48.51562&basemap=jawg.streets) (Opendatasoft). We will join this data on zipcode so we have longitude and latitude features for each row.

In [238]:
%%time
date_zip_imported = !date -Is
us_zip_code_latitude_and_longitude = './data/zips/us-zip-code-latitude-and-longitude.csv'
column_names=["Zip","City","State","Latitude","Longitude","Timezone","Daylight savings time flag","geopoint"]
zip_to_latlong = pd.read_csv(us_zip_code_latitude_and_longitude, sep=';', dtype={'Zip': 'str'}, header=0, names=column_names, encoding="ISO-8859-1")
date_zip_imported

CPU times: user 166 ms, sys: 49.1 ms, total: 215 ms
Wall time: 338 ms


['2020-05-21T14:11:45+00:00']

### Add some missing values to `zip_to_latlong`

We can eventually just save this to an updated file

In [239]:
zip_cols = zip_to_latlong.columns
# (zip, latitude, longitude) for the rows to add to the lookup table;
# every other field of these rows is left as None.
extra_zip_coords = [
    ('94158', 37.77244949, -122.39166260),
    ('92617', 33.63830185, -117.84275055),
    ('10065', 40.76429569, -73.96246150),
    ('18902', 40.37361908, -75.06803894),
    ('62712', 39.759095, -89.581855),
    ('27268', 35.971691, -79.995012),
    ('95757', 38.388294, -121.438706),
    ('92011', 33.104738, -117.294838),
    ('28035', 35.500264, -80.844537),
    ('48193', 42.176885, -83.176072),
    ('60491', 41.608073, -87.964632),
    ('85142', 33.197122, -111.638108),
    ('85209', 33.396080, -111.650097),
    ('85755', 32.463827, -110.982601),
    ('60642', 41.899644, -87.657551),
    ('80113', 39.652369, -104.976232),
    ('62711', 39.792576, -89.662249),
    ('92010', 33.160609, -117.293587),
    ('84096', 40.473425, -112.069600),
    ('96913', 13.4686, 144.7989),
]
# One Series per extra zip, laid out in the lookup table's column order.
listOfSeries = [
    pd.Series([zip_code, None, None, lat, lon, None, None, None], index=zip_cols)
    for zip_code, lat, lon in extra_zip_coords
]

In [240]:
# Add the extra rows to the lookup table. `DataFrame.append` was removed in pandas 2.0,
# so the Series are turned into a frame and combined with `pd.concat` instead.
zip_df = pd.concat([zip_to_latlong, pd.DataFrame(listOfSeries)], ignore_index=True)

In [241]:
# Attach Latitude/Longitude to each award by looking up its ORG_ZIPCODE in the zip table.
zip_coords = zip_df.loc[:, ['Zip', 'Latitude', 'Longitude']].set_index('Zip')
df_us = df_us.join(zip_coords, on='ORG_ZIPCODE')

In [242]:
# Sanity check: fewer than 10 rows are missing both Latitude and ORG_ZIPCODE.
# NOTE(review): this only counts rows that have no zip code at all. If the intent was to
# find zips that failed to match the lookup table, the zip condition should probably be
# negated - confirm.
missing_lat_and_zip = df_us['Latitude'].isna() & df_us['ORG_ZIPCODE'].isna()
int(missing_lat_and_zip.sum()) < 10

True

### Fix some misspellings of cities

In [243]:
# Known ORG_CITY variants and the spelling each should be replaced with.
city_fixes = {
    'san  francisco': 'san francisco',
    'winston salem': 'winston-salem',
    'st. louis': 'saint louis',
    'st. paul': 'saint paul',
    'res triangle': 'research triangle park',
}
# The rows displayed after the export show ORG_CITY in upper case (e.g. 'LOS ANGELES'),
# which the lower-case comparisons alone never match. Add an upper-case variant of every
# fix so those rows are corrected too; lower-case values are handled exactly as before.
city_fixes.update({wrong.upper(): right.upper() for wrong, right in list(city_fixes.items())})
for wrong, right in city_fixes.items():
    df_us.loc[df_us['ORG_CITY'] == wrong, 'ORG_CITY'] = right

# Export Data

In [244]:
%%time
date_exported = !date -Is
OUT = f'./out/csv/post_processed_{date_exported[0]}.csv.gzip'
df_us.to_csv(OUT, index=False, compression='gzip')

CPU times: user 56.3 s, sys: 717 ms, total: 57 s
Wall time: 1min


In [245]:
# Preview the first rows of the processed frame.
df_us.head()

Unnamed: 0,APPLICATION_ID,ACTIVITY,ADMINISTERING_IC,APPLICATION_TYPE,AWARD_NOTICE_DATE,BUDGET_START,BUDGET_END,CORE_PROJECT_NUM,FULL_PROJECT_NUM,FUNDING_ICs,FUNDING_MECHANISM,FY,IC_NAME,NIH_SPENDING_CATS,ORG_CITY,ORG_COUNTRY,ORG_DEPT,ORG_DISTRICT,ORG_NAME,ORG_STATE,ORG_ZIPCODE,PHR,PI_IDS,PI_NAMEs,PROGRAM_OFFICER_NAME,PROJECT_START,PROJECT_END,PROJECT_TERMS,PROJECT_TITLE,SERIAL_NUMBER,STUDY_SECTION,STUDY_SECTION_NAME,SUBPROJECT_ID,SUFFIX,SUPPORT_YEAR,DIRECT_COST_AMT,INDIRECT_COST_AMT,TOTAL_COST,TOTAL_COST_SUB_PROJECT,fromfile,Latitude,Longitude
0,9527578,K99,HL,1.0,2018-04-20,2018-04-13,2019-02-28,K99HL138160,1K99HL138160-01A1,NHLBI:121328\,OTHER RESEARCH-RELATED,2018,"NATIONAL HEART, LUNG, AND BLOOD INSTITUTE",Bioengineering; Cardiovascular; Heart Disease;,DAVIS,UNITED STATES,PHARMACOLOGY,3.0,UNIVERSITY OF CALIFORNIA AT DAVIS,CA,95618,PROJECT NARRATIVE. Dysregulation of intracellu...,12094438;,"MOROTTI, STEFANO ;","WANG, WAYNE C",2018-04-13,2020-02-29,Action Potentials; Adrenergic Agents; Affect; ...,Modeling of subcellular signaling crosstalk in...,138160.0,MTI,NHLBI Mentored Transition to Independence Revi...,,A1,1.0,112341.0,8987.0,121328.0,,RePORTER_PRJ_C_FY2018.csv,38.546306,-121.68682
2,9520290,U01,ES,5.0,2018-06-21,2018-07-01,2019-06-30,U01ES026130,5U01ES026130-05,NCI:486353\NIEHS:375840\,Non-SBIR/STTR RPGs,2018,NATIONAL INSTITUTE OF ENVIRONMENTAL HEALTH SCI...,Breast Cancer; Cancer; Cancer Genomics; Clinic...,LOS ANGELES,UNITED STATES,PUBLIC HEALTH & PREV MEDICINE,33.0,UNIVERSITY OF CALIFORNIA LOS ANGELES,CA,90095,"PUBLIC HEALTH RELEVANCE: In this application,...",7551861; 12391665; 1896518 (contact); 1872364;,"BRODY, JULIA GREEN; CORVALAN, CAMILA ; MICHELS...","BOYLES, ABEE",2015-09-30,2020-06-30,Adult; adverse outcome; Advocate; Age; Age at ...,Environmental chemicals and postpubertal breas...,26130.0,ZES1,Special Emphasis Panel,,,5.0,896746.0,81743.0,862193.0,,RePORTER_PRJ_C_FY2018.csv,33.786594,-118.298662
3,9531351,K08,DK,5.0,2018-06-21,2018-07-01,2019-06-30,K08DK106427,5K08DK106427-04,NIDDK:164863\,OTHER RESEARCH-RELATED,2018,NATIONAL INSTITUTE OF DIABETES AND DIGESTIVE A...,Biomedical Imaging; Cardiovascular; Clinical R...,ROCHESTER,UNITED STATES,,1.0,MAYO CLINIC ROCHESTER,MN,55905,PUBLIC HEALTH RELEVANCE: There is a pressing n...,11777794;,"EIRIN, ALFONSO ;","RANKIN, TRACY L",2015-08-01,2020-06-30,Address; Animal Model; Apoptosis; Attenuated; ...,A potential role for mitoprotection in preserv...,106427.0,DDK,"Kidney, Urologic and Hematologic Diseases D Su...",,,4.0,152651.0,12212.0,164863.0,,RePORTER_PRJ_C_FY2018.csv,44.022513,-92.466826
4,9499301,R01,HL,1.0,2018-09-19,2018-09-15,2019-06-30,R01HL141379,1R01HL141379-01,NHLBI:542357\,Non-SBIR/STTR RPGs,2018,"NATIONAL HEART, LUNG, AND BLOOD INSTITUTE",Cancer; Clinical Research; Hematology; Rare Di...,DUARTE,UNITED STATES,,32.0,BECKMAN RESEARCH INSTITUTE/CITY OF HOPE,CA,91010,NARRATIVE While significant progresses have be...,8797736;,"CARLESSO, NADIA ;","BAI, C BRIAN",2018-09-15,2022-06-30,actionable mutation; Acute Myelocytic Leukemia...,Inflammation as determinant of clonal selectio...,141379.0,MCH,Molecular and Cellular Hematology Study Section,,,1.0,313501.0,228856.0,542357.0,,RePORTER_PRJ_C_FY2018.csv,34.137707,-117.96569
5,9546857,U54,GM,5.0,2018-09-03,2018-07-01,2019-09-22,U54GM119024,5U54GM119024-05,FIC:6651\NCATS:62846\NCCAM:11877\NCMHD:26527\N...,RESEARCH CENTERS,2018,NATIONAL INSTITUTE OF GENERAL MEDICAL SCIENCES,"Alcoholism, Alcohol Use and Health; Clinical R...",LOS ANGELES,UNITED STATES,INTERNAL MEDICINE/MEDICINE,33.0,UNIVERSITY OF CALIFORNIA LOS ANGELES,CA,90095,PUBLIC HEALTH RELEVANCE: The effective coordin...,8070940 (contact); 2404765; 1873294;,"NORRIS, KEITH C (contact); SEEMAN, TERESA E; W...","SESMA, MICHAEL A",2014-09-26,2019-09-22,Achievement; Address; Aging; Applied Research;...,NIH Diversity Program Consortium Coordination ...,119024.0,ZRG1,Special Emphasis Panel,,,5.0,3786860.0,930888.0,1780663.0,,RePORTER_PRJ_C_FY2018.csv,33.786594,-118.298662


In [246]:
# Show the path the export was written to.
OUT

'./out/csv/post_processed_2020-05-21T14:11:51+00:00.csv.gzip'

### Create a dtype dictionary for read_csv

Issue: Pandas gives warning: `DtypeWarning: Columns (13) have mixed types.Specify dtype option on import`. By default pandas parses a large file in chunks and infers dtypes chunk by chunk, so a column can end up holding mixed types.

A solution described on Stack Overflow is to build a dictionary from the `dtypes` property, converting each dtype to its string name (e.g., `dtype('int64') -> 'int64'`), and to pass that dictionary as the `dtype` option of `read_csv`.

In [247]:
# All column names of the processed frame, as a set.
# NOTE(review): this rebinds `all_cols`, which earlier in the notebook held the CSV header names.
all_cols = {*df_us.columns}

In [248]:
# Date columns, kept out of the dtype dictionary built below.
dates = {"AWARD_NOTICE_DATE", "BUDGET_START", "BUDGET_END", "PROJECT_START", "PROJECT_END"}

In [249]:
# Every non-date column of the processed frame.
# NOTE(review): this rebinds the global `cols` that `create_dataframe` reads at call time,
# so calling that function after this cell would request columns the raw files lack.
cols = list(all_cols.difference(dates))

In [250]:
# Map each non-date column to the string name of its dtype (e.g. 'int64', 'object').
{column: dtype.name for column, dtype in df_us.loc[:, cols].dtypes.items()}

{'ORG_CITY': 'object',
 'DIRECT_COST_AMT': 'float64',
 'fromfile': 'object',
 'ACTIVITY': 'object',
 'Latitude': 'float64',
 'CORE_PROJECT_NUM': 'object',
 'FUNDING_MECHANISM': 'object',
 'PI_IDS': 'object',
 'Longitude': 'float64',
 'ORG_STATE': 'object',
 'ADMINISTERING_IC': 'object',
 'STUDY_SECTION_NAME': 'object',
 'SERIAL_NUMBER': 'float64',
 'INDIRECT_COST_AMT': 'float64',
 'PHR': 'object',
 'TOTAL_COST': 'float64',
 'PROGRAM_OFFICER_NAME': 'object',
 'ORG_COUNTRY': 'object',
 'ORG_DEPT': 'object',
 'SUBPROJECT_ID': 'float64',
 'TOTAL_COST_SUB_PROJECT': 'float64',
 'IC_NAME': 'object',
 'ORG_ZIPCODE': 'object',
 'SUFFIX': 'object',
 'FULL_PROJECT_NUM': 'object',
 'NIH_SPENDING_CATS': 'object',
 'PROJECT_TITLE': 'object',
 'ORG_DISTRICT': 'float64',
 'APPLICATION_TYPE': 'float64',
 'PI_NAMEs': 'string',
 'APPLICATION_ID': 'int64',
 'FY': 'int64',
 'SUPPORT_YEAR': 'float64',
 'PROJECT_TERMS': 'string',
 'FUNDING_ICs': 'object',
 'STUDY_SECTION': 'object',
 'ORG_NAME': 'object'}