# Load XML award files

In [2]:
import xml.etree.ElementTree as ElementTree
import os
import io
import pandas as pd
import numpy as np

In [5]:
# Output column -> XPath (relative to the document root) of the element whose
# text fills that column. The order here is the column order of the result.
_AWARD_TEXT_FIELDS = (
    ('award_id', 'Award/AwardID'),
    ('title', 'Award/AwardTitle'),
    ('amount', 'Award/AwardAmount'),
    ('institution', 'Award/Institution/Name'),
    ('effective_date', 'Award/AwardEffectiveDate'),
    ('expiration_date', 'Award/AwardExpirationDate'),
    ('abstract', 'Award/AbstractNarration'),
    ('program_element_code', 'Award/ProgramElement/Code'),
    ('program_element_name', 'Award/ProgramElement/Text'),
    ('directorate', 'Award/Organization/Directorate/Abbreviation'),
    ('division_code', 'Award/Organization/Division/Abbreviation'),
    ('division_name', 'Award/Organization/Division/LongName'),
    ('investigator', 'Award/Investigator/PI_FULL_NAME'),
    ('state_name', 'Award/Performance_Institution/StateName'),
    ('state_code', 'Award/Performance_Institution/StateCode'),
)


def _find_text(root, xpath):
    """Return the text of the first element matching *xpath*, or None if absent."""
    node = root.find(xpath)
    return None if node is None else node.text


def _parse_award(root):
    """Build one award record (a dict) from a parsed XML root element.

    Returns None when the ``Award/AwardID`` element is missing, so the caller
    can skip the file.
    """
    if root.find('Award/AwardID') is None:
        return None

    award = {column: _find_text(root, xpath)
             for column, xpath in _AWARD_TEXT_FIELDS}

    try:
        award['ProgramReference'] = [
            program.find('./Text').text
            for program in root.findall('.//ProgramReference')
        ]
    except AttributeError:
        # A ProgramReference without a <Text> child voids the whole list
        # (kept all-or-nothing so existing outputs do not change).
        award['ProgramReference'] = None
    return award


def get_yearly_df(path):
    """Parse every XML file in a directory into one DataFrame.

    Each file contributes one row. Scalar columns hold the text of the first
    matching element (None when the element is absent); ``ProgramReference``
    holds the list of ``Text`` values of every ``ProgramReference`` element,
    or None if any of them lacks a ``Text`` child.

    Files that cannot be parsed, or that have no ``Award/AwardID`` element,
    are skipped and their path is printed.

    Args:
        path: Directory containing the XML files (trailing slash optional).

    Returns:
        A DataFrame with a fresh 0..n-1 index and one row per parsed file,
        in file-name order. Empty if nothing could be parsed.

    Raises:
        OSError: If *path* itself cannot be listed.
    """
    print("File is processing...")

    records = []
    # Sort so the row order is reproducible; os.listdir order is arbitrary.
    for file in sorted(os.listdir(path)):
        file_path = os.path.join(path, file)
        try:
            root = ElementTree.parse(file_path).getroot()
        except Exception:
            # Best-effort batch: report the unreadable/malformed entry and go
            # on. Unlike a bare ``except`` this still lets Ctrl-C through.
            print(file_path)
            continue

        award = _parse_award(root)
        if award is None:
            print(file_path)
            continue
        records.append(award)

    # Build the frame once instead of concatenating per file (which re-copies
    # all accumulated rows on every iteration).
    df_nsf = pd.DataFrame(records)
    print("Complete")

    return df_nsf

In [None]:
# Parse every file in ./years/2021/ into a single DataFrame via get_yearly_df.
df_2021 = get_yearly_df('./years/2021/')

In [None]:
# Build one DataFrame per year, 2020 down to 2012, each from its own
# ./years/<year>/ directory. Every call re-reads and re-parses that directory.
df_2020 = get_yearly_df('./years/2020/')
df_2019 = get_yearly_df('./years/2019/')
df_2018 = get_yearly_df('./years/2018/')
df_2017 = get_yearly_df('./years/2017/')
df_2016 = get_yearly_df('./years/2016/')
df_2015 = get_yearly_df('./years/2015/')
df_2014 = get_yearly_df('./years/2014/')
df_2013 = get_yearly_df('./years/2013/')
df_2012 = get_yearly_df('./years/2012/')

In [None]:
# Build one DataFrame per year, 2012 down to 2000, each from its own
# ./years/<year>/ directory.
# NOTE(review): df_2012 is also loaded by the 2020-2012 cell; presumably it is
# repeated here so this cell can run on its own -- confirm before removing.
df_2012 = get_yearly_df('./years/2012/')
df_2011 = get_yearly_df('./years/2011/')
df_2010 = get_yearly_df('./years/2010/')
df_2009 = get_yearly_df('./years/2009/')
df_2008 = get_yearly_df('./years/2008/')
df_2007 = get_yearly_df('./years/2007/')
df_2006 = get_yearly_df('./years/2006/')
df_2005 = get_yearly_df('./years/2005/')
df_2004 = get_yearly_df('./years/2004/')
df_2003 = get_yearly_df('./years/2003/')
df_2002 = get_yearly_df('./years/2002/')
df_2001 = get_yearly_df('./years/2001/')
df_2000 = get_yearly_df('./years/2000/')

# Save DataFrames to CSV

In [None]:
# Write the 2021 frame to years_csv/df_2021.csv with to_csv defaults (the
# index is written as the first column). Assumes years_csv/ already exists.
df_2021.to_csv('years_csv/df_2021.csv')

In [9]:
# Write each yearly frame (2020 down to 2012) to years_csv/df_<year>.csv with
# to_csv defaults (the index is written as the first column).
# Assumes years_csv/ already exists and the df_<year> variables are loaded.
df_2020.to_csv('years_csv/df_2020.csv')
df_2019.to_csv('years_csv/df_2019.csv')
df_2018.to_csv('years_csv/df_2018.csv')
df_2017.to_csv('years_csv/df_2017.csv')
df_2016.to_csv('years_csv/df_2016.csv')
df_2015.to_csv('years_csv/df_2015.csv')
df_2014.to_csv('years_csv/df_2014.csv')
df_2013.to_csv('years_csv/df_2013.csv')
df_2012.to_csv('years_csv/df_2012.csv')

In [5]:
# Write each yearly frame (2012 down to 2000) to years_csv/df_<year>.csv with
# to_csv defaults (the index is written as the first column).
# Assumes years_csv/ already exists and the df_<year> variables are loaded.
# NOTE(review): df_2012.csv is also written by the 2020-2012 save cell; running
# both simply overwrites the same file.
df_2012.to_csv('years_csv/df_2012.csv')
df_2011.to_csv('years_csv/df_2011.csv')
df_2010.to_csv('years_csv/df_2010.csv')
df_2009.to_csv('years_csv/df_2009.csv')
df_2008.to_csv('years_csv/df_2008.csv')
df_2007.to_csv('years_csv/df_2007.csv')
df_2006.to_csv('years_csv/df_2006.csv')
df_2005.to_csv('years_csv/df_2005.csv')
df_2004.to_csv('years_csv/df_2004.csv')
df_2003.to_csv('years_csv/df_2003.csv')
df_2002.to_csv('years_csv/df_2002.csv')
df_2001.to_csv('years_csv/df_2001.csv')
df_2000.to_csv('years_csv/df_2000.csv')