In [1]:
import pandas

In [2]:
q_sheet_to_name = {
    "fy2019_q4": "FY 2019, 4th Q, July 1- Sept. 30, 2019",
    "fy2019_q3": "FY 2019, 3rd Q, April 1 - June 30, 2019",
    "fy2020_q1": "FY 2020, 1st Q, Oct. 1 - Dec. 31, 2019"
}

In [3]:
dfs = []
for sheet_name, quarter in q_sheet_to_name.items():
    df = pandas.read_excel('data/USCIS/Naturalizations by field office.xlsx', sheet_name=sheet_name, 
                           skiprows=8, usecols="A:I")
    df.columns = ['state', 'field_office', 'code', 'Field Office by State6', 'Field Office Code', 
                  'Applications Received2', 'Approved3', 'Denied4', 'Pending5']

    df['state'] = df[df.columns[0]].fillna(axis=0, method='ffill').map(lambda name: name.strip() if type(name) == str else float('nan'))
    df = df.rename(columns={'Unnamed: 1':"field_office"})
    df['field_office'] = df.field_office.map(lambda name: name.strip() if type(name) == str else float('nan'))
    df['quarter'] = quarter
    df['year'] = df.quarter.map(lambda q: q[-4:])
    df['fy'] = df.quarter.map(lambda q: q.split(',')[0])
    df.rename(columns={'Unnamed: 2':'code'}, inplace=True)
    df['code'] = df['code'].map(lambda code: code.strip() if type(code) == str else float('nan'))
    
    dfs.append(df.dropna(subset=['field_office']))
pandas.concat(dfs).sample(3)

Unnamed: 0,state,field_office,code,Field Office by State6,Field Office Code,Applications Received2,Approved3,Denied4,Pending5,quarter,year,fy
78,Nevada,Reno,REN,388.0,354.0,20.0,539.0,D,-,"FY 2019, 4th Q, July 1- Sept. 30, 2019",2019,FY 2019
141,U.S. Virgin Islands,Charlotte Amalie,CHA,,,,,,,"FY 2019, 3rd Q, April 1 - June 30, 2019",2019,FY 2019
110,South Carolina,Charleston,CHL,663.0,813.0,58.0,1157.0,24,26,"FY 2020, 1st Q, Oct. 1 - Dec. 31, 2019",2019,FY 2020


In [4]:
new_data = pandas.concat(dfs)[['field_office',  'Applications Received2', 'Approved3', 
                    'Denied4', 'Pending5', 'quarter', 'state', 'code', 'fy']]
new_data.sample(3)

Unnamed: 0,field_office,Applications Received2,Approved3,Denied4,Pending5,quarter,state,code,fy
141,Charlotte Amalie,,,,,"FY 2019, 3rd Q, April 1 - June 30, 2019",U.S. Virgin Islands,CHA,FY 2019
36,Oakland Park,415,10180,D,D,"FY 2020, 1st Q, Oct. 1 - Dec. 31, 2019",Florida,OKL,FY 2020
10,Chula Vista,-,-,-,-,"FY 2020, 1st Q, Oct. 1 - Dec. 31, 2019",California,CVC,FY 2020


In [5]:
def try_convert(val):
    try:
        return float(val)
    except:
        return float(0)
for col_name in ['Applications Received2', 'Approved3', 'Denied4', 'Pending5']:
    new_data[col_name] = new_data[col_name].map(lambda val: try_convert(val))

In [6]:
master_df = pandas.read_csv('data/master_df.csv', index_col=0)

In [7]:
master_df.sample(3)

Unnamed: 0,field_office,Applications Received2,Approved3,Denied4,Pending5,quarter,state,code,year,fy
45,San Jose,4419.0,3367.0,285.0,5804.0,"FY 2014, 3rd Q, April 1-June 30, 2014",California,SNJ,2014.0,FY 2014
67,Fort Meyers,1203.0,382.0,20.0,2202.0,"FY 2017, 1st Q, Oct. 1-Dec. 31, 2016",Florida,OFM,2016.0,FY 2017
81,Orlando,2739.0,2939.0,511.0,9121.0,"FY 2019, 2nd Q, Jan. 1-Mar. 31, 2019",Florida,ORL,2019.0,FY 2019


In [8]:
new_master = pandas.concat([master_df, new_data])

In [9]:
len(master_df), len(new_data), len(new_master)

(2639, 324, 2963)

In [10]:
new_master.to_csv('data/master_df.csv')