In [3]:
import pandas

In [12]:
q_sheet_to_name = {
    "fy2019_q4": "FY 2019, 4th Q, July 1- Sept. 30, 2019",
    "fy2019_q3": "FY 2019, 3rd Q, April 1 - June 30, 2019",
    "fy2020_q1": "FY 2020, 1st Q, Oct. 1 - Dec. 31, 2019",
    "fy2020_q2_values": "FY 2020, 2nd Q, Jan. 1 - March 31 2020"
}

In [20]:
dfs = []
for sheet_name, quarter in q_sheet_to_name.items():
    df = pandas.read_excel('data/USCIS/Naturalizations by field office.xlsx', sheet_name=sheet_name, 
                           skiprows=8, usecols="A:G")
    df.columns = ['state', 'field_office', 'code', 
                  'Applications Received2', 'Approved3', 'Denied4', 'Pending5']

    df['state'] = df[df.columns[0]].fillna(axis=0, method='ffill').map(lambda name: name.strip() if type(name) == str else float('nan'))
    df = df.rename(columns={'Unnamed: 1':"field_office"})
    df['field_office'] = df.field_office.map(lambda name: name.strip() if type(name) == str else float('nan'))
    df['quarter'] = quarter
    df['year'] = df.quarter.map(lambda q: q[-4:])
    df['fy'] = df.quarter.map(lambda q: q.split(',')[0])
    df.rename(columns={'Unnamed: 2':'code'}, inplace=True)
    df['code'] = df['code'].map(lambda code: code.strip() if type(code) == str else float('nan'))
    
    dfs.append(df.dropna(subset=['field_office']))
pandas.concat(dfs).sample(6)

Unnamed: 0,state,field_office,code,Applications Received2,Approved3,Denied4,Pending5,quarter,year,fy
95,North Carolina,Charlotte,CLT,,,,,"FY 2019, 3rd Q, April 1 - June 30, 2019",2019,FY 2019
35,Florida,Kendall,KND,,,,,"FY 2019, 3rd Q, April 1 - June 30, 2019",2019,FY 2019
16,California,San Bernardino,SBD,4835.0,5023.0,562.0,11225.0,"FY 2020, 1st Q, Oct. 1 - Dec. 31, 2019",2019,FY 2020
46,Idaho,Boise,BOI,4080.0,4852.0,684.0,8272.0,"FY 2019, 3rd Q, April 1 - June 30, 2019",2019,FY 2019
70,Missouri,Kansas City,KAN,1269.0,1498.0,95.0,1683.0,"FY 2020, 1st Q, Oct. 1 - Dec. 31, 2019",2019,FY 2020
128,Washington,Seattle,SEA,4603.0,2539.0,273.0,18639.0,"FY 2020, 1st Q, Oct. 1 - Dec. 31, 2019",2019,FY 2020


In [21]:
new_data = pandas.concat(dfs)[['field_office',  'Applications Received2', 'Approved3', 
                    'Denied4', 'Pending5', 'quarter', 'state', 'code', 'fy']]
new_data.sample(3)

Unnamed: 0,field_office,Applications Received2,Approved3,Denied4,Pending5,quarter,state,code,fy
100,Columbus,4280.0,4346.0,225.0,16802.0,"FY 2019, 3rd Q, April 1 - June 30, 2019",Ohio,CLM,FY 2019
99,Columbus,1731.0,1665.0,152.0,2737.0,"FY 2019, 4th Q, July 1- Sept. 30, 2019",Ohio,CLM,FY 2019
181,Ciudad Juarez,,,,,"FY 2019, 4th Q, July 1- Sept. 30, 2019",Mexico,,FY 2019


In [22]:
new_data[new_data['code'] == 'IMP']

Unnamed: 0,field_office,Applications Received2,Approved3,Denied4,Pending5,quarter,state,code,fy
12,Imperial,561.0,390.0,91.0,935.0,"FY 2019, 4th Q, July 1- Sept. 30, 2019",California,IMP,FY 2019
13,Imperial,,,,,"FY 2019, 3rd Q, April 1 - June 30, 2019",California,IMP,FY 2019
12,Imperial,561.0,390.0,91.0,935.0,"FY 2020, 1st Q, Oct. 1 - Dec. 31, 2019",California,IMP,FY 2020
10,Imperial,477.0,320.0,115.0,1296.0,"FY 2020, 2nd Q, Jan. 1 - March 31 2020",California,IMP,FY 2020


##### def try_convert(val):
    try:
        return float(val)
    except:
        return float(0)
for col_name in ['Applications Received2', 'Approved3', 'Denied4', 'Pending5']:
    new_data[col_name] = new_data[col_name].map(lambda val: try_convert(val))

Concat in any PDFs converted to CSV

In [24]:
master_df = pandas.read_csv('data/first_pass.csv', index_col=0)

In [25]:
master_df.sample(3)

Unnamed: 0,field_office,Applications Received2,Approved3,Denied4,Pending5,quarter,state,code,year,fy
29,Chula Vista,,,,,"FY 2018, 3rd Q, April 1-June 30, 2018",California,CVC,2018.0,FY 2018
35,Sacramento,3427.0,2507.0,264.0,4183.0,"FY 2015, 2nd Q, Jan. 1-March 31, 2015",California,SAC,2015.0,FY 2015
235,Houston,5907.0,4204.0,312.0,17379.0,"FY 2014, 4th Q, July 1-Sept. 30, 2014",Texas,HOU,2014.0,FY 2014


In [26]:
new_master = pandas.concat([master_df, new_data])

In [27]:
len(master_df), len(new_data), len(new_master)

(2639, 439, 3078)

In [28]:
new_master.to_csv('data/master_df.csv')

In [29]:
new_master[new_master['code']=='IMP']

Unnamed: 0,field_office,Applications Received2,Approved3,Denied4,Pending5,quarter,state,code,year,fy
29,Imperial,408.0,611.0,121.0,810.0,"FY 2013, 4th Q, July 1- Sept. 30, 2013",California,IMP,2013.0,FY 2013
29,Imperial,321.0,517.0,66.0,550.0,"FY 2014, 1st Q, Oct. 1-Dec. 31, 2013",California,IMP,2013.0,FY 2014
29,Imperial,357.0,337.0,53.0,546.0,"FY 2014, 2nd Q, Jan. 1-March 31, 2014",California,IMP,2014.0,FY 2014
29,Imperial,458.0,328.0,51.0,622.0,"FY 2014, 3rd Q, April 1-June 30, 2014",California,IMP,2014.0,FY 2014
29,Imperial,299.0,400.0,81.0,421.0,"FY 2014, 4th Q, July 1-Sept. 30, 2014",California,IMP,2014.0,FY 2014
29,Imperial,231.0,241.0,71.0,340.0,"FY 2015, 1st Q, Oct. 1-Dec. 31, 2014",California,IMP,2014.0,FY 2015
29,Imperial,314.0,215.0,42.0,399.0,"FY 2015, 2nd Q, Jan. 1-March 31, 2015",California,IMP,2015.0,FY 2015
29,Imperial,378.0,217.0,53.0,508.0,"FY 2015, 3rd Q, April 1-June 30, 2015",California,IMP,2015.0,FY 2015
29,Imperial,298.0,307.0,46.0,476.0,"FY 2015, 4th Q, July 1-Sept. 30, 2015",California,IMP,2015.0,FY 2015
29,Imperial,363.0,264.0,37.0,557.0,"FY 2016, 1st Q, Oct. 1-Dec.31, 2015",California,IMP,2015.0,FY 2016
