In [1]:
import pandas

In [2]:
q_sheet_to_name = {
    "fy2019_q4": "FY 2019, 4th Q, July 1- Sept. 30, 2019",
    "fy2019_q3": "FY 2019, 3rd Q, April 1 - June 30, 2019",
    "fy2020_q1": "FY 2020, 1st Q, Oct. 1 - Dec. 31, 2019"
}

In [3]:
dfs = []
for sheet_name, quarter in q_sheet_to_name.items():
    df = pandas.read_excel('data/USCIS/Naturalizations by field office.xlsx', sheet_name=sheet_name, 
                           skiprows=8, usecols="A:G")
    df.columns = ['state', 'field_office', 'code', 
                  'Applications Received2', 'Approved3', 'Denied4', 'Pending5']

    df['state'] = df[df.columns[0]].fillna(axis=0, method='ffill').map(lambda name: name.strip() if type(name) == str else float('nan'))
    df = df.rename(columns={'Unnamed: 1':"field_office"})
    df['field_office'] = df.field_office.map(lambda name: name.strip() if type(name) == str else float('nan'))
    df['quarter'] = quarter
    df['year'] = df.quarter.map(lambda q: q[-4:])
    df['fy'] = df.quarter.map(lambda q: q.split(',')[0])
    df.rename(columns={'Unnamed: 2':'code'}, inplace=True)
    df['code'] = df['code'].map(lambda code: code.strip() if type(code) == str else float('nan'))
    
    dfs.append(df.dropna(subset=['field_office']))
pandas.concat(dfs).sample(3)

Unnamed: 0,state,field_office,code,Applications Received2,Approved3,Denied4,Pending5,quarter,year,fy
98,Ohio,Cleveland,CLE,1206,1283,84,1601,"FY 2020, 1st Q, Oct. 1 - Dec. 31, 2019",2019,FY 2020
113,Tennessee,Memphis,MEM,677,766,63,1602,"FY 2019, 4th Q, July 1- Sept. 30, 2019",2019,FY 2019
47,Illinois,Chicago,CHI,7622,7710,1141,20698,"FY 2019, 4th Q, July 1- Sept. 30, 2019",2019,FY 2019


In [4]:
new_data = pandas.concat(dfs)[['field_office',  'Applications Received2', 'Approved3', 
                    'Denied4', 'Pending5', 'quarter', 'state', 'code', 'fy']]
new_data.sample(3)

Unnamed: 0,field_office,Applications Received2,Approved3,Denied4,Pending5,quarter,state,code,fy
71,St. Louis,872.0,982.0,63.0,1620.0,"FY 2019, 4th Q, July 1- Sept. 30, 2019",Missouri,STL,FY 2019
139,Agana,,,,,"FY 2019, 3rd Q, April 1 - June 30, 2019",Guam,AGA,FY 2019
120,Houston,,,,,"FY 2019, 3rd Q, April 1 - June 30, 2019",Texas,HOU,FY 2019


In [5]:
new_data[new_data['code'] == 'IMP']

Unnamed: 0,field_office,Applications Received2,Approved3,Denied4,Pending5,quarter,state,code,fy
12,Imperial,561.0,390.0,91.0,935.0,"FY 2019, 4th Q, July 1- Sept. 30, 2019",California,IMP,FY 2019
13,Imperial,,,,,"FY 2019, 3rd Q, April 1 - June 30, 2019",California,IMP,FY 2019
12,Imperial,561.0,390.0,91.0,935.0,"FY 2020, 1st Q, Oct. 1 - Dec. 31, 2019",California,IMP,FY 2020


##### def try_convert(val):
    try:
        return float(val)
    except:
        return float(0)
for col_name in ['Applications Received2', 'Approved3', 'Denied4', 'Pending5']:
    new_data[col_name] = new_data[col_name].map(lambda val: try_convert(val))

In [7]:
master_df = pandas.read_csv('data/first_pass.csv', index_col=0)

In [8]:
master_df.sample(3)

Unnamed: 0,field_office,Applications Received2,Approved3,Denied4,Pending5,quarter,state,code,year,fy
363,Boise,1317.0,1255.0,160.0,430.0,,Idaho,BOI,,FY 2012
213,Pittsburgh,563.0,601.0,31.0,972.0,"FY 2014, 1st Q, Oct. 1-Dec. 31, 2013",Pennsylvania,PIT,2013.0,FY 2014
59,Dover AFB,,,,,"FY 2014, 1st Q, Oct. 1-Dec. 31, 2013",Delaware,DVD,2013.0,FY 2014


In [9]:
new_master = pandas.concat([master_df, new_data])

In [10]:
len(master_df), len(new_data), len(new_master)

(2639, 324, 2963)

In [11]:
new_master.to_csv('data/master_df.csv')

In [12]:
new_master[new_master['code']=='IMP']

Unnamed: 0,field_office,Applications Received2,Approved3,Denied4,Pending5,quarter,state,code,year,fy
29,Imperial,408.0,611.0,121.0,810.0,"FY 2013, 4th Q, July 1- Sept. 30, 2013",California,IMP,2013.0,FY 2013
29,Imperial,321.0,517.0,66.0,550.0,"FY 2014, 1st Q, Oct. 1-Dec. 31, 2013",California,IMP,2013.0,FY 2014
29,Imperial,357.0,337.0,53.0,546.0,"FY 2014, 2nd Q, Jan. 1-March 31, 2014",California,IMP,2014.0,FY 2014
29,Imperial,458.0,328.0,51.0,622.0,"FY 2014, 3rd Q, April 1-June 30, 2014",California,IMP,2014.0,FY 2014
29,Imperial,299.0,400.0,81.0,421.0,"FY 2014, 4th Q, July 1-Sept. 30, 2014",California,IMP,2014.0,FY 2014
29,Imperial,231.0,241.0,71.0,340.0,"FY 2015, 1st Q, Oct. 1-Dec. 31, 2014",California,IMP,2014.0,FY 2015
29,Imperial,314.0,215.0,42.0,399.0,"FY 2015, 2nd Q, Jan. 1-March 31, 2015",California,IMP,2015.0,FY 2015
29,Imperial,378.0,217.0,53.0,508.0,"FY 2015, 3rd Q, April 1-June 30, 2015",California,IMP,2015.0,FY 2015
29,Imperial,298.0,307.0,46.0,476.0,"FY 2015, 4th Q, July 1-Sept. 30, 2015",California,IMP,2015.0,FY 2015
29,Imperial,363.0,264.0,37.0,557.0,"FY 2016, 1st Q, Oct. 1-Dec.31, 2015",California,IMP,2015.0,FY 2016
