In [1]:
import requests
import pandas

Playing around with the USCIS immigration forms data search gave this URL for a results page, which links to PDFs and CSVs that go back to 2012.

In [2]:
# Fetch the USCIS "Immigration Forms Data" search-results page (topic_id 20709, up to 200 items per page).
# requests applies no timeout by default, so set one to keep the notebook from hanging on a stalled connection.
r = requests.get('https://www.uscis.gov/tools/reports-studies/immigration-forms-data?topic_id=20709&field_native_doc_issue_date_value%5Bvalue%5D%5Bmonth%5D=&field_native_doc_issue_date_value_1%5Bvalue%5D%5Byear%5D=&combined=&items_per_page=200', timeout=30)

In [3]:
# Display the HTTP status code of the search-page response stored in `r`.
r.status_code

200

Looks like the request was successful, let's get links to the data sources.

In [4]:
from bs4 import BeautifulSoup

BeautifulSoup is a library for HTML parsing, super useful for web scraping.

In [5]:
# Parse the response body into a searchable tree.
# Name the parser explicitly: without it BeautifulSoup picks whichever parser happens to be
# installed (and emits a GuessedAtParserWarning), so results could differ between machines.
soup = BeautifulSoup(r.content, 'html.parser')

Inspecting one of the links for a CSV, I see that it has these properties that I can query by.

In [6]:
# Collect every <a> tag whose `type` attribute marks it as a CSV download.
# find_all is the current method name; findAll is a deprecated alias of it.
csvs = soup.find_all("a", attrs={"type": "text/csv"})
# show how many links were found, plus the first one as a sample
len(csvs), csvs[0]

(29,
 <a href="https://www.uscis.gov/sites/default/files/USCIS/Resources/Reports%20and%20Studies/Immigration%20Forms%20Data/Naturalization%20Data/N400_performancedata_fy2019_qtr2.csv" length="20453" type="text/csv"> Form N-400, Application for Naturalization, by Category of Naturalization, Case Status, and USCIS Field Office Location (Fiscal Year 2019, 2nd Quarter, Jan. 1-Mar. 31, 2019)  (CSV, 20 KB)</a>)

Looks like we should cut off the first 4 lines. We can iterate through all the HTML objects and query on the link it's linking to, saving to files. And here's how we can name them.

In [7]:
# The reporting period is the first parenthesised chunk of the link text.
first_link_text = csvs[0].text
after_open_paren = first_link_text.split('(')[1]
after_open_paren.split(')')[0]

'Fiscal Year 2019, 2nd Quarter, Jan. 1-Mar. 31, 2019'

In [8]:
# The CSVs use a less common text encoding; this one was found by trial and error.
encoding = "windows-1252"

In [9]:
# Download every linked CSV and save it under a name derived from the link text,
# e.g. "FY 2019, 2nd Q, Jan. 1-Mar. 31, 2019.csv".
for obj in csvs:
    response = requests.get(obj.get('href'), timeout=30)
    # fail loudly instead of saving an HTML error page as if it were a CSV
    response.raise_for_status()
    text = response.content.decode(encoding)
    # the first parenthesised chunk of the link text describes the reporting period
    csv_fname = obj.text.split('(')[1].split(')')[0]
    csv_fname = csv_fname.replace("Fiscal Year", "FY")
    csv_fname = csv_fname.replace("Quarter", "Q")
    # Write with the same encoding that pandas.read_csv is given later in this notebook,
    # and newline='' so the original line endings are kept as-is on every platform.
    with open('data/USCIS/' + csv_fname + '.csv', 'w', encoding=encoding, newline='') as f:
        f.write(text)

## Concatenate raw CSVs

In [10]:
import os
# os.listdir returns entries in arbitrary (filesystem-dependent) order; sort so the listing is reproducible
sorted(os.listdir('data/USCIS'))

['FY 2013, 4th Q, July 1- Sept. 30, 2013.csv',
 'FY 2014, 1st Q, Oct. 1-Dec. 31, 2013.csv',
 'FY 2014, 2nd Q, Jan. 1-March 31, 2014.csv',
 'FY 2014, 3rd Q, April 1-June 30, 2014.csv',
 'FY 2014, 4th Q, July 1-Sept. 30, 2014.csv',
 'FY 2015, 1st Q, Oct. 1-Dec. 31, 2014.csv',
 'FY 2015, 2nd Q, Jan. 1-March 31, 2015.csv',
 'FY 2015, 3rd Q, April 1-June 30, 2015.csv',
 'FY 2015, 4th Q, July 1-Sept. 30, 2015.csv',
 'FY 2016, 1st Q, Oct. 1-Dec.31, 2015.csv',
 'FY 2016, 2nd Q, Jan. 1-March 31, 2016.csv',
 'FY 2016, 3rd Q, April 1-June 30, 2016.csv',
 'FY 2016, 4th Q, July 1-Sept. 30, 2016.csv',
 'FY 2017, 1st Q, Oct. 1-Dec. 31, 2016.csv',
 'FY 2017, 2nd Q, Jan. 1-March 31, 2017.csv',
 'FY 2017, 3rd Q, April 1-June 30, 2017.csv',
 'FY 2017, 4th Q, July 1-Sept. 30, 2017.csv',
 'FY 2018, 1st Q, Oct. 1-Dec. 31, 2017.csv',
 'FY 2018, 2nd Q, Jan. 1-March 31, 2018.csv',
 'FY 2018, 3rd Q, April 1-June 30, 2018.csv',
 'FY 2018, 4th Q, July 1-Sept. 30, 2018.csv',
 'FY 2019, 1st Q, Oct. 1-Dec. 31, 2018.

Looks like the format is different for some of them, let's take a peek at the first 10 lines of each file.

The field office city is always the 2nd column (for all the FY CSVs post-2013 Q4), so that's good. Let's try loading one into a DataFrame and dropping all the rows without field office names. Pandas numbers unnamed columns starting from 0, so the second column will be called `Unnamed: 1`.

I spent a bunch of time trying to automatically extract the correct column names before realizing I could save time by just deleting everything above what I wanted to use. That's what the `data/USCIS/massaged` folder is.

In [11]:
# Load every hand-trimmed quarterly CSV into its own DataFrame and collect them in `dfs`.
dfs = []

# sorted() makes the load order (and therefore the row order after concatenation) reproducible;
# os.listdir on its own returns entries in arbitrary order
for fname in sorted(os.listdir('data/USCIS/massaged')):
    if not fname.endswith('.csv'):
        # ignore anything in the folder that is not a CSV (hidden files, checkpoint folders, ...)
        continue
    if 'FYs' in fname:
        # skip the 2010-2013 CSVs for now
        continue
    df = pandas.read_csv('data/USCIS/massaged/' + fname, header=0, encoding=encoding)
    # make sure whitespace is removed so all column names are exactly the same
    df.columns = [name.strip() for name in df.columns]
    # the second (unnamed) column holds the field office name
    df = df.rename(columns={'Unnamed: 1': "field_office"})
    # remove the whitespace from the names; anything that is not a string becomes NaN
    df['field_office'] = df.field_office.map(lambda name: name.strip() if isinstance(name, str) else float('nan'))
    # keep track of which quarter this is from
    df['quarter'] = fname.split('.csv')[0]
    # fill the state downwards so it's on each row until the next state
    # (Series.ffill replaces the deprecated fillna(method='ffill'))
    df['state'] = df[df.columns[0]].ffill().map(lambda name: name.strip() if isinstance(name, str) else float('nan'))
    # now we can drop any rows that have no city specified
    df.dropna(subset=['field_office'], inplace=True)

    dfs.append(df)
    # keep track of how many columns are in each quarter's CSV and whether there are any non-unique column names (looks like there aren't)
    print(fname, len(df.columns), len(set(df.columns)))

FY 2013, 4th Q, July 1- Sept. 30, 2013.csv 16 16
FY 2014, 1st Q, Oct. 1-Dec. 31, 2013.csv 16 16
FY 2014, 2nd Q, Jan. 1-March 31, 2014.csv 16 16
FY 2014, 3rd Q, April 1-June 30, 2014.csv 16 16
FY 2014, 4th Q, July 1-Sept. 30, 2014.csv 16 16
FY 2015, 1st Q, Oct. 1-Dec. 31, 2014.csv 16 16
FY 2015, 2nd Q, Jan. 1-March 31, 2015.csv 17 17
FY 2015, 3rd Q, April 1-June 30, 2015.csv 17 17
FY 2015, 4th Q, July 1-Sept. 30, 2015.csv 17 17
FY 2016, 1st Q, Oct. 1-Dec.31, 2015.csv 17 17
FY 2016, 2nd Q, Jan. 1-March 31, 2016.csv 17 17
FY 2016, 3rd Q, April 1-June 30, 2016.csv 17 17
FY 2016, 4th Q, July 1-Sept. 30, 2016.csv 17 17
FY 2017, 1st Q, Oct. 1-Dec. 31, 2016.csv 17 17
FY 2017, 2nd Q, Jan. 1-March 31, 2017.csv 17 17
FY 2017, 3rd Q, April 1-June 30, 2017.csv 17 17
FY 2017, 4th Q, July 1-Sept. 30, 2017.csv 17 17
FY 2018, 1st Q, Oct. 1-Dec. 31, 2017.csv 17 17
FY 2018, 2nd Q, Jan. 1-March 31, 2018.csv 17 17
FY 2018, 3rd Q, April 1-June 30, 2018.csv 17 17
FY 2018, 4th Q, July 1-Sept. 30, 2018.csv 17 

In [12]:
# Stack every quarter into a single frame.
df = pandas.concat(dfs)

# Columns we don't care about, like military naturalizations. Both spellings of each
# suffixed name (with and without a space before the suffix) are listed.
unwanted_columns = [
    'Unnamed: 0',
    'Applications Received2 .1',
    'Applications Received2.1',
    'Applications Received2.2',
    'Applications Received2 .2',
    'Approved3.1',
    'Approved3 .1',
    'Approved3 .2',
    'Denied4.1',
    'Denied4 .2',
    'Pending5.1',
    'Approved3.2',
    'Denied4 .1',
    'Denied4.2',
    'Pending5 .1',
    'Pending5 .2',
    'Pending5.2',
    'USCIS Field Office or Service Center Location',
]
df = df.drop(columns=unwanted_columns)

# the third unnamed column is exposed as `code`; strip it, and turn non-strings into NaN
df = df.rename(columns={'Unnamed: 2': 'code'})
df['code'] = df['code'].map(lambda code: code.strip() if type(code) == str else float('nan'))
df

Unnamed: 0,field_office,Applications Received2,Approved3,Denied4,Pending5,quarter,state,code
11,Anchorage,257,316,39,365,"FY 2013, 4th Q, July 1- Sept. 30, 2013",Alaska,
15,Phoenix,2759,2399,312,4700,"FY 2013, 4th Q, July 1- Sept. 30, 2013",Arizona,
17,Tucson,636,513,80,1166,"FY 2013, 4th Q, July 1- Sept. 30, 2013",Arizona,
21,Fort Smith,232,202,22,411,"FY 2013, 4th Q, July 1- Sept. 30, 2013",Arkansas,
25,Chula Vista,1582,1928,121,2004,"FY 2013, 4th Q, July 1- Sept. 30, 2013",California,
...,...,...,...,...,...,...,...,...
419,Moscow,,,,,"FY 2019, 2nd Q, Jan. 1-Mar. 31, 2019",Russia,
423,Johannesburg,,,,,"FY 2019, 2nd Q, Jan. 1-Mar. 31, 2019",South Africa,
427,Seoul,,,,,"FY 2019, 2nd Q, Jan. 1-Mar. 31, 2019",South Korea,
431,Bangkok,,,,,"FY 2019, 2nd Q, Jan. 1-Mar. 31, 2019",Thailand,


Make sure that the state is correct by looking at all the state,city pairs

In [13]:
# Print each distinct "field office, state" label once, in order of first appearance.
printed = set()
for office, state in zip(df['field_office'], df['state']):
    full_name = f'{office}, {state}'
    if full_name not in printed:
        printed.add(full_name)
        print(full_name)

Anchorage, Alaska
Phoenix, Arizona
Tucson, Arizona
Fort Smith, Arkansas
Chula Vista, California
Fresno, California
Imperial, California
Los Angeles, California
Los Angeles County, California
Sacramento, California
San Bernardino, California
San Diego, California
San Fernando Valley, California
San Francisco, California
San Jose, California
Santa Ana, California
Denver, Colorado
Hartford, Connecticut
Dover AFB, Delaware
Washington, District of Columbia
Hialeah, Florida
Jacksonville, Florida
Kendall, Florida
Miami, Florida
Oakland Park, Florida
Orlando, Florida
Tampa, Florida
West Palm Beach, Florida
Atlanta, Georgia
Honolulu, Hawaii
Boise, Idaho
Chicago, Illinois
Indianapolis, Indiana
Des Moines, Iowa
Wichita, Kansas
Louisville, Kentucky
New Orleans, Louisiana
Portland, Maine
Baltimore, Maryland
Boston, Massachusetts
Lawrence, Massachusetts
Detroit, Michigan
St. Paul, Minnesota
Kansas City, Missouri
St. Louis, Missouri
Helena, Montana
Omaha, Nebraska
Las Vegas, Nevada
Reno, Nevada
Manch

---

Everything looks right

## Add in year + FY

In [14]:
# calendar year: the last four characters of the quarter label
df['year'] = [label[-4:] for label in df.quarter]
# fiscal year: everything before the first comma of the quarter label
df['fy'] = [label.split(',')[0] for label in df.quarter]
df.sample(5)

Unnamed: 0,field_office,Applications Received2,Approved3,Denied4,Pending5,quarter,state,code,year,fy
199,New York,6593.0,6244.0,1599.0,29542.0,"FY 2018, 1st Q, Oct. 1-Dec. 31, 2017",New York,NYC,2017,FY 2018
241,Salt Lake City,1274.0,1118.0,78.0,1459.0,"FY 2015, 3rd Q, April 1-June 30, 2015",Utah,SLC,2015,FY 2015
351,Santo Domingo,,,,,"FY 2018, 4th Q, July 1-Sept. 30, 2018",Dominican Republic,,2018,FY 2018
203,Oklahoma City,800.0,287.0,35.0,2056.0,"FY 2014, 3rd Q, April 1-June 30, 2014",Oklahoma,,2014,FY 2014
401,Amman,,,,,"FY 2018, 1st Q, Oct. 1-Dec. 31, 2017",Jordan,,2017,FY 2018


## Adding in older CSVs

First we'll need a way of looking up the city given the field office code.

In [15]:
# Lookup tables between field office codes, office (city) names and states.
code_to_city = {}
code_to_state = {}
city_to_code = {}
# only rows that actually carry a code can contribute; later rows overwrite earlier ones
rows_with_code = df.dropna(subset=['code'])
for office, state, code in zip(rows_with_code['field_office'], rows_with_code['state'], rows_with_code['code']):
    office, state, code = office.strip(), state.strip(), code.strip()
    code_to_city[code] = office
    code_to_state[code] = state
    city_to_code[office] = code
# spot-check one code
code_to_city['ANC'], code_to_state['ANC']

('Anchorage', 'Alaska')

There are a few missing entries I discovered when running the code below, so I'll put them in here.

In [16]:
# discovered a few missing entries
code_to_city['CSC'] = float('nan')
code_to_state['CSC'] = 'California'

code_to_city['NBC'] = 'National Benefits Center'
code_to_state['NBC'] = float('nan')

code_to_city['NSC'] = 'NEBRASKA SERVICE CENTER'
code_to_state['NSC'] = 'Nebraska'

code_to_city['VSC'] = 'VERMONT SERVICE CENTER'
code_to_state['VSC'] = 'Vermont'

code_to_city['TSC'] = 'TEXAS SERVICE CENTER'
code_to_state['TSC'] = 'Texas'

# don't know what this is, and it doesn't have a label
code_to_city['GCU'] = "Garden City"
code_to_state['GCU'] = "New York"

This opens the single CSV covering the fiscal years 2010, 2011, 2012, and 2013, cleans up and/or renames columns for concatenation with the dataframe from above.

In [17]:
# Load the single CSV covering fiscal years 2010-2013 and reshape it to line up with `df`.
field_office_code = 'code'

df2 = pandas.read_csv('data/USCIS/massaged/FYs 2010-2013, Oct. 2009-Dec. 2012.csv', encoding=encoding)
# keep only the rows that carry a field office code
df2.dropna(subset=[field_office_code], inplace=True)
# explicit dict indexing so an unknown code raises KeyError instead of silently becoming NaN
df2['state'] = df2[field_office_code].map(lambda code: code_to_state[code])
df2['field_office'] = df2[field_office_code].map(lambda code: code_to_city[code])
# Forward-fill the fiscal year and assign it back. Calling fillna(..., inplace=True) on the
# selected column is a chained assignment that does not reliably update df2 (it is a no-op
# under pandas copy-on-write), and fillna(method='ffill') is deprecated in favour of ffill().
df2['Fiscal Year'] = df2['Fiscal Year'].ffill()
df2.rename(columns={'Fiscal Year': 'fy'}, inplace=True)
# same "FY <year>" label format as the quarterly frame
df2['fy'] = df2.fy.map(lambda y: 'FY ' + y)

# drop the repeated metric columns that are not needed (names still carry their original spacing here)
df2 = df2.drop(columns=[' Receipts .1', ' Approvals .1', ' Denials .1',
                        ' Pending .1', ' Receipts .2', ' Pending .2'])
df2.columns = [name.strip() for name in df2.columns]
df2

Unnamed: 0,fy,code,Description,Receipts,Approvals,Denials,Pending,Completions,state,field_office
1,FY 2010,ABQ,ALBUQUERQUE NM,1657,1560,75,604,1657,New Mexico,Albuquerque
3,FY 2010,AGA,AGANA GUAM,672,650,55,304,765,Guam,Agana
5,FY 2010,ALB,ALBANY NY,2018,1809,123,774,1936,New York,Albany
7,FY 2010,ANC,ANCHORAGE AK,956,775,98,385,912,Alaska,Anchorage
9,FY 2010,ATL,ATLANTA GA,21342,19368,1766,8938,21937,Georgia,Atlanta
...,...,...,...,...,...,...,...,...,...,...
699,FY 2013,TUC,TUCSON AZ/SUB,496,554,102,970,666,Arizona,Tucson
701,FY 2013,WAS,WASHINGTON DC,3988,3955,368,9426,4342,District of Columbia,Washington
703,FY 2013,WIC,WICHITA KS,258,178,31,543,210,Kansas,Wichita
705,FY 2013,WPB,WEST PALM BEACH FL,2631,2790,574,6182,3376,Florida,West Palm Beach


Before we combine these, let's make sure there's a code on each row.

In [18]:
# Make sure every row has a code. Looking the code up by city name alone is ambiguous:
# Portland, Maine (POM) and Portland, Oregon (POO) are different offices, and a city-only
# lookup hands both of them whichever code was stored last. Key on (city, state) instead,
# falling back to the city-only lookup for pairs that never appeared together with a code.
rows_with_code = df.dropna(subset=['code'])
office_state_to_code = dict(zip(
    zip(rows_with_code['field_office'], rows_with_code['state']),
    rows_with_code['code'],
))
df['code'] = [
    office_state_to_code.get((city, state), city_to_code.get(city, float('nan')))
    for city, state in zip(df['field_office'], df['state'])
]

In [19]:
# Stack the quarterly frame on top of the yearly one; columns present in only one of them end up NaN in the other's rows.
frames = [df, df2]
df3 = pandas.concat(frames)
# display only, the sorted copy is not kept
df3.sort_values('fy')

Unnamed: 0,field_office,Applications Received2,Approved3,Denied4,Pending5,quarter,state,code,year,fy,Description,Receipts,Approvals,Denials,Pending,Completions
1,Albuquerque,,,,,,New Mexico,ABQ,,FY 2010,ALBUQUERQUE NM,1657,1560,75,604,1657
125,Providence,,,,,,Rhode Island,PRO,,FY 2010,PROVIDENCE RI,2510,2053,189,949,2272
123,Portland,,,,,,Oregon,POO,,FY 2010,PORTLAND OR,6040,5312,311,2341,5659
121,Portland,,,,,,Maine,POM,,FY 2010,PORTLAND ME,888,822,62,245,900
119,Pittsburgh,,,,,,Pennsylvania,PIT,,FY 2010,PITTSBURGH PA,2509,2445,70,976,2534
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,Boise,463,366,64,695,"FY 2019, 2nd Q, Jan. 1-Mar. 31, 2019",Idaho,BOI,2019,FY 2019,,,,,,
93,Honolulu,851,935,146,3446,"FY 2019, 2nd Q, Jan. 1-Mar. 31, 2019",Hawaii,HHW,2019,FY 2019,,,,,,
89,Atlanta,4861,5539,391,22458,"FY 2019, 2nd Q, Jan. 1-Mar. 31, 2019",Georgia,ATL,2019,FY 2019,,,,,,
83,Tampa,2989,3456,533,4742,"FY 2019, 2nd Q, Jan. 1-Mar. 31, 2019",Florida,TAM,2019,FY 2019,,,,,,


We have pairs of columns with the same meaning but different names (e.g. `Applications Received2` and `Receipts`) that can be converted to numeric values and merged.

In [20]:
def add_columns(_df, col1, col2):
    """Return the element-wise numeric sum of two columns of ``_df``.

    Missing values count as 0, thousands separators (",") are removed before
    parsing, and anything that still cannot be parsed becomes NaN (which makes
    the sum for that row NaN).

    Works for columns holding strings, numbers, or a mix of both. A ``.str``-only
    implementation raises on numeric columns and turns numeric entries of mixed
    columns into NaN, so each value is converted through ``str()`` instead.

    Parameters:
        _df: DataFrame containing both columns.
        col1, col2: names of the columns to add.

    Returns:
        A numeric Series indexed like ``_df``.
    """
    def _as_number(column):
        # NaN/None -> '0'; everything else -> its text form with commas stripped
        as_text = column.map(lambda value: '0' if pandas.isna(value) else str(value).replace(',', ''))
        return pandas.to_numeric(as_text, errors='coerce')

    return _as_number(_df[col1]) + _as_number(_df[col2])

In [21]:
# (column kept, column folded into it) pairs that hold the same metric under different names
column_pairs = [
    ('Applications Received2', 'Receipts'),
    ('Approved3', 'Approvals'),
    ('Denied4', 'Denials'),
    ('Pending5', 'Pending'),
]
for kept_col, folded_col in column_pairs:
    df3[kept_col] = add_columns(df3, kept_col, folded_col)

# the folded columns are no longer needed once their values have been added in
for _, folded_col in column_pairs:
    del df3[folded_col]

The data transitions from yearly to quarterly in 2013, and it's not immediately clear if there's overlap or not.

In [22]:
# Show every FY 2013 row for the ANC code.
is_fy2013 = df3['fy'] == 'FY 2013'
is_anc = df3['code'] == 'ANC'
df3.loc[is_fy2013 & is_anc]

Unnamed: 0,field_office,Applications Received2,Approved3,Denied4,Pending5,quarter,state,code,year,fy,Description,Completions
11,Anchorage,257.0,316.0,39.0,365.0,"FY 2013, 4th Q, July 1- Sept. 30, 2013",Alaska,ANC,2013.0,FY 2013,,
537,Anchorage,258.0,287.0,52.0,368.0,,Alaska,ANC,,FY 2013,ANCHORAGE AK,349.0


In [23]:
# Display five randomly chosen rows of `df` (no random_state is set, so the rows differ between runs).
df.sample(5)

Unnamed: 0,field_office,Applications Received2,Approved3,Denied4,Pending5,quarter,state,code,year,fy
153,Omaha,688,555,59,1044,"FY 2015, 1st Q, Oct. 1-Dec. 31, 2014",Nebraska,OMA,2014,FY 2015
197,Long Island,3639,4513,499,12306,"FY 2018, 1st Q, Oct. 1-Dec. 31, 2017",New York,LNY,2017,FY 2018
241,Salt Lake City,907,972,88,2398,"FY 2013, 4th Q, July 1- Sept. 30, 2013",Utah,SLC,2013,FY 2013
265,Milwaukee,1207,320,75,3034,"FY 2017, 1st Q, Oct. 1-Dec. 31, 2016",Wisconsin,MIL,2016,FY 2017
11,Anchorage,311,215,35,501,"FY 2015, 1st Q, Oct. 1-Dec. 31, 2014",Alaska,ANC,2014,FY 2015


In [24]:
# Display five randomly chosen rows of `df3` (no random_state is set, so the rows differ between runs).
df3.sample(5)

Unnamed: 0,field_office,Applications Received2,Approved3,Denied4,Pending5,quarter,state,code,year,fy,Description,Completions
113,Louisville,711.0,608.0,113.0,961.0,"FY 2015, 1st Q, Oct. 1-Dec. 31, 2014",Kentucky,LOU,2014,FY 2015,,
29,Imperial,387.0,258.0,61.0,626.0,"FY 2016, 2nd Q, Jan. 1-March 31, 2016",California,IMP,2016,FY 2016,,
69,Jacksonville,970.0,621.0,77.0,1963.0,"FY 2015, 1st Q, Oct. 1-Dec. 31, 2014",Florida,JAC,2014,FY 2015,,
331,Beijing,,,,,"FY 2016, 4th Q, July 1-Sept. 30, 2016",China,,2016,FY 2016,,
41,San Fernando Valley,3412.0,2492.0,221.0,6992.0,"FY 2016, 1st Q, Oct. 1-Dec.31, 2015",California,SFV,2015,FY 2016,,


In [25]:
# Remove the two remaining columns that are not wanted in the output.
df3 = df3.drop(columns=['Description', 'Completions'])

In [26]:
# row counts: combined frame vs. quarterly-only frame
df3.shape[0], df.shape[0]

(2639, 2288)

In [27]:
# Write the combined frame to disk. to_csv is called with its defaults, so the index is written as the first column.
df3.to_csv('data/first_pass.csv')