# Pull trace data from ATF site

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import pdfplumber
import numpy as np
import os

In [21]:
year = "2019"

In [22]:
states = ["Alabama","Alaska","Arizona","Arkansas","California","Colorado","Connecticut","Delaware",
          "District of Columbia","Florida","Georgia","Hawaii","Idaho","Illinois","Indiana","Iowa",
          "Kansas","Kentucky","Louisiana","Maine","Maryland","Massachusetts","Michigan","Minnesota",
          "Mississippi","Missouri","Montana","Nebraska","Nevada","New Hampshire","New Jersey",
          "New Mexico","New York","North Carolina","North Dakota","Ohio","Oklahoma","Oregon",
          "Pennsylvania","Rhode Island","South Carolina","South Dakota","Tennessee","Texas","Utah",
          "Vermont","Virginia","Washington","West Virginia","Wisconsin","Wyoming"]

In [23]:
cities = ["Little Rock AR","Los Angeles CA","San Francisco CA","Stockton CA","Washington DC","Chicago IL",
          "Baltimore MD","Detroit MI","Kansas City MO","Springfield MO","St Louis MO","Albuquerque NM",
          "New York City NY","Cleveland OH","Memphis TN"]

## 2021 and 2020

### Pull data from the text

In [24]:
general_numbers = pd.DataFrame()

In [25]:
for state in states:
    state_url = state.lower()
    state_url = state_url.replace(" ","-")
    state_url = state_url.replace("-of-","-")
    url = "https://www.atf.gov/resource-center/firearms-trace-data-"+state_url+"-"+year
    html = requests.get(url)
    html_content = html.content
    soup = BeautifulSoup(html_content)
    try:
        all_traces = soup.find(id="types").parent.contents[0]
        source_traces = soup.find(id="time-to-crime").parent.contents[1]
        national_ttc = soup.find(id="age").parent.contents[1].get_text()
        state_ttc = soup.find(id="age").parent.previous_sibling.previous_sibling.contents[1].get_text()
        national_avg_age = soup.find(id="recovery-cities").parent.contents[1].get_text()
        state_avg_age = soup.find(id="recovery-cities").parent.previous_sibling.previous_sibling.contents[1].get_text()
        d = {'state': [state],
             'year': [year],
             'all_traces': [all_traces], 
             'source_traces': [source_traces],
             'national_ttc': [national_ttc],
             'state_ttc': [state_ttc],
             'national_avg_age': [national_avg_age],
             'state_avg_age': [state_avg_age]}
        row = pd.DataFrame(data=d)
        general_numbers = pd.concat([general_numbers,row])
    except:
        d = {'state': [state],
             'year': [year],
             'all_traces': "", 
             'source_traces': "",
             'national_ttc': "",
             'state_ttc': "",
             'national_avg_age': "",
             'state_avg_age': ""}
        row = pd.DataFrame(data=d)
        general_numbers = pd.concat([general_numbers,row])

In [139]:
# # If the page is down
# url = "https://web.archive.org/web/20211127170156/https://www.atf.gov/resource-center/firearms-trace-data-pennsylvania-2020"
# html = requests.get(url)
# html_content = html.content
# soup = BeautifulSoup(html_content)
# all_traces = soup.find(id="types").parent.contents[0]
# source_traces = soup.find(id="time-to-crime").parent.contents[1]
# national_ttc = soup.find(id="age").parent.contents[1].get_text()
# state_ttc = soup.find(id="age").parent.previous_sibling.previous_sibling.contents[1].get_text()
# national_avg_age = soup.find(id="recovery-cities").parent.contents[1].get_text()
# state_avg_age = soup.find(id="recovery-cities").parent.previous_sibling.previous_sibling.contents[1].get_text()
# d = {'state': 'Pennsylvania',
#      'year': [year],
#      'all_traces': [all_traces], 
#      'source_traces': [source_traces],
#      'national_ttc': [national_ttc],
#      'state_ttc': [state_ttc],
#      'national_avg_age': [national_avg_age],
#      'state_avg_age': [state_avg_age]}
# row = pd.DataFrame(data=d)
# general_numbers = general_numbers.loc[general_numbers["state"] != "Pennsylvania"]
# general_numbers = pd.concat([general_numbers,row])

In [26]:
general_numbers = general_numbers.reset_index(drop=True)

In [27]:
general_numbers["national_ttc"] = general_numbers["national_ttc"].str.replace("Years","")
general_numbers["state_ttc"] = general_numbers["state_ttc"].str.replace("Years","")
general_numbers["national_avg_age"] = general_numbers["national_avg_age"].str.replace("Years","")
general_numbers["state_avg_age"] = general_numbers["state_avg_age"].str.replace("Years","")

In [28]:
general_numbers["source_traces_clean"] = general_numbers["source_traces"].str.split("identified in")
general_numbers["source_traces_clean"] = general_numbers["source_traces_clean"].str[-1]
general_numbers["source_traces_clean"] = general_numbers["source_traces_clean"].str.replace("total traces.","")

In [29]:
general_numbers.drop(columns=["source_traces"], inplace=True)

In [30]:
general_numbers.rename(columns={"source_traces_clean":"source_traces"}, inplace=True)

In [15]:
general_numbers.to_csv("data/raw/atf-trace-data/general-numbers-"+year+".csv", index=False)

### Pull data from the tables

In [32]:
firearm_types = pd.DataFrame()
calibers = pd.DataFrame()
categories = pd.DataFrame()
source_states = pd.DataFrame()
time_to_crime = pd.DataFrame()
possessor_age = pd.DataFrame()
recovery_cities = pd.DataFrame()

In [33]:
for state in states:
    state_url = state.lower()
    state_url = state_url.replace(" ","-")
    state_url = state_url.replace("-of-","-")
    url = "https://www.atf.gov/resource-center/firearms-trace-data-"+state_url+"-"+year
    try:
        tables = pd.read_html(url)
        for x in range(0,7):
            tables[x]["state"] = state
            tables[x]["year"] = year
        firearm_types = pd.concat([firearm_types, tables[0]])
        calibers = pd.concat([calibers, tables[1]])
        categories = pd.concat([categories, tables[2]])
        source_states = pd.concat([source_states, tables[3]])
        time_to_crime = pd.concat([time_to_crime, tables[4]])
        possessor_age = pd.concat([possessor_age, tables[5]])
        recovery_cities = pd.concat([recovery_cities, tables[6]])
    except:
        pass

In [171]:
# # If the page is down
# url = "https://web.archive.org/web/20211127170156/https://www.atf.gov/resource-center/firearms-trace-data-pennsylvania-2020"
# tables = pd.read_html(url)
# for x in range(0,7):
#     tables[x]["state"] = "Pennsylvania"
#     tables[x]["year"] = year
# firearm_types = pd.concat([firearm_types, tables[1]])
# calibers = pd.concat([calibers, tables[2]])
# categories = pd.concat([categories, tables[3]])
# source_states = pd.concat([source_states, tables[4]])
# time_to_crime = pd.concat([time_to_crime, tables[5]])
# possessor_age = pd.concat([possessor_age, tables[6]])
# recovery_cities = pd.concat([recovery_cities, tables[7]])

In [34]:
firearm_types

In [19]:
firearm_types.rename(columns={0:"firearm_type",1:"count"}, inplace=True)
calibers.rename(columns={0:"caliber",1:"count"}, inplace=True)
categories.rename(columns={0:"category",1:"count"}, inplace=True)
source_states.rename(columns={0:"source_state",1:"count"}, inplace=True)
time_to_crime.rename(columns={0:"time_to_crime",1:"count"}, inplace=True)
possessor_age.rename(columns={0:"possessor_age",1:"count"}, inplace=True)
recovery_cities.rename(columns={0:"recovery_city",1:"count"}, inplace=True)

In [20]:
firearm_types.to_csv("data/raw/atf-trace-data/firearm-types-"+year+".csv", index=False)
calibers.to_csv("data/raw/atf-trace-data/calibers-"+year+".csv", index=False)
categories.to_csv("data/raw/atf-trace-data/categories-"+year+".csv", index=False)
source_states.to_csv("data/raw/atf-trace-data/source-states-"+year+".csv", index=False)
time_to_crime.to_csv("data/raw/atf-trace-data/time-to-crime-"+year+".csv", index=False)
possessor_age.to_csv("data/raw/atf-trace-data/possessor-age-"+year+".csv", index=False)
recovery_cities.to_csv("data/raw/atf-trace-data/recovery-cities-"+year+".csv", index=False)

## Older years: 2019, 2018, 2017, 2016, 2015, 2014
Note: 2019 also has data for some cities, including Los Angeles

In [68]:
year = "2017"

### Download the pdfs

In [348]:
pdf_links = []

In [349]:
# Get all pdf links
r  = requests.get("https://www.atf.gov/resource-center/firearms-trace-data-"+year)
data = r.text
soup = BeautifulSoup(data)

for link in soup.find_all('a'):
    url = link.get('href')
    if url != None:
        if ('download' in url) & ('https://www.atf.gov' in url) & ('xlsx' not in url):
            pdf_links.append(url)

In [350]:
for link in pdf_links:
    url = link
    url_name = url.split('/')[-2]
    response = requests.get(url)
    with open('./data/raw/trace-data/'+year+'/'+url_name+'.pdf', 'wb') as f:
        f.write(response.content)

### Scrape the 2019, 2018, 2017 data

In [69]:
directory = os.listdir("data/raw/atf-trace-data/2017/")

In [70]:
year

'2017'

In [71]:
source_states = pd.DataFrame()
ttc_breakdown = pd.DataFrame()
general = pd.DataFrame()

for file in directory:
    try:
        pdf = pdfplumber.open("data/raw/atf-trace-data/2017/"+file)
        # Get the state
        p1 = pdf.pages[0]
        state = p1.extract_text_simple().split()[1]
        if (state == "New") | (state == "North") | (state == "South") | (state == "West") | (state == "District") | (state == "Rhode"):
            state = p1.extract_text_simple().split()[1:3]
            state = [' '.join(state)][0]
        # print(state)
        # Get the total traces
        p3 = pdf.pages[2]
        total_traces = p3.extract_text_simple().split()[-2]
        # print(state)
        # # Get the top 15 source states
        # p7 = pdf.pages[6]
        # table = p7.extract_table()
        # table_list = list(np.concatenate(table).flat)
        # source_states_list = []
        # counts_list = []
        # for s in table_list:
        #     if s != None:
        #         # State
        #         n = re.findall("[a-zA-Z]*", s)
        #         n = [' '.join(n)]
        #         n = n[0].strip()
        #         source_states_list.append(n)
        #         # Count
        #         c = re.findall("[0-9]",s)
        #         c = [''.join(c)]
        #         c = c[0]
        #         counts_list.append(c)
        # source_states_list = list(filter(None, source_states_list))
        # counts_list = list(filter(None, counts_list))
        # d1 = {'source_state': source_states_list, 'count': counts_list}
        # source_states_row = pd.DataFrame(data=d1)
        # source_states_row["state"] = state
        # source_states_row["year"] = year
        # source_states = pd.concat([source_states, source_states_row])
        print(state)
        # Get source traces
        source_traces = p7.extract_text_lines()[-2].get('text')
        source_traces = source_traces.split("identified in")[-1].replace("total traces.","").strip()
        # Get the time-to-crime
        p8 = pdf.pages[7]
        national_ttc = p8.extract_text_lines()[-2].get('text').split()[-2]
        state_ttc = p8.extract_text_lines()[-3].get('text').split()[-2]
        # Get the time-to-crime breakdown
        ttc_table = p8.extract_table()
        ttc_df = pd.DataFrame(data = {'time_to_crime':ttc_table[0], 'count':ttc_table[1]})
        ttc_df["state"] = state
        ttc_df["year"] = year
        ttc_breakdown = pd.concat([ttc_breakdown, ttc_df])
        # Get the average age of possessor
        p9 = pdf.pages[8]
        national_age = p9.extract_text_lines()[-2].get('text').split()[-2]
        state_age = p9.extract_text_lines()[-3].get('text').split()[-2]
        d2 = {
            "state":[state],
            "year":[year],
            "all_traces":[total_traces],
            "source_traces":[source_traces],
            "national_ttc":[national_ttc],
            "state_ttc":[state_ttc],
            "national_avg_age":[national_age],
            "state_avg_age":[state_age],
        }
        general_row = pd.DataFrame(data=d2)
        general = pd.concat([general, general_row])
    except:
        print(file)
        pass

# source_states.to_csv("data/processed/atf-trace-data/source-states-"+year+".csv", index=False)
ttc_breakdown.to_csv("data/processed/atf-trace-data/time-to-crime-"+year+".csv", index=False)
# general.to_csv("data/processed/atf-trace-data/general-numbers-"+year+".csv", index=False)

130326.pdf
number-firearms-sourced-and-recovered-united-states-and-territories-2017.pdf
Alabama
Delaware
Idaho
Florida
South Carolina
South Dakota
130331.pdf
Alaska
North Carolina
Montana
Mississippi
Wyoming
District of
Kentucky
Iowa
Louisiana
North Dakota
130321.pdf
Rhode Island
Guam
Hawaii
130336.pdf
130351.pdf
Oregon
Michigan
Kansas
New Hampshire
Colorado
Puerto
Oklahoma
Massachusetts
Utah
Pennsylvania
Ohio
Minnesota
Virginia
Vermont
New York
Arkansas
Illinois
Nevada
U.S.
Connecticut
Washington
Missouri
Nebraska
New Mexico
Tennessee
Texas
West Virginia
Indiana
New Jersey
Maryland
Wisconsin
Georgia
Maine
California
Arizona


In [35]:
pdf = pdfplumber.open("data/raw/atf-trace-data/2019/146946.pdf")

In [36]:
p8 = pdf.pages[7]

In [37]:
table = p8.extract_table()

In [44]:
pd.DataFrame(data = {'time_to_crime':table[0], 'count':table[1]})

Unnamed: 0,time_to_crime,count
0,Under 3 Months,52
1,3 Months to\nUnder 7 Months,45
2,7 Months to\nUnder 1 Year,50
3,1 Year to\nUnder 2 years,103
4,2 Years to\nUnder 3 Years,85
5,3 Years and Over,544
