In [1]:
import json
from datetime import datetime, timedelta
from urllib.parse import parse_qs, urlparse, urlsplit
from urllib.request import Request, urlopen

import pandas as pd
import selenium
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
    TimeoutException,
)
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
# Site root; relative incident links are joined onto it.
DOMAIN_URL = "https://www.gunviolencearchive.org"

# Search form page, expressed relative to the site root.
QUERY_URL = f"{DOMAIN_URL}/query"

# Headers sent with the plain urllib requests.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36',
}

# strptime/strftime pattern used for every date in this notebook.
FORMAT = "%m/%d/%Y"

# Character used to attach the page parameter to a results URL.
QUERY_SEPERATOR = "?"

In [3]:
def get_query_url(driver, start_date, end_date):
    """Submit a date-filtered search through the query form.

    Loads ``QUERY_URL``, adds a ``Date`` rule, writes the two date values
    into the from/to inputs and clicks the search button.

    Args:
        driver: Selenium WebDriver used to operate the page.
        start_date: String written into the "date from" input.
        end_date: String written into the "date to" input.

    Returns:
        tuple: ``(url, n_pages)`` where ``url`` is the browser's current URL
        after the search was submitted and ``n_pages`` is the value returned
        by ``get_n_pages`` for that page.

    Raises:
        selenium.common.exceptions.TimeoutException: if the ``Date`` entry or
            one of the date inputs does not become clickable within 50 s.
    """
    wait = WebDriverWait(driver, 50)
    driver.get(QUERY_URL)

    # click add a rule
    add_rule_button = driver.find_element(By.CSS_SELECTOR, '.filter-dropdown-trigger')
    add_rule_button.click()

    # click date
    # Pass a locator (not an already-resolved element) so the wait itself
    # performs the lookup and keeps retrying until the dropdown entry exists.
    date_filter = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, 'Date')))
    date_filter.click()

    # fill data fields
    date_from = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[id*='filter-field-date-from']")))
    date_to = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[id*='filter-field-date-to']")))
    # Hand the dates over as script arguments instead of splicing them into
    # the JavaScript source, so quotes in a value cannot break the script.
    script = (
        "arguments[0].setAttribute('value', arguments[2]);"
        "arguments[1].setAttribute('value', arguments[3]);"
    )
    driver.execute_script(script, date_from, date_to, start_date, end_date)

    # click search
    search_button = driver.find_element(By.ID, 'edit-actions-execute')
    search_button.click()

    # Count pages first: get_n_pages waits on the results page, so the URL
    # is read only after that wait has finished.
    n_pages = get_n_pages(driver)
    return driver.current_url, n_pages

In [4]:
def get_n_pages(driver):
    """Return the number of result pages for the page currently loaded.

    Waits up to 20 s for the "Go to last page" pager link and reads the
    ``page`` query parameter from its ``href``. That parameter is treated as
    zero-based, so the page count is its value plus one.

    Args:
        driver: Selenium WebDriver positioned on a results page.

    Returns:
        int: number of pages; ``1`` when no "last page" link shows up.

    Raises:
        KeyError: if the link's ``href`` carries no ``page`` parameter.
    """
    try:
        # get link related to last button
        last_button = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, 'a[title="Go to last page"]'))
        )
    except (NoSuchElementException, TimeoutException):
        # WebDriverWait swallows NoSuchElementException while polling and
        # raises TimeoutException once it gives up, so a missing pager
        # surfaces as a timeout.
        return 1

    last_url = last_button.get_attribute('href')

    # parse page number from page
    form_data = urlparse(last_url).query
    return int(parse_qs(form_data)['page'][0]) + 1

In [5]:
def convert_to_dataframe(table_rows):
    """Turn scraped table rows into a DataFrame of incidents.

    For each row, the text of every ``<td>`` except the last becomes a column
    value; the last cell is expected to hold a link whose ``href`` is joined
    onto ``DOMAIN_URL`` to form ``incident_url``.

    Args:
        table_rows: iterable of row objects exposing ``find_all('td')``
            (e.g. BeautifulSoup ``<tr>`` tags).

    Returns:
        pandas.DataFrame with columns ``id, date, state, city, address,
        n_killed, n_injured, incident_url``. Rows without any ``<td>`` cell
        are skipped; a row whose last cell has no link gets a missing
        ``incident_url``.
    """
    columns = ["id", "date", "state", "city", "address", "n_killed", "n_injured", "incident_url"]

    table = []
    for tr in table_rows:
        # find all elements in row
        cells = tr.find_all('td')
        if not cells:
            # nothing to extract (e.g. a header-only row); indexing the last
            # cell below would otherwise raise IndexError
            continue

        # add all elements but last into list
        *data_cells, link_cell = cells
        row = [cell.text for cell in data_cells]

        # get incident url from last element in row
        link = link_cell.find("a")
        href = link.get('href') if link is not None else None
        row.append(DOMAIN_URL + href if href else None)

        # append to table
        table.append(row)
    return pd.DataFrame(table, columns=columns)

In [6]:
def scrape_query_url(url):
    """Download one results page and return its table as a DataFrame.

    Args:
        url: address of the results page to fetch; requested with ``HEADERS``.

    Returns:
        pandas.DataFrame built by ``convert_to_dataframe`` from the ``<tr>``
        rows of the first ``<tbody>`` on the page.

    Raises:
        IndexError: if the page contains no ``<tbody>`` element.
    """
    # request and open url + create beautifulsoup object for scraping
    req = Request(url, headers=HEADERS)
    # the context manager closes the HTTP response once it has been parsed
    with urlopen(req) as query_page:
        # name the parser explicitly so the result does not depend on which
        # optional parsers are installed
        soup = BeautifulSoup(query_page, "html.parser")

    # find table of page and find all table rows
    bodies = soup.find_all('tbody')
    if not bodies:
        raise IndexError(f"no <tbody> element found in the page at {url}")
    return convert_to_dataframe(bodies[0].find_all("tr"))

In [None]:
# Build one stored query per two-day window of the chosen range and record
# each query's URL and page count in query_pairs.

# location of the chromedriver binary; change this for your machine
CHROMEDRIVER_PATH = "/Users/chasemattingly/chromedriver/chromedriver"

# set driver to chrome
# NOTE(review): executable_path is deprecated in Selenium 4 - confirm the
# installed Selenium version still accepts it.
driver = Chrome(executable_path=CHROMEDRIVER_PATH)

query_pairs = []
step = timedelta(days=1)

# change variables to change scraping range
global_range = ["1/1/2022", "8/1/2022"]

global_start_date = datetime.strptime(global_range[0], FORMAT)
global_end_date = datetime.strptime(global_range[1], FORMAT)

try:
    while global_start_date < global_end_date:
        # each query spans [start, start + 1 day]; the start then advances
        # by two days
        query_url, n_pages = get_query_url(
            driver,
            global_start_date.strftime(FORMAT),
            (global_start_date + step).strftime(FORMAT),
        )
        query_pairs.append((query_url, n_pages))
        global_start_date += (step * 2)
finally:
    # always shut the browser down, even if a query fails part-way
    driver.quit()

In [None]:
# scrape stored queries for global date range
scraped_frames = []
for url, n_pages in query_pairs:
    print(url)
    # a URL that already carries a query string needs '&' rather than a
    # second '?'
    separator = '&' if QUERY_SEPERATOR in url else QUERY_SEPERATOR
    for page in range(n_pages):
        # add page number to url
        scrape_url = f'{url}{separator}page={page}'

        # scrape table and keep the page's dataframe for a single concat
        scraped_frames.append(scrape_query_url(scrape_url))

# DataFrame.append was removed in pandas 2.0; concatenate all pages at once
df = pd.concat(scraped_frames) if scraped_frames else pd.DataFrame()
df.to_csv('gun_violence_2022.csv', index=False)

In [None]:
# combine individual year dataframes
# year_range is [first_year, stop_year): the stop year itself is not read
year_range = [2018, 2022]

# read every per-year CSV, then concatenate once (DataFrame.append was
# removed in pandas 2.0); the loop no longer overwrites the scraped `df`
yearly_frames = [
    pd.read_csv(f"data/gun_violence_{year}.csv")
    for year in range(year_range[0], year_range[1])
]
combined_df = pd.concat(yearly_frames) if yearly_frames else pd.DataFrame()

In [None]:
# Name the export after the years actually combined: the upper bound of
# year_range is exclusive, so the last year included is year_range[1] - 1.
combined_df.to_csv(f'gun_violence_{year_range[0]}-{year_range[1] - 1}.csv', index=False)