In [None]:
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd

In [None]:
def get_soup(city, timeout=30):
    """Fetch and parse the two daily-archive listing pages for a city.

    city:    URL path prefix *including* its trailing slash (e.g. 'mi/detroit/');
             'daily' and 'daily/more' are concatenated onto it directly.
    timeout: seconds handed to ``requests.get`` for each request. Without a
             timeout a stalled server would block the notebook indefinitely.

    Returns a 2-tuple of BeautifulSoup objects: the '<city>daily' page first,
    then the '<city>daily/more' page.

    Raises requests.exceptions.RequestException (e.g. Timeout, ConnectionError)
    if either request fails. HTTP status codes are not checked, so an error
    page is parsed like any other response.
    """
    daily_url = 'https://spotcrime.com/' + city + 'daily'
    soups = []
    for url in (daily_url, daily_url + '/more'):
        response = requests.get(url, timeout=timeout)
        soups.append(BeautifulSoup(response.text, 'html.parser'))

    return tuple(soups)

In [None]:
def get_links(soups):
    """Collect the link targets from every parsed listing page into one list.

    soups: iterable of BeautifulSoup-like objects (typically the pair returned
           by get_soup). Any number of pages is accepted, including none.

    For each page, every <a> found inside an <ol class="list-unstyled"> element
    contributes its ``href`` value. Anchors that carry no ``href`` attribute are
    skipped instead of raising KeyError.

    Returns a list of href strings, in page order and then document order.
    """
    links = []
    for soup in soups:
        for dates in soup.find_all('ol', class_='list-unstyled'):
            for anchor in dates.find_all('a'):
                href = anchor.get('href')
                if href is None:
                    # An <a> without an href has nothing to follow.
                    continue
                links.append(href)
    return links

In [None]:
def crime_df(links, base_url, timeout=30):
    """Download every linked page and gather the rows of its first table.

    links:    iterable of URL suffixes; each is appended to ``base_url``.
    base_url: prefix prepended to every link to form the request URL.
    timeout:  seconds handed to ``requests.get``. Without a timeout a stalled
              server would block forever and the retry below could never fire.

    Progress (index and link) is printed for each page. A page whose request
    fails is retried once after a 10 second pause; a second failure propagates
    as requests.exceptions.RequestException. Pages without a <table> are
    skipped with a message.

    Returns a DataFrame with the columns
    ['A', 'Crime', 'Time', 'Address', 'Details'].
    """
    data = []
    for i, link in enumerate(links):
        print(i, link)
        url = base_url + link
        try:
            r = requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException:
            # Only network-level failures are retried; KeyboardInterrupt and
            # programming errors are no longer swallowed by a bare except.
            print('uh oh, timeout')
            time.sleep(10)
            r = requests.get(url, timeout=timeout)

        soup = BeautifulSoup(r.text, 'html.parser')
        table = soup.find('table')
        if table is None:
            print('no table, skipping')
            continue

        for row in table.find_all('tr'):
            cell_texts = [cell.text.strip() for cell in row.find_all('td')]
            # Empty cells are dropped, so a row may end up with fewer fields.
            crime = [text for text in cell_texts if text]
            if not crime:
                # Rows with no non-empty <td> (e.g. header rows) carry no data.
                continue
            if len(crime) == 4:
                # Presumably the leading column was empty on this row; pad it
                # with a placeholder so the other fields line up with the
                # column names below.
                crime = ['A'] + crime
            # NOTE(review): rows with any other field count are passed through
            # unchanged; confirm how pandas treats rows that are not 5 long.
            data.append(crime)
    return pd.DataFrame(data, columns=['A', 'Crime', 'Time', 'Address', 'Details'])

In [None]:
# Site root; crime_df prepends it to every link it downloads.
base_url = 'https://spotcrime.com'

# Path prefixes of the form '<state>/<city>/'. The trailing slash matters:
# get_soup concatenates 'daily' straight onto each entry.
cities = [
    '{}/{}/'.format(state, city_name)
    for state, city_name in (
        ('mi', 'detroit'),
        ('mo', 'st.+louis'),
        ('md', 'baltimore'),
        ('oh', 'toledo'),
        ('ga', 'albany'),
        ('mi', 'flint'),
        ('tn', 'memphis'),
        ('pa', 'philadelphia'),
    )
]

In [None]:
# Download each city's crime information and save it to a gzipped CSV file,
# keeping only the 'Crime' and 'Time' columns.
for city in cities:
    # Entries look like 'mi/detroit/': two-letter state, slash, city, slash.
    state_code = city[:2]
    city_name = city[3:-1]
    out_path = './crime_{}_{}.csv.gz'.format(city_name, state_code)

    soup1, soup2 = get_soup(city)
    links = get_links((soup1, soup2))

    df = crime_df(links, base_url).drop(columns=['A', 'Address', 'Details'])
    df.to_csv(out_path, compression='gzip', index=False)