In [1]:
import os
import math
import pandas as pd
import requests
from dateutil import parser

In [2]:
import sys
# Make the parent directory importable. Presumably the `backend` package lives
# one level above this notebook's working directory — verify before moving it.
sys.path.append("..")

from backend.api import create_app, db
from backend.database import UseOfForce, Incident, Officer

# Application object built with the "development" configuration name. The
# database cells further down push its app context before touching db.session.
app = create_app("development")

In [3]:
def isnan(x):
    """Return True only when ``x`` is a float holding NaN; any non-float is not NaN."""
    if not isinstance(x, float):
        return False
    return math.isnan(x)


def nan_to_none(x):
    """Turn a float NaN into ``None`` and hand every other value back untouched."""
    if isnan(x):
        return None
    return x


def parse_date(s):
    """Parse ``s`` with ``parser.parse`` when it is a string; return ``None`` otherwise."""
    if not isinstance(s, str):
        return None
    return parser.parse(s)

In [4]:
# Read the full database produced by flask scrape into `data`. The combined
# frame is pickled the first time it is built so later runs can skip the Excel
# parse and load the pickle instead.

scraper_output = '../excel_outputs'
spreadsheet = os.path.join(scraper_output, 'full_database.xlsx')
pkl_cache = os.path.join(scraper_output, 'full_database.pkl.zip')

if not os.path.exists(pkl_cache):
    # sheet_name=None loads every sheet of the workbook, keyed by sheet name;
    # the sheets are then placed side by side (axis=1) in one frame.
    excel_sheets = pd.read_excel(spreadsheet, sheet_name=None)
    data = pd.concat(excel_sheets.values(), axis=1)
    data.to_pickle(pkl_cache)
else:
    data = pd.read_pickle(pkl_cache)

In [5]:
# Peek at a single record (the row at integer position 2) to see which fields are available.
data.iloc[2]

columns Index(['Unnamed: 0', 'record_type', 'attachments_available', 'record_id',
       'data_source_id', 'Unnamed: 0', 'description', 'death_location_city',
       'death_location_county', 'death_location_state',
       'death_location_street_address', 'department_present', 'perpetrator',
       'incident_date', 'incident_time', 'record_id', 'Unnamed: 0',
       'victim_age', 'victim_name_full', 'victim_race', 'victim_sex',
       'Unnamed: 0', 'death_manner'],
      dtype='object')


Unnamed: 0                                                                      22
record_type                                                           news article
attachments_available                                                          NaN
record_id                                                                      NaN
data_source_id                                                                 mpv
Unnamed: 0                                                                      22
description                      Police responded to reports of a disturbance a...
death_location_city                                                     Huntsville
death_location_county                                                       Walker
death_location_state                                                            TX
death_location_street_address                                         5300 FM 1374
department_present                                  Walker County Sheriff's Office
perp

In [6]:
def orm_location(row):
    """Build one free-text location string from a row's address columns.

    Args:
        row: Any object exposing ``death_location_street_address``,
            ``death_location_city``, ``death_location_county`` and
            ``death_location_state`` attributes (for example a row yielded by
            ``DataFrame.itertuples``).

    Returns:
        The usable parts joined by single spaces in street / city / county /
        state order, or ``None`` when no part is usable.
    """
    parts = []
    for value in (
        row.death_location_street_address,
        row.death_location_city,
        row.death_location_county,
        row.death_location_state,
    ):
        # Falsy cells (None, "", 0) are skipped. NaN must be tested explicitly
        # because a NaN float is truthy and would otherwise reach the join.
        if not value or (isinstance(value, float) and math.isnan(value)):
            continue
        # str.join only accepts strings, but a spreadsheet cell may come back
        # as a number (e.g. a purely numeric street address).
        parts.append(value if isinstance(value, str) else str(value))

    return " ".join(parts) if parts else None


def row_to_orm(row):
    """Convert one ``itertuples`` row of the scraped frame into an ``Incident``.

    NaN cells are first normalised to ``None``. Only attributes that have a
    value are passed to the ``Incident`` constructor; the rest are left out
    instead of being set to ``None``.
    """
    row = row._make(nan_to_none(cell) for cell in row)

    officer_name = row.perpetrator
    department = row.department_present
    date_part = parse_date(row.incident_date)
    time_part = parse_date(row.incident_time)
    source = row.data_source_id
    force_used = row.death_manner
    location = orm_location(row)

    incident_kwargs = {}

    if location:
        incident_kwargs["location"] = location

    # Start from the parsed date when there is one, otherwise from the parsed
    # time, then overlay the time-of-day fields whenever a time was given.
    # NOTE(review): for a row with a time but no date, the date portion is
    # whatever the parser filled in on its own — confirm that is acceptable.
    moment = date_part or time_part
    if moment:
        if time_part:
            moment = moment.replace(
                hour=time_part.hour,
                minute=time_part.minute,
                second=time_part.second,
            )
        incident_kwargs["time_of_incident"] = moment.isoformat()

    if officer_name or department:
        # The department goes into first_name and the perpetrator into
        # last_name; whichever is missing is recorded as "Unknown".
        incident_kwargs["officers"] = [
            Officer(
                first_name=department or "Unknown",
                last_name=officer_name or "Unknown",
            )
        ]

    if row.description:
        incident_kwargs["description"] = row.description
    if force_used:
        incident_kwargs["use_of_force"] = [UseOfForce(item=force_used)]
    if source:
        incident_kwargs["source"] = source

    return Incident(**incident_kwargs)


def create_bulk(incidents, chunk_size=1000):
    """Add ``incidents`` to ``db.session`` inside an application context.

    The list is added ``chunk_size`` objects at a time with a flush after each
    batch; one commit is issued after the last batch has been flushed.
    """
    with app.app_context():
        session = db.session
        for start in range(0, len(incidents), chunk_size):
            batch = incidents[start : start + chunk_size]
            session.add_all(batch)
            session.flush()
        session.commit()


def get_rows_to_create():
    """Apply ``row_to_orm`` to every row of the module-level ``data`` frame and return the results as a list."""
    rows = data.itertuples(index=False)
    return list(map(row_to_orm, rows))


In [7]:
# Build one Incident object per row of `data`; nothing is written to the database in this cell.
orm_incidents = get_rows_to_create()

In [None]:
# Insert the prepared incidents, then print how many Incident rows the
# database reports as a quick sanity check.
with app.app_context():
    create_bulk(orm_incidents)
    # The count query goes through db.session, so it is issued inside the app context.
    print(db.session.query(Incident).count())

In [39]:
origin = "http://localhost:5000"


def route(path):
    """Return the full URL of an ``api/v1`` endpoint on ``origin``.

    URLs always use forward slashes, so the pieces are joined with "/" rather
    than ``os.path.join``, which uses the platform's path separator and drops
    ``origin`` entirely when ``path`` begins with a slash.

    Args:
        path: Endpoint path relative to ``api/v1``, e.g. ``"auth/login"``. A
            leading slash is accepted.

    Returns:
        The absolute URL as a string.
    """
    return "/".join([origin.rstrip("/"), "api/v1", path.lstrip("/")])


def authenticate(email="test@example.com", password="password", timeout=30):
    """Log in through the API and return the access token.

    Args:
        email: Account email. Defaults to the credentials this notebook has
            always logged in with, so existing no-argument calls are unchanged.
        password: Account password, with the same kind of default.
        timeout: Seconds to wait for the server before ``requests`` gives up,
            so a stalled server cannot hang the cell indefinitely.

    Returns:
        The ``"access_token"`` value from the JSON response body.

    Raises:
        AssertionError: If the login endpoint does not answer with HTTP 200.
    """
    res = requests.post(
        route("auth/login"),
        json={"email": email, "password": password},
        timeout=timeout,
    )
    # Include the status and body so a failed login is diagnosable from the traceback.
    assert res.status_code == 200, f"login failed: HTTP {res.status_code}: {res.text}"
    return res.json()["access_token"]


def search(query, token=None, timeout=30):
    """Run an incident search against the API and return the matching records.

    Args:
        query: JSON-serialisable search criteria, e.g.
            ``{"description": "shot", "location": "tx"}``.
        token: Access token to send. When omitted, a fresh one is obtained via
            ``authenticate()``; pass one in to avoid logging in again for
            every search.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        The ``"results"`` value from the JSON response body.

    Raises:
        AssertionError: If the search endpoint does not answer with HTTP 200.
    """
    if token is None:
        token = authenticate()
    # The criteria travel as a JSON body on a GET request, exactly as before.
    # NOTE(review): confirm the endpoint really expects a body rather than
    # query-string parameters.
    res = requests.get(
        route("incidents/search"),
        json=query,
        headers={"Authorization": f"Bearer {token}"},
        timeout=timeout,
    )
    # Include the status and body so a failed search is diagnosable from the traceback.
    assert res.status_code == 200, f"search failed: HTTP {res.status_code}: {res.text}"
    return res.json()['results']


In [43]:
# Example query that supplies both a "description" and a "location" criterion.
search({"description": "shot", "location": "tx"})

[{'description': 'Police responded to reports of a suicidal individual who was reportedly armed. Upon arrival, police attempted to contact the individual inside. Police claim the individual eventually exited the home pointing a shotgun at officers. Police shto and killed the individual.',
  'id': 1,
  'location': 'McKinney Collin TX',
  'officers': [{'first_name': 'McKinney Police Department',
    'id': 1,
    'incident_id': 1,
    'last_name': 'Unknown'}],
  'source': 'mpv',
  'use_of_force': [{'id': 1, 'incident_id': 1, 'item': 'Gunshot'}]},
 {'description': 'Police responded to reports of a disturbance and a person with a weapon. Upon arrival, police spoke to the 911 caller and determined a supposedly armed individual was within the home. Police claim the individual shot at deputies from a window. Deputies returned fire, killing the person.',
  'id': 3,
  'location': '5300 FM 1374 Huntsville Walker TX',
  'officers': [{'first_name': "Walker County Sheriff's Office",
    'id': 3,
   