Skip to content
This repository has been archived by the owner on Jun 14, 2019. It is now read-only.

Commit

Permalink
Merge pull request #1 from davewalk/master
Browse files Browse the repository at this point in the history
Adding scripts for creating a Sqlite database of the data
  • Loading branch information
fulldecent committed Apr 27, 2015
2 parents 5e8ed7f + 1d883cc commit 7c5ab9e
Show file tree
Hide file tree
Showing 3 changed files with 167 additions and 0 deletions.
66 changes: 66 additions & 0 deletions sqlite/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
This directory scripts for inserting the traffic violations data into a Sqlite database for SQL querying. The actual file database is over the GitHub file size limit of 100 MB, but can be downloaded [here](http://s3.amazonaws.com/dwalk/phila_open_data/phila_traffic.db). The MD5 sum of the database is `fb1e6ec24107b8ed6b0c17e5c814d9c7`.

### Requirements
* [Sqlite](https://sqlite.org)
* If running the insert script, you'll need [Python](http://python.org)

### Instructions
Load the database in Sqlite with:
`sqlite3 philly_traffic.db`

To populate another database besides the one included here:

1. From the command line: `sqlite3 [your database name]`
2. At the Sqlite prompt: `.read create_table.sql`
3. Quit sqlite: `.exit`
4. Run the Python insert script: `python insert_data.py`. Change the `DATABASE_NAME`, if necessary, at the top of that file.

### Verification

#### Schema
```sqlite> .schema violations```

| column | datetype | constraints |
|----------------------|-------------|-------------|
| citation_id | TEXT | PRIMARY KEY |
| filed_date | DATE | |
| issue_date | DATE | NOT NULL |
| violation_code | TEXT | NOT NULL |
| violation_title | TEXT | NOT NULL |
| violation_location | TEXT | NOT NULL |
| agency | TEXT | NOT NULL |
| defendent_lastname | TEXT | NOT NULL |
| defendent_firstname | TEXT | NOT NULL |
| defendent_city | TEXT | NOT NULL |
| defendent_state | VARCHAR(2) | NOT NULL |
| defendent_zip | INTEGER(5) | NOT NULL |
| defendent_dob | DATE | |
| gender | VARCHAR(1) | NOT NULL |
| amount_due | INTEGER(3) | NOT NULL |
| amount_paid | INTEGER(3) | NOT NULL |
| closing_disposition | TEXT | NOT NULL |
| disposition_date | DATE | |
| judge | TEXT | NOT NULL |
| owner_lastname | TEXT | NOT NULL |
| owner_firstname | TEXT | NOT NULL |
| owner_city | TEXT | NOT NULL |
| owner_state | VARCHAR(2) | NOT NULL |
| owner_zip | INTEGER(5) | NOT NULL |
| hearing_datetime | DATETIME | |
| defendent_zip | INTEGER(5) | |

Record counts by year

`sqlite> select strftime("%Y", issue_date) as year, count(*) as citations from violations GROUP BY year;`

| Year | Citations |
|------|-----------|
| 2006 | 116227 |
| 2007 | 106843 |
| 2008 | 114647 |
| 2009 | 98056 |
| 2010 | 84087 |
| 2011 | 108098 |
| 2012 | 166805 |
| 2013 | 155208 |
| 2014 | 74332 |
29 changes: 29 additions & 0 deletions sqlite/create_table.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
DROP TABLE IF EXISTS violations;

CREATE TABLE violations (
citation_id TEXT PRIMARY KEY,
filed_date DATE,
issue_date DATE NOT NULL,
violation_code TEXT NOT NULL,
violation_title TEXT NOT NULL,
violation_location TEXT NOT NULL,
agency TEXT NOT NULL,
defendent_lastname TEXT NOT NULL,
defendent_firstname TEXT NOT NULL,
defendent_city TEXT NOT NULL,
defendent_state VARCHAR(2) NOT NULL,
defendent_zip INTEGER(5) NOT NULL,
defendent_dob DATE,
gender VARCHAR(1) NOT NULL,
amount_due INTEGER(3) NOT NULL,
amount_paid INTEGER(3) NOT NULL,
closing_disposition TEXT NOT NULL,
disposition_date DATE,
judge TEXT NOT NULL,
owner_lastname TEXT NOT NULL,
owner_firstname TEXT NOT NULL,
owner_city TEXT NOT NULL,
owner_state VARCHAR(2) NOT NULL,
owner_zip INTEGER(5) NOT NULL,
hearing_datetime DATETIME
);
72 changes: 72 additions & 0 deletions sqlite/insert_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import os
import csv
import glob
import time
import sqlite3
from datetime import datetime

DATABASE_NAME = 'phila_traffic2.db'
DATE_FORMAT = '%m/%d/%Y'
DATETIME_FORMAT = '%m/%d/%Y %I:%M%p'

def convert_date(date_str):
try:
result = datetime.strptime(date_str, DATE_FORMAT)
return result
except:
return None

# I believe that this will save the times according to your machine's
# timezone, so if you aren't in EST you'll have to adjust
def convert_datetime(datetime_str):
try:
result = datetime.strptime(datetime_str, DATETIME_FORMAT)
return result
except:
return None

files = glob.glob('../citations*.tsv')

for file in files:
filename = os.path.basename(file)
if filename != 'citationsHeaders.tsv':
with open(file, 'rb') as in_file:
print 'Inserting {}...'.format(filename)
conn = sqlite3.connect(DATABASE_NAME)
conn.text_factory = str
cur = conn.cursor()
violations = csv.reader(in_file, delimiter='\t')
next(violations, None) # skip the header

i = 0

for v in violations:
filed_date = convert_date(v[1])
issue_date = convert_date(v[2])
defendent_dob = convert_date(v[12])
disposition_date = convert_date(v[17])
h_dt = ''
if len(v[25].strip()) == 0:
h_dt = convert_date(v[24])
else:
h_t = v[25].strip()
h_t = h_t.replace('P', 'PM')
h_t = h_t.replace('A', 'AM')
h_dt = v[24] + ' ' + h_t
h_dt = convert_datetime(h_dt)
try:
cur.execute('INSERT INTO violations values (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)', \
(v[0], filed_date, issue_date, v[3], v[4], v[5], \
v[6], v[7], v[8], v[9], v[10], v[11], defendent_dob, \
v[13], v[14], v[15], v[16], disposition_date, v[18], \
v[19], v[20], v[21], v[22], v[23], h_dt))
except sqlite3.IntegrityError as err:
print err
pass

i = i + 1
if i % 25000 == 0:
print '{} records inserted so far...'.format(i)
conn.commit()
conn.close()
print 'Done inserting {}'.format(filename)

0 comments on commit 7c5ab9e

Please sign in to comment.