This repository has been archived by the owner on Jun 14, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from davewalk/master
Adding scripts for creating a Sqlite database of the data
- Loading branch information
Showing
3 changed files
with
167 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
This directory scripts for inserting the traffic violations data into a Sqlite database for SQL querying. The actual file database is over the GitHub file size limit of 100 MB, but can be downloaded [here](http://s3.amazonaws.com/dwalk/phila_open_data/phila_traffic.db). The MD5 sum of the database is `fb1e6ec24107b8ed6b0c17e5c814d9c7`. | ||
|
||
### Requirements | ||
* [Sqlite](https://sqlite.org) | ||
* If running the insert script, you'll need [Python](http://python.org) | ||
|
||
### Instructions | ||
Load the database in Sqlite with: | ||
`sqlite3 philly_traffic.db` | ||
|
||
To populate another database besides the one included here: | ||
|
||
1. From the command line: `sqlite3 [your database name]` | ||
2. At the Sqlite prompt: `.read create_table.sql` | ||
3. Quit sqlite: `.exit` | ||
4. Run the Python insert script: `python insert_data.py`. Change the `DATABASE_NAME`, if necessary, at the top of that file. | ||
|
||
### Verification | ||
|
||
#### Schema | ||
```sqlite> .schema violations``` | ||
|
||
| column | datetype | constraints | | ||
|----------------------|-------------|-------------| | ||
| citation_id | TEXT | PRIMARY KEY | | ||
| filed_date | DATE | | | ||
| issue_date | DATE | NOT NULL | | ||
| violation_code | TEXT | NOT NULL | | ||
| violation_title | TEXT | NOT NULL | | ||
| violation_location | TEXT | NOT NULL | | ||
| agency | TEXT | NOT NULL | | ||
| defendent_lastname | TEXT | NOT NULL | | ||
| defendent_firstname | TEXT | NOT NULL | | ||
| defendent_city | TEXT | NOT NULL | | ||
| defendent_state | VARCHAR(2) | NOT NULL | | ||
| defendent_zip | INTEGER(5) | NOT NULL | | ||
| defendent_dob | DATE | | | ||
| gender | VARCHAR(1) | NOT NULL | | ||
| amount_due | INTEGER(3) | NOT NULL | | ||
| amount_paid | INTEGER(3) | NOT NULL | | ||
| closing_disposition | TEXT | NOT NULL | | ||
| disposition_date | DATE | | | ||
| judge | TEXT | NOT NULL | | ||
| owner_lastname | TEXT | NOT NULL | | ||
| owner_firstname | TEXT | NOT NULL | | ||
| owner_city | TEXT | NOT NULL | | ||
| owner_state | VARCHAR(2) | NOT NULL | | ||
| owner_zip | INTEGER(5) | NOT NULL | | ||
| hearing_datetime | DATETIME | | | ||
| defendent_zip | INTEGER(5) | | | ||
|
||
Record counts by year | ||
|
||
`sqlite> select strftime("%Y", issue_date) as year, count(*) as citations from violations GROUP BY year;` | ||
|
||
| Year | Citations | | ||
|------|-----------| | ||
| 2006 | 116227 | | ||
| 2007 | 106843 | | ||
| 2008 | 114647 | | ||
| 2009 | 98056 | | ||
| 2010 | 84087 | | ||
| 2011 | 108098 | | ||
| 2012 | 166805 | | ||
| 2013 | 155208 | | ||
| 2014 | 74332 | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
DROP TABLE IF EXISTS violations; | ||
|
||
CREATE TABLE violations ( | ||
citation_id TEXT PRIMARY KEY, | ||
filed_date DATE, | ||
issue_date DATE NOT NULL, | ||
violation_code TEXT NOT NULL, | ||
violation_title TEXT NOT NULL, | ||
violation_location TEXT NOT NULL, | ||
agency TEXT NOT NULL, | ||
defendent_lastname TEXT NOT NULL, | ||
defendent_firstname TEXT NOT NULL, | ||
defendent_city TEXT NOT NULL, | ||
defendent_state VARCHAR(2) NOT NULL, | ||
defendent_zip INTEGER(5) NOT NULL, | ||
defendent_dob DATE, | ||
gender VARCHAR(1) NOT NULL, | ||
amount_due INTEGER(3) NOT NULL, | ||
amount_paid INTEGER(3) NOT NULL, | ||
closing_disposition TEXT NOT NULL, | ||
disposition_date DATE, | ||
judge TEXT NOT NULL, | ||
owner_lastname TEXT NOT NULL, | ||
owner_firstname TEXT NOT NULL, | ||
owner_city TEXT NOT NULL, | ||
owner_state VARCHAR(2) NOT NULL, | ||
owner_zip INTEGER(5) NOT NULL, | ||
hearing_datetime DATETIME | ||
); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
import os | ||
import csv | ||
import glob | ||
import time | ||
import sqlite3 | ||
from datetime import datetime | ||
|
||
DATABASE_NAME = 'phila_traffic2.db' | ||
DATE_FORMAT = '%m/%d/%Y' | ||
DATETIME_FORMAT = '%m/%d/%Y %I:%M%p' | ||
|
||
def convert_date(date_str): | ||
try: | ||
result = datetime.strptime(date_str, DATE_FORMAT) | ||
return result | ||
except: | ||
return None | ||
|
||
# I believe that this will save the times according to your machine's | ||
# timezone, so if you aren't in EST you'll have to adjust | ||
def convert_datetime(datetime_str): | ||
try: | ||
result = datetime.strptime(datetime_str, DATETIME_FORMAT) | ||
return result | ||
except: | ||
return None | ||
|
||
files = glob.glob('../citations*.tsv') | ||
|
||
for file in files: | ||
filename = os.path.basename(file) | ||
if filename != 'citationsHeaders.tsv': | ||
with open(file, 'rb') as in_file: | ||
print 'Inserting {}...'.format(filename) | ||
conn = sqlite3.connect(DATABASE_NAME) | ||
conn.text_factory = str | ||
cur = conn.cursor() | ||
violations = csv.reader(in_file, delimiter='\t') | ||
next(violations, None) # skip the header | ||
|
||
i = 0 | ||
|
||
for v in violations: | ||
filed_date = convert_date(v[1]) | ||
issue_date = convert_date(v[2]) | ||
defendent_dob = convert_date(v[12]) | ||
disposition_date = convert_date(v[17]) | ||
h_dt = '' | ||
if len(v[25].strip()) == 0: | ||
h_dt = convert_date(v[24]) | ||
else: | ||
h_t = v[25].strip() | ||
h_t = h_t.replace('P', 'PM') | ||
h_t = h_t.replace('A', 'AM') | ||
h_dt = v[24] + ' ' + h_t | ||
h_dt = convert_datetime(h_dt) | ||
try: | ||
cur.execute('INSERT INTO violations values (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)', \ | ||
(v[0], filed_date, issue_date, v[3], v[4], v[5], \ | ||
v[6], v[7], v[8], v[9], v[10], v[11], defendent_dob, \ | ||
v[13], v[14], v[15], v[16], disposition_date, v[18], \ | ||
v[19], v[20], v[21], v[22], v[23], h_dt)) | ||
except sqlite3.IntegrityError as err: | ||
print err | ||
pass | ||
|
||
i = i + 1 | ||
if i % 25000 == 0: | ||
print '{} records inserted so far...'.format(i) | ||
conn.commit() | ||
conn.close() | ||
print 'Done inserting {}'.format(filename) |