# Dependencies and Setup

In [2]:
# One-time setup (run once, in its own cell): %pip install sodapy==2.1.0

Collecting sodapy
  Downloading https://files.pythonhosted.org/packages/9e/74/95fb7d45bbe7f1de43caac45d7dd4807ef1e15881564a00eef489a3bb5c6/sodapy-2.1.0-py2.py3-none-any.whl
Installing collected packages: sodapy
Successfully installed sodapy-2.1.0


In [11]:
# make sure to install these packages before running:
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import json
import datetime   # handle date times
import re         # regular expression package
from sqlalchemy import create_engine
import requests
from sodapy import Socrata

Select only the columns we are interested in, and only the accidents that happened in the Austin metro area between 2016 and 2019:

* ID
* Severity
* Start_Time
* End_Time
* Start_Lat
* Start_Lng
* City
* County
* State
* Zipcode
* Country
* Visibility(mi) 
* Weather_Condition
* Precipitation(in)
* Sunrise_Sunset
* Civil_Twilight


Also create a DataFrame restricted to the Austin metro cities in Texas.



In [12]:
# Load the Austin-metro accident subset, using a gzip-compressed pickle as a cache.
# If the cache exists it is loaded directly; otherwise the subset is built from the
# raw CSV and written to the cache (pickle is much faster/smaller than csv).
if os.path.exists('data/tx_metro_cities.pkl'):
    # NOTE(review): unpickling can execute arbitrary code -- only load a cache file
    # that this notebook itself produced.
    tx_metro_cities_df = pd.read_pickle('data/tx_metro_cities.pkl', compression='gzip')
else:
    df = pd.read_csv('data/US_Accidents_Dec19.csv', encoding='utf-8')
    columns_of_interest = [
        'ID', 'Severity', 'Start_Time', 'End_Time', 'Start_Lat', 'Start_Lng', 'City',
        'County', 'State', 'Zipcode', 'Country', 'Visibility(mi)',
        'Weather_Condition', 'Precipitation(in)', 'Sunrise_Sunset', 'Civil_Twilight',
    ]
    # Single .loc call (row mask + column list) instead of chained df[...][...] indexing.
    txDF = df.loc[df['State'] == 'TX', columns_of_interest]
    # Cities that make up the metro area of interest.
    tx_metro_cities = ['Austin', 'Round Rock', 'Cedar Park',
                       'San Marcos', 'Georgetown', 'Pflugerville',
                       'Hutto', 'Leander']
    condition = txDF['City'].isin(tx_metro_cities)
    # Explicit copy: later cells add and overwrite columns on this frame, which would
    # otherwise raise SettingWithCopyWarning because it is a filtered slice of txDF.
    tx_metro_cities_df = txDF.loc[condition].copy()
    tx_metro_cities_df.to_pickle("data/tx_metro_cities.pkl", compression='gzip')
    



In [13]:
# add_datepart, adapted from fastai
# Date feature engineering: expands a date column into the corresponding
# 'Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear' (plus 'Hour', 'Minute'
# when time=True) and 'Elapsed' columns.
def _iso_week(fld):
    """Return the ISO week number for each timestamp of the datetime Series ``fld``."""
    # Series.dt.week was deprecated in pandas 1.1 and removed in 2.0; prefer
    # isocalendar() when available and fall back to .week on older pandas.
    if hasattr(fld.dt, 'isocalendar'):
        week = fld.dt.isocalendar().week
        # isocalendar() yields a nullable UInt32 column; convert to plain numpy
        # dtypes (float with NaN when timestamps are missing, int64 otherwise).
        return week.astype('float64') if week.isna().any() else week.astype('int64')
    return fld.dt.week


def add_datepart(df, fldname, drop=True,time=False):
    """Add date-part feature columns derived from ``df[fldname]``, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    fldname : str
        Name of the date column. If it is not already a datetime column it is
        parsed with ``pd.to_datetime`` and the parsed values are stored back.
    drop : bool, default True
        Drop the ``fldname`` column after the new columns have been added.
    time : bool, default False
        Also add 'Hour' and 'Minute' columns.

    Returns
    -------
    None
        New columns are named ``<prefix><Part>`` where the prefix is ``fldname``
        with a trailing 'date'/'Date' removed. ``<prefix>Elapsed`` holds whole
        seconds since 1970-01-01 (UTC for timezone-aware columns).
    """
    fld = df[fldname]
    attributes = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear']
    if time:
        attributes = attributes + ['Hour', 'Minute']
    # is_datetime64_any_dtype also recognises timezone-aware columns, which
    # np.issubdtype cannot interpret.
    if not pd.api.types.is_datetime64_any_dtype(fld):
        df[fldname] = fld = pd.to_datetime(fld)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    for n in attributes:
        if n == 'Week':
            df[targ_pre+n] = _iso_week(fld)
        else:
            df[targ_pre+n] = getattr(fld.dt, n.lower())
    # Compute elapsed seconds with timedelta arithmetic so the result does not
    # depend on the column's storage resolution; missing timestamps become NaN.
    utc_naive = fld if fld.dt.tz is None else fld.dt.tz_convert('UTC').dt.tz_localize(None)
    df[targ_pre+'Elapsed'] = (utc_naive - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)
    if drop:
        df.drop(fldname, axis=1, inplace=True)
        
# Expand Start_Time into date/time feature columns; keep the original column.
add_datepart(
    tx_metro_cities_df,
    'Start_Time',
    drop=False,
    time=True,
)

In [14]:
# use name for DOW
def dow(df):
    """Translate a day-of-week code (0 = Monday ... 6 = Sunday) into its English name.

    Parameters
    ----------
    df : int or str
        Day-of-week code in the range 0-6. A value that is already one of the
        day names is returned unchanged, so applying the conversion to an
        already-converted column is harmless.

    Returns
    -------
    str
        The day name.

    Raises
    ------
    IndexError
        If the code is outside 0-6 (negative codes are rejected rather than
        silently counting back from Sunday).
    ValueError
        If a string is passed that is not one of the day names.
    """
    days=["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]
    if isinstance(df, str):
        if df not in days:
            raise ValueError(f"not a day-of-week name: {df!r}")
        return df
    if not 0 <= df < len(days):
        raise IndexError(f"day-of-week code out of range 0-6: {df!r}")
    return days[df]

# Replace the numeric day-of-week codes with their day names.
weekday_codes = tx_metro_cities_df['Start_TimeDayofweek']
tx_metro_cities_df['Start_TimeDayofweek'] = weekday_codes.apply(dow)

In [5]:
# Report the size and column names of the accident frame, then preview it.
accident_rows, accident_cols = tx_metro_cities_df.shape
print(f"Number of Columns : {accident_cols}")
print(f"Number of Rows : {accident_rows}")
print(tx_metro_cities_df.columns)

tx_metro_cities_df.head()


Number of Columns : 25
Number of Rows : 62609
Index(['ID', 'Severity', 'Start_Time', 'End_Time', 'Start_Lat', 'Start_Lng',
       'City', 'County', 'State', 'Zipcode', 'Country', 'Visibility(mi)',
       'Weather_Condition', 'Precipitation(in)', 'Sunrise_Sunset',
       'Civil_Twilight', 'Start_TimeYear', 'Start_TimeMonth', 'Start_TimeWeek',
       'Start_TimeDay', 'Start_TimeDayofweek', 'Start_TimeDayofyear',
       'Start_TimeHour', 'Start_TimeMinute', 'Start_TimeElapsed'],
      dtype='object')


Unnamed: 0,ID,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,City,County,State,Zipcode,...,Civil_Twilight,Start_TimeYear,Start_TimeMonth,Start_TimeWeek,Start_TimeDay,Start_TimeDayofweek,Start_TimeDayofyear,Start_TimeHour,Start_TimeMinute,Start_TimeElapsed
261007,A-261009,2,2016-11-30 16:03:54,2016-11-30 17:20:00,30.336502,-97.755646,Austin,Travis,TX,78731,...,Day,2016,11,48,30,Wednesday,335,16,3,1480521834
261008,A-261010,2,2016-11-30 16:32:18,2016-11-30 17:47:02,30.328165,-97.694305,Austin,Travis,TX,78752-2826,...,Day,2016,11,48,30,Wednesday,335,16,32,1480523538
261009,A-261011,2,2016-11-30 16:31:45,2016-11-30 17:46:34,30.326077,-97.692307,Austin,Travis,TX,78752,...,Day,2016,11,48,30,Wednesday,335,16,31,1480523505
261026,A-261028,2,2016-11-30 17:11:22,2016-11-30 17:41:09,30.332523,-97.686707,Austin,Travis,TX,78752,...,Day,2016,11,48,30,Wednesday,335,17,11,1480525882
261037,A-261039,2,2016-11-30 17:16:42,2016-11-30 17:46:32,30.292852,-97.747017,Austin,Travis,TX,78705,...,Day,2016,11,48,30,Wednesday,335,17,16,1480526202


# Austin Fatality Data

In [15]:
# Unauthenticated client only works with public data sets. Note 'None'
# in place of application token, and no username or password:
client = Socrata("data.austintexas.gov", None)

# Socrata dataset identifier of the fatality data set for each year.
FATALITY_DATASET_IDS = {
    2012: "ergh-7g8p",
    2013: "vggi-9ddh",
    2014: "gm9p-snyb",
    2015: "p658-umsa",
    2016: "tiqb-wv3c",
    2017: "ijds-pcyq",
    2018: "9jd4-zjmx",
}


def fetch_fatalities(dataset_id, limit=2000):
    """Fetch one data set from the Socrata client and return it as a DataFrame.

    Parameters
    ----------
    dataset_id : str
        Socrata dataset identifier passed to ``client.get``.
    limit : int, default 2000
        Maximum number of records to request.

    Returns
    -------
    pandas.DataFrame
        One row per record; the API response (JSON converted by sodapy to a
        Python list of dictionaries) is loaded with ``DataFrame.from_records``.
    """
    return pd.DataFrame.from_records(client.get(dataset_id, limit=limit))


# First 2000 results of every yearly data set, keyed by year.
fatalities_by_year = {
    year: fetch_fatalities(dataset_id)
    for year, dataset_id in FATALITY_DATASET_IDS.items()
}

# Keep the per-year names that the rest of the notebook refers to.
results_df_2012 = fatalities_by_year[2012]
results_df_2013 = fatalities_by_year[2013]
results_df_2014 = fatalities_by_year[2014]
results_df_2015 = fatalities_by_year[2015]
results_df_2016 = fatalities_by_year[2016]
results_df_2017 = fatalities_by_year[2017]
results_df_2018 = fatalities_by_year[2018]





In [10]:
# Concatenate the 2016-2018 fatality data.
# The yearly frames do not all have the same columns, so pandas must align them.
# sort=True is passed explicitly to keep the alphabetical column order and to
# avoid the FutureWarning about the default sort behaviour changing.
results_project = pd.concat(
    [results_df_2016, results_df_2017, results_df_2018],
    sort=True,
)

# Report the size and column names of the combined frame, then preview it.
print(f"Number of Columns : {len(results_project.columns)}")
print(f"Number of Rows : {len(results_project)}")
print(results_project.columns)

results_project.head()


Number of Columns : 30
Number of Rows : 221
Index(['area', 'case_number', 'case_status', 'charge', 'coord_x', 'date',
       'day', 'dl_status', 'dl_status_incident',
       'failure_to_stop_and_render_aid', 'fatal_crash_number', 'ftsra', 'hour',
       'killed_driver_pass', 'location', 'month', 'number_of_fatalities',
       'ran_red_light_or_stop_sign', 'related', 'restraint_helmet',
       'restraint_or_helmet', 'restraint_type', 'speeding',
       'suspected_impairment', 'time', 'type', 'type_of_road', 'victim',
       'x_coord', 'y_coord'],
      dtype='object')


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


Unnamed: 0,area,case_number,case_status,charge,coord_x,date,day,dl_status,dl_status_incident,failure_to_stop_and_render_aid,...,restraint_or_helmet,restraint_type,speeding,suspected_impairment,time,type,type_of_road,victim,x_coord,y_coord
0,AD,16-0061621,,FTSRA,-97.793878,2016-01-06T00:00:00.000,Wed,okay,,,...,,,N,PED,23:14,Pedestrian,local street,,,30.475234
1,HE,16-0140992,,,-97.717471,2016-01-14T00:00:00.000,Thu,okay,,,...,,,N,UNKNOWN,14:46,Pedestrian,high use roadway,,,30.231659
2,FR,16-0221932,,,-97.739337,2016-01-22T00:00:00.000,Fri,okay,,,...,unknown,,Y,DRIVER,23:29,Motor Vehicle,high use roadway,driver,,30.170877
3,ED,16-0301529,,,-97.679087,2016-01-30T00:00:00.000,Sat,okay,,,...,,,N,UNKNOWN,20:01,Pedestrian,high use roadway,,,30.402741
4,HE,16-0450263,,,-97.698371,2016-02-14T00:00:00.000,Sun,okay,,,...,no helmet,,Y,DRIVER,02:22,Motorcycle,local street,motorcyclist,,30.191989


In [21]:
# Inspect the dtype of the coord_x column.
results_project.dtypes["coord_x"]

dtype('float64')

In [20]:
# Cast the three coordinate columns to float.
coordinate_columns = ["coord_x", "x_coord", "y_coord"]
results_project = results_project.astype(dict.fromkeys(coordinate_columns, float))


# Save to SQLite


In [8]:
# Create the engine for the SQLite database file, then write the accident
# frame to the 'austinAccidents' table, replacing the table if it exists.
engine = create_engine('sqlite:///data/AUSaccidents.db', echo=False)
tx_metro_cities_df.to_sql(
    name='austinAccidents',
    con=engine,
    if_exists='replace',
)


In [9]:
# Write the combined fatality frame to the 'austinFatalities' table,
# replacing the table if it exists.
results_project.to_sql(
    name='austinFatalities',
    con=engine,
    if_exists='replace',
)
