In [1]:
# Dependencies and Setup
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import json
import datetime   # handle date times
import re         # regular expression package

import folium
from folium.plugins import HeatMap

import dtale # data Frame visualization

Select only the columns we are interested in, and only the accidents that happened in the state of Texas:

* ID
* Severity
* Start_Time
* End_Time
* Start_Lat
* Start_Lng
* City
* County
* State
* Zipcode
* Country
* Visibility(mi) 
* Weather_Condition
* Precipitation(in)
* Sunrise_Sunset
* Civil_Twilight


Also create a data frame for the metro cities in Texas.



In [None]:
# Load the Texas accident frames from the gzip-compressed pickle cache when
# BOTH cache files exist; otherwise rebuild them from the raw CSV and refresh
# the cache. (Checking only one file would crash on the second read_pickle
# if the other file were missing.)
# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# were produced by this notebook.
if os.path.exists('data/txDF.pkl') and os.path.exists('data/tx_metro_cities.pkl'):
    txDF = pd.read_pickle('data/txDF.pkl', compression='gzip')
    tx_metro_cities_df = pd.read_pickle('data/tx_metro_cities.pkl', compression='gzip')
else:
    # Pickle only the attributes we are interested in, and compress;
    # pickle is much faster/smaller than csv.
    df = pd.read_csv('data/US_Accidents_Dec19.csv', encoding='utf-8')
    columns_of_interest = [
        'ID', 'Severity', 'Start_Time', 'End_Time', 'Start_Lat', 'Start_Lng', 'City',
        'County', 'State', 'Zipcode', 'Country', 'Visibility(mi)',
        'Weather_Condition', 'Precipitation(in)', 'Sunrise_Sunset', 'Civil_Twilight'
    ]
    condition = df['State'] == 'TX'
    # .copy() makes txDF an independent frame rather than a slice of df.
    txDF = df.loc[condition, columns_of_interest].copy()

    # Create df for the selected metro-area cities.
    tx_metro_cities = ['Austin', 'Round Rock', 'Cedar Park',
                       'San Marcos', 'Georgetown', 'Pflugerville',
                       'Hutto', 'Buda', 'Kyle', 'Leander',
                       'Dallas', 'Fort Worth', 'Arlington',
                       'Plano', 'Garland', 'Irving', 'McKinney',
                       'Frisco', 'Denton', 'Richardson', 'Allen', 'El Paso',
                       'Houston', 'The Woodlands', 'Sugar Land',
                       'Baytown', 'Conroe', 'San Antonio', 'New Braunfels',
                       'Schertz', 'Seguin']
    condition = txDF['City'].isin(tx_metro_cities)
    # Later cells add columns to this frame in place, so it must own its
    # data; a plain boolean slice would trigger SettingWithCopyWarning.
    tx_metro_cities_df = txDF[condition].copy()

    txDF.to_pickle("data/txDF.pkl", compression='gzip')
    tx_metro_cities_df.to_pickle("data/tx_metro_cities.pkl", compression='gzip')

In [None]:
# add_datepart (adapted from fastai) -- date feature engineering.
def add_datepart(df, fldname, drop=True,time=False):
    """Expand a date column of ``df`` into separate date-part columns, in place.

    New columns are named ``<prefix><Part>`` where ``<prefix>`` is ``fldname``
    with a trailing ``date``/``Date`` removed, and ``<Part>`` is each of
    'Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear' (plus 'Hour' and
    'Minute' when ``time`` is true). A ``<prefix>Elapsed`` column holding
    whole seconds since 1970-01-01 is also added.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    fldname : str
        Name of the column holding the dates. If it is not already a
        datetime column it is converted with ``pd.to_datetime`` and the
        converted values are written back to ``df[fldname]``.
    drop : bool, default True
        Remove the original ``fldname`` column after the parts are added.
    time : bool, default False
        Also add the 'Hour' and 'Minute' parts.

    Returns
    -------
    None
        ``df`` is mutated in place.
    """
    fld = df[fldname]
    attributes = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear']
    if time:
        attributes = attributes + ['Hour', 'Minute']

    # is_datetime64_any_dtype also copes with extension dtypes (tz-aware,
    # string dtype) on which np.issubdtype raises TypeError.
    if not pd.api.types.is_datetime64_any_dtype(fld):
        df[fldname] = fld = pd.to_datetime(fld)

    targ_pre = re.sub(r'[Dd]ate$', '', fldname)
    for n in attributes:
        if n == 'Week' and hasattr(fld.dt, 'isocalendar'):
            # Series.dt.week was removed in pandas 2.0; isocalendar().week is
            # its replacement. Cast to the dtype of the other parts so the
            # column does not end up as a nullable UInt32.
            values = fld.dt.isocalendar().week.astype(fld.dt.day.dtype)
        else:
            values = getattr(fld.dt, n.lower())
        df[targ_pre + n] = values

    # Seconds since the Unix epoch, computed via timedelta arithmetic so the
    # result does not depend on the column's internal time resolution.
    epoch = pd.Timestamp(0, tz=fld.dt.tz)
    df[targ_pre + 'Elapsed'] = (fld - epoch) // pd.Timedelta(seconds=1)

    if drop:
        df.drop(fldname, axis=1, inplace=True)
        
# Expand Start_Time into date/time part columns, keeping the original column.
add_datepart(tx_metro_cities_df, 'Start_Time', drop=False, time=True)

In [None]:
# Day-of-week number -> day name
def dow(df):
    """Return the English weekday name at position ``df`` (0 = Monday ... 6 = Sunday)."""
    weekday_names = (
        "Monday", "Tuesday", "Wednesday", "Thursday",
        "Friday", "Saturday", "Sunday",
    )
    return weekday_names[df]

# Swap each weekday number for its name via dow(). dow() indexes a sequence
# with the value, so this expects the column to still hold integers.
tx_metro_cities_df['Start_TimeDayofweek'] = (
    tx_metro_cities_df['Start_TimeDayofweek'].apply(dow)
)

In [None]:
# Report the size of the Texas-wide frame, then show the metro frame's columns.
print(f"Number of Columns : {txDF.shape[1]}")
print(f"Number of Rows : {txDF.shape[0]}")
tx_metro_cities_df.columns

In [None]:
# Report the size of the metro-cities frame.
print(f"Number of Columns : {tx_metro_cities_df.shape[1]}")
print(f"Number of Rows : {tx_metro_cities_df.shape[0]}")


In [None]:
# Get fatality data

In [3]:
# make sure to install these packages before running:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import linregress
import numpy as np
import requests
import gmaps
import json 
import os
from sodapy import Socrata

In [6]:
# Connect anonymously to the Austin open-data portal. An unauthenticated
# client only works with public data sets: note None in place of the
# application token, and no username or password.
client = Socrata("data.austintexas.gov", None)

# Example authenticated client (needed for non-public datasets):
# client = Socrata("data.austintexas.gov",
#                  MyAppToken,
#                  username="user@example.com",
#                  password="AFakePassword")

# (year, dataset id) pairs, listed in the order they are requested.
dataset_ids_by_year = (
    (2012, "ergh-7g8p"),
    (2013, "vggi-9ddh"),
    (2014, "gm9p-snyb"),
    (2015, "p658-umsa"),
    (2016, "tiqb-wv3c"),
    (2017, "ijds-pcyq"),
    (2018, "9jd4-zjmx"),
)

# Fetch the first 2000 results of every dataset. The API returns JSON, which
# sodapy converts to a Python list of dictionaries; each list is then turned
# into a pandas DataFrame.
frames_by_year = {}
for year, dataset_id in dataset_ids_by_year:
    results = client.get(dataset_id, limit=2000)
    frames_by_year[year] = pd.DataFrame.from_records(results)

# One named frame per year.
results_df_2012 = frames_by_year[2012]
results_df_2013 = frames_by_year[2013]
results_df_2014 = frames_by_year[2014]
results_df_2015 = frames_by_year[2015]
results_df_2016 = frames_by_year[2016]
results_df_2017 = frames_by_year[2017]
results_df_2018 = frames_by_year[2018]





In [7]:
# Display the 2017 records fetched above as the cell output.
results_df_2017

Unnamed: 0,type,fatal_crash_number,number_of_fatalities,case_number,location,area,date,month,day,hour,...,killed_driver_pass,speeding,ran_red_light_or_stop_sign,dl_status_incident,suspected_impairment,restraint_helmet,type_of_road,failure_to_stop_and_render_aid,x_coord,y_coord
0,Motor Vehicle,1,1,17-0030106,8200 blk N. Lamar Blvd,ED,2017-01-03T00:00:00.000,Jan,Tue,1,...,driver,Y,N,okay,DRIVER,seatbelt worn,high use roadway,N,-97.709153,30.350809
1,Bicycle,2,1,17-0080877,Research Blvd SVRD NB/Riata Trace Pkwy,AD,2017-01-08T00:00:00.000,Jan,Sun,15,...,,N,Y,suspended,NONE,helmet worn,other highway,N,-97.753497,30.426752
2,Motor Vehicle,3,1,17-0071308,E US Hwy 290 WB Svrd/Johnny Morris,CH,2017-01-07T00:00:00.000,Jan,Sat,21,...,other driver,Y,N,no DL,DRIVER,unknown,other highway,Y,-97.623275,30.33123
3,Motor Vehicle,4,1,17-0150225,13500 N US 183 NB,AD,2017-01-15T00:00:00.000,Jan,Sun,2,...,passenger,N,N,no DL,BOTH Drivers,no seatbelt,other highway,N,-97.790548,30.449598
4,Pedestrian,5,1,17-0110340,5900 blk N IH35,ID,2017-01-11T00:00:00.000,Jan,Wed,6,...,,N,N,okay,PED,,IH35,N,-97.706646,30.320762
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,Motorcycle,67,1,17-3380841,8300 Research Blvd Svrd,ID,2017-12-04T00:00:00.000,DEC,Mon,13,...,driver,Y,N,suspended,Driver,helmet worn,other highway,n,-97.715069,30.355643
67,Motorcycle,68,1,17-3390056,800 W Cesar Chavez,GE,2017-12-05T00:00:00.000,DEC,Tue,0,...,Driver,Y,N,suspended,Driver,no helmet,local,n,-97.754409,30.266275
68,Motor Vehicle,69,1,17-3410022,3302 Northeast,ID,2017-12-07T00:00:00.000,DEC,Thu,0,...,passenger,Y,N,no DL,DRIVER,no seatbelt,local,n,-97.67093,30.308889
69,Motorcycle,70,1,17-3460912,8400 Research Blvd SB,ID,2017-12-12T00:00:00.000,DEC,Tue,14,...,driver,unk,n,okay,NONE,no helmet,other highway,n,-97.715831,30.356976
