In [24]:
import pandas as pd
import numpy as np
import pickle_funcs as pk
from datetime import datetime as dt

# Running the Data Query

I'm getting this data from the [NHTSA](https://www-fars.nhtsa.dot.gov/QueryTool/QuerySection/SelectYear.aspx)

I could just do a massive data dump, but I'm not sure this makes sense. It seems better to build on columns as needed. 

Apparently it isn't possible to do multiyear queries? This makes time series annoying but that isn't too bad (are time series really central to this sort of analysis?)

I realized that the case numbers are not unique, which means that it would be hard to append new columns later. It may be possible to match on the case number, the state number, and some other identifier when appending a new column, but I don't think I'm going to do this at the moment. 

So I can just run another query if I want to add columns.

Process for pulling more data:

* Select a year from the dropdown in the above link
* Option 1 in the second screen probably makes the most sense 
* Select desired fields from either table 
* Select "Case Listing"
* Make sure that Case Number and State are selected for the identifier fields
* Select output stats
* Name the export
* Preview data
* Export via txt (produces a tab-separated values file)  



# Creating a DataFrame

In [25]:
# Load the tab-separated export. accdate and acctime are kept as strings and
# the first column is used as the index.
data = pd.read_csv(
    'data.txt',
    sep='\t',
    dtype={'accdate': str, 'acctime': str},
    index_col=0,
)

In [26]:
# Summary statistics for the numeric columns. In the saved output the
# 'Unnamed: 8' column has a count of 0, i.e. it holds no values.
data.describe()



Unnamed: 0,statenum,casenum,speeding,Unnamed: 8
count,54559.0,54559.0,54559.0,0.0
mean,27.251636,672.195403,0.258472,
std,16.144258,708.956005,0.437799,
min,1.0,1.0,0.0,
25%,12.0,179.0,0.0,
50%,27.0,427.0,0.0,
75%,42.0,851.0,1.0,
max,56.0,3195.0,1.0,


In [27]:
# There are tabs at the end of the lines in these files, which show up as an
# extra empty column. Drop it; errors='ignore' keeps the cell re-runnable once
# the column is already gone (a bare `del` raises KeyError the second time).
data = data.drop(columns=['Unnamed: 8'], errors='ignore')

In [28]:
# Preview the first rows now that the empty trailing column has been removed
data.head()

Unnamed: 0_level_0,statenum,casenum,accdate,acctime,speeding,alcres,drugres1
Obs.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,6,1397,9022015,1617,1,996,0
2,6,1734,8012015,852,0,996,0
3,27,263,9212015,1225,0,996,0
4,30,113,8102015,21,1,262,605
5,37,47,1022015,1905,0,0,0


In [29]:
# Show the type of the loaded object
type(data)

pandas.core.frame.DataFrame

In [42]:
# Column dtypes. NOTE(review): the execution count (42) is higher than that of
# the date-conversion cell (40) further down, so the saved output reflects the
# frame after conversion, not its state at this point in the notebook.
data.dtypes

statenum     int64
casenum      int64
accdate     object
acctime     object
speeding     int64
alcres      object
drugres1    object
dtype: object

# Convert Dates to Python Datetime Object

In [31]:

def convert_date(date_string, time_string):
    """Build a datetime from an MMDDYYYY date string and an HHMM time string.

    The two pieces are joined with ' : ' and parsed in a single strptime
    call. Raises ValueError when the joined text does not match the format.
    """
    combined = str(date_string) + ' : ' + str(time_string)
    parsed = dt.strptime(combined, '%m%d%Y : %H%M')
    return parsed


# Quick check of the helper on one hand-written date/time pair
sample_date = '09022015'
sample_time = '1617'

convert_date(sample_date, sample_time)

datetime.datetime(2015, 9, 2, 16, 17)

In [41]:
# NOTE(review): this cell's execution count (41) is later than the conversion
# cell (40) further down, and the saved output already shows accdate as full
# timestamps. On a fresh top-to-bottom run accdate would still hold the raw
# date strings at this point.
data.head()

Unnamed: 0_level_0,statenum,casenum,accdate,acctime,speeding,alcres,drugres1
Obs.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,6,1397,2015-09-02 16:17:00,1617,1,996,0
2,6,1734,2015-08-01 08:52:00,852,0,996,0
3,27,263,2015-09-21 12:25:00,1225,0,996,0
4,30,113,2015-08-10 00:21:00,21,1,262,605
5,37,47,2015-01-02 19:05:00,1905,0,0,0


In [34]:
# Row count before the rows with acctime '9999' are dropped below
len(data.index)

54559

In [35]:
# Count the rows whose acctime is '9999', which is not a valid HHMM time.
# The whole column is compared at once instead of being looped over in Python.
count = int((data['acctime'] == '9999').sum())

count

331

In [36]:
# Derive the expected length from the data instead of hard-coding the two
# numbers copied from earlier outputs, so the message stays correct if the
# input file changes.
print('the new length should be:', int((data['acctime'] != '9999').sum()))

the new length should be: 54228


In [37]:
# Remove every row whose acctime is '9999', identified by its index label
is_unparseable = data.acctime == '9999'
data = data.drop(data.index[is_unparseable.values])

In [38]:
# Row count after the drop; it should equal the expected length printed above
len(data.index)

54228

In [39]:
# Fewer than ten of the time values end in '99', which is not a valid minute
# value. Any time whose minutes start with a digit above 5 has its minutes
# (not its hour) replaced with '00' so that it can be parsed later.
#
# This is done on the whole column, addressed by name, rather than cell by
# cell through .iloc[row, 3], which is slow and silently targets the wrong
# column if the column order ever changes.
acctime = data['acctime']
invalid_minutes = acctime.str[2].astype(int) > 5
data['acctime'] = acctime.where(~invalid_minutes, acctime.str[:2] + '00')

In [40]:
# The dates are all the same length and the times are all the same length,
# so the single fixed format used by convert_date applies to every row.
#
# Build all combined timestamps in one pass and assign the column once,
# instead of reading and writing one cell at a time through .iloc. If any row
# fails to parse, the ValueError is raised before the column is touched, so
# the frame is never left half-converted.
#
# The result is wrapped in an object-dtype Series so the column keeps holding
# the values returned by convert_date and its dtype stays `object`, as before.
timestamps = [
    convert_date(date, time)
    for date, time in zip(data['accdate'], data['acctime'])
]
data['accdate'] = pd.Series(timestamps, index=data.index, dtype=object)



# Write Object to File

In [43]:
# Write the cleaned frame to a pickle file, creating the main data file.
# test=False is passed because, per the original note, checking whether the
# dataframes are exactly the same runs into issues with pandas.
# NOTE(review): what `test` does inside pickle_funcs is not visible from this
# notebook; confirm against that module. The round trip is checked by hand in
# the cells that follow.

pk.pickle_object(data, 'data', test=False)

In [44]:
# Load the pickled dataframe back in so it can be compared with `data`.
# NOTE(review): presumably a thin wrapper around pickle loading; only use it
# on files this notebook wrote, since unpickling can run arbitrary code.
new_data = pk.unpickle_object('data')

In [45]:
# Element-wise comparison of the reloaded frame against the original; the
# array holds True in every cell that does not match.
# A plain != reports NaN against NaN as a mismatch (NaN != NaN is True), so
# cells that are missing on both sides are excluded. The shape is checked
# first because the element-wise comparison is only meaningful when both
# frames have the same dimensions.
# This cell should return a zero
assert new_data.shape == data.shape, 'reloaded frame has a different shape'
values_differ = new_data.values != data.values
both_missing = pd.isnull(new_data.values) & pd.isnull(data.values)
test = values_differ & ~both_missing
test.sum()

0