## D. Kinney - DSC 530 Final Project - Spring 2019
### Analysis of Vehicle Fatalities in 2017

In [None]:
from io import BytesIO
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import thinkstats2
import thinkplot
from urllib.request import urlopen
import zipfile
%matplotlib inline

In [None]:
# Retrieve vehicle fatalities data for 2017 from NHTSA website.
# The archive is downloaded fully into memory, the two CSV members used by
# this notebook are extracted into the working directory, and each is then
# loaded into a DataFrame.
FARS_2017_ZIP_URL = 'ftp://ftp.nhtsa.dot.gov/FARS/2017/National/FARS2017NationalCSV.zip'

# Context managers make sure the network connection and the archive handle
# are released even when the download or the extraction fails.
with urlopen(FARS_2017_ZIP_URL) as response:
    out = BytesIO(response.read())

with zipfile.ZipFile(out) as zipped:
    zipped.extract('accident.csv')
    zipped.extract('person.csv')

df_fatalities_2017 = pd.read_csv('accident.csv')
df_person_2017 = pd.read_csv('person.csv')

In [None]:
# Functions
def PlotHist(series, varName, color, bins):
    """Draw and show a frequency histogram of ``series``.

    Parameters
    ----------
    series : array-like
        Values to bin. Missing values (NaN) are dropped before plotting.
    varName : str
        Text used as the x-axis label.
    color : str
        Matplotlib colour used for the bar faces.
    bins : int or sequence
        Passed to ``plt.hist`` as its ``bins`` argument.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    # Drop missing values explicitly so the binning never depends on how the
    # installed Matplotlib/NumPy versions treat NaN when computing the range.
    values = pd.Series(series).dropna()
    plt.hist(values, bins, facecolor=color, alpha=0.75)
    plt.xlabel(varName)
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()
    
def Descriptives(series):
    """Return ``(mean, variance, standard deviation)`` of ``series``.

    Each statistic comes from the matching ``series`` method (``mean``,
    ``var``, ``std``) and is rounded to two decimal places. The second
    element, called "spread" elsewhere in this notebook, is the variance.
    """
    statistics = (series.mean(), series.var(), series.std())
    return tuple(round(value, 2) for value in statistics)

In [None]:
# Data Cleaning
# Sentinel codes are turned into NaN in *every* column of each frame
# (presumably the FARS "not reported"/"unknown" codes -- confirm against the
# codebook, since a frame-wide replace also hits columns where these numbers
# could be legitimate values).
df_fatalities_2017 = df_fatalities_2017.replace([98, 99], np.nan)

# NOTE(review): the middle pair lists 99 twice; [98, 99] may have been the
# intent -- confirm before changing, the values are kept as they were.
for sentinel_codes in ([998, 999], [99, 99], [9998, 9999]):
    df_person_2017 = df_person_2017.replace(sentinel_codes, np.nan)

# Keep only person records whose vehicle model year is a finite number.
has_model_year = np.isfinite(df_person_2017['MOD_YEAR'])
df_person_2017 = df_person_2017[has_model_year]

In [None]:
# Histograms
# One row per plotted accident variable: (column, x-axis label, colour, bins).
histogram_specs = [
    ('MONTH', "Month of Accident", 'purple', 12),
    ('DAY', "Day of Accident", 'blue', 31),
    ('DAY_WEEK', "Day of Week of Accident", 'lightblue', 7),
    ('HOUR', "Hour of Accident", 'green', 24),
    ('ROUTE', "Type of Road", 'yellow', 9),
    ('MAN_COLL', "Manner of Collision", 'orange', 12),
    ('LGT_COND', "Light Conditions", 'red', 9),
    ('WEATHER', "Weather Conditions", 'gray', 12),
]
for hist_column, hist_label, hist_color, hist_bins in histogram_specs:
    PlotHist(df_fatalities_2017[hist_column], hist_label, hist_color, hist_bins)


In [None]:
# Descriptives
# Each name receives the (mean, variance, std dev) tuple returned by
# Descriptives() for the accident column in the same position.
month, day, dow, hour, route, manner, light, weather = (
    Descriptives(df_fatalities_2017[column_name])
    for column_name in (
        'MONTH', 'DAY', 'DAY_WEEK', 'HOUR',
        'ROUTE', 'MAN_COLL', 'LGT_COND', 'WEATHER',
    )
)

In [None]:
# Print the descriptive statistics as a tab-separated table; the number of
# tabs after each label differs so the columns line up.
print("Variable\tMean\tSpread\tStd Dev")
table_rows = [
    ("Month\t\t{}\t{}\t{}", month),
    ("Day\t\t{}\t{}\t{}", day),
    ("Day of Week\t{}\t{}\t{}", dow),
    ("Hour\t\t{}\t{}\t{}", hour),
    ("Road Type\t{}\t{}\t{}", route),
    ("Manner\t\t{}\t{}\t{}", manner),
    ("Light\t\t{}\t{}\t{}", light),
    ("Weather\t\t{}\t{}\t{}", weather),
]
for row_template, row_stats in table_rows:
    # row_stats is the (mean, variance, std dev) tuple for that variable.
    print(row_template.format(row_stats[0], row_stats[1], row_stats[2]))

In [None]:
# PMF
# Compare the hour-of-accident distribution of a midweek day with Saturday.
# FARS codes DAY_WEEK as 1 = Sunday ... 7 = Saturday, so Wednesday is 4
# (code 3 would select Tuesday, while the accompanying plot labels this
# series "Wednesday").
WEDNESDAY = 4
SATURDAY = 7
df_midweek = df_fatalities_2017[df_fatalities_2017.DAY_WEEK == WEDNESDAY]
df_saturday = df_fatalities_2017[df_fatalities_2017.DAY_WEEK == SATURDAY]
midweek_pmf = thinkstats2.Pmf(df_midweek['HOUR'])
saturday_pmf = thinkstats2.Pmf(df_saturday['HOUR'])

In [None]:
# Draw both hour-of-accident PMFs, aligned on opposite sides so that the two
# series sit next to each other at every hour.
width = 1
thinkplot.PrePlot(2, cols=2)
pmf_series = [
    (midweek_pmf, 'right', 'Wednesday', 'green'),
    (saturday_pmf, 'left', 'Saturday', 'red'),
]
for pmf_to_draw, pmf_align, pmf_label, pmf_color in pmf_series:
    thinkplot.Pmf(pmf_to_draw, align=pmf_align, width=width, label=pmf_label, color=pmf_color)
thinkplot.Config(xlabel="Hour", ylabel="PMF")

In [None]:
# CDF
# Cumulative distribution of the accident hour over all 2017 accident records.
accident_hours = df_fatalities_2017['HOUR']
cdf = thinkstats2.Cdf(accident_hours, label='Hour')

In [None]:
# Plot the accident-hour CDF and render the figure with labelled axes.
thinkplot.Cdf(cdf)
cdf_axis_labels = {'xlabel': 'Hour', 'ylabel': 'CDF'}
thinkplot.Show(**cdf_axis_labels)

In [None]:
# Create new variables describing the clock time of the accident:
#   Time  -- "HH:MM" text
#   Time2 -- "HHMM" text that converts cleanly to an integer (e.g. 1305)
#
# HOUR/MINUTE contain NaN after data cleaning, so a plain integer cast cannot
# be applied to the whole column. Only rows where both parts are known are
# converted; every other row gets NaN in Time/Time2 so that a later
# dropna(subset=['Time', 'Time2']) really removes it.
person_hour = df_person_2017['HOUR']
person_minute = df_person_2017['MINUTE']
time_known = person_hour.notna() & person_minute.notna()

# Zero-pad both parts so that "HHMM" grows monotonically with clock time
# (without padding 13:05 would become "135", which sorts below 1:45 -> "145").
hh = person_hour[time_known].astype(int).astype(str).str.zfill(2)
mm = person_minute[time_known].astype(int).astype(str).str.zfill(2)

# assign() aligns on the index and returns a new frame; rows missing from
# hh/mm are filled with NaN.
df_person_2017 = df_person_2017.assign(
    Time=hh.str.cat(mm, sep=":"),
    Time2=hh.str.cat(mm, sep=""),
)

In [None]:
# Set up sample series
# Draw 2000 person rows with thinkstats2.SampleRows, then discard any sampled
# row that lacks one of the four analysed fields (so the sample may end up
# with fewer than 2000 rows).
required_fields = ['AGE', 'MOD_YEAR', 'Time', 'Time2']
sample = thinkstats2.SampleRows(df_person_2017, 2000).dropna(subset=required_fields)
age = sample['AGE']
time = sample['Time']
time2 = sample['Time2']
vehicle_year = sample['MOD_YEAR']

In [None]:
# Scatter plot of the sampled persons' age against the accident time text.
thinkplot.Scatter(age, time)
age_time_labels = {'xlabel': 'Age of Driver', 'ylabel': 'Time of Accident'}
thinkplot.Show(**age_time_labels)

In [None]:
# Covariance and correlation between age and the numeric time value,
# both cast to integers once and reused for the two statistics.
age_as_int = age.astype(int)
time2_as_int = time2.astype(int)
print(thinkstats2.Cov(age_as_int, time2_as_int))
print(thinkstats2.Corr(age_as_int, time2_as_int))

In [None]:
# Scatter plot of the sampled persons' age against the vehicle model year.
thinkplot.Scatter(age, vehicle_year)
age_year_labels = {'xlabel': 'Age of Driver', 'ylabel': 'Vehicle Year of Manufacture'}
thinkplot.Show(**age_year_labels)

In [None]:
# Covariance and correlation between age and vehicle model year,
# both cast to integers once and reused for the two statistics.
driver_age_int = age.astype(int)
model_year_int = vehicle_year.astype(int)
print(thinkstats2.Cov(driver_age_int, model_year_int))
print(thinkstats2.Corr(driver_age_int, model_year_int))

In [None]:
class CorrelationPermute(thinkstats2.HypothesisTest):
    """Permutation test built on the absolute correlation of two sequences.

    ``self.data`` is expected to be a pair ``(xs, ys)``. The two methods
    below are presumably the hooks that ``thinkstats2.HypothesisTest`` calls
    while computing a p-value -- confirm against the thinkstats2 source.
    """

    def TestStatistic(self, data):
        """Return the absolute value of ``thinkstats2.Corr`` for the pair ``data``."""
        first, second = data
        return abs(thinkstats2.Corr(first, second))

    def RunModel(self):
        """Return the stored pair with the first sequence randomly permuted.

        Only the first sequence is shuffled (``np.random.permutation``); the
        second one is returned unchanged.
        """
        first, second = self.data
        shuffled_first = np.random.permutation(first)
        return shuffled_first, second

In [None]:
# Run the correlation permutation test on accident hour vs. day of week,
# using only the accident rows where both values are present.
df_temp = df_fatalities_2017.dropna(subset=['HOUR', 'DAY_WEEK'])
hour_values = df_temp['HOUR'].values
day_of_week_values = df_temp['DAY_WEEK'].values
data = (hour_values, day_of_week_values)
ht = CorrelationPermute(data)
pvalue = ht.PValue()

In [None]:
# Show the p-value returned by ht.PValue() for the hour vs. day-of-week permutation test.
print(pvalue)

In [None]:
# Correlation between accident hour and day of week.
# HOUR holds NaN after data cleaning and NaN cannot be cast to int, so rows
# missing either value are dropped before the integer conversion.
hour_and_day = df_fatalities_2017[['HOUR', 'DAY_WEEK']].dropna().astype(int)
thinkstats2.Corr(hour_and_day['HOUR'], hour_and_day['DAY_WEEK'])

In [None]:
# Ordinary least squares fit of day of week on accident hour; the summary
# table is left as the last expression so the notebook displays it.
formula = 'DAY_WEEK ~ HOUR'
model = smf.ols(formula=formula, data=df_fatalities_2017)
results = model.fit()
results.summary()