In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style = 'darkgrid')

import geopandas as gpd
from keplergl import KeplerGl

_________
**Reading in Accident Data**
__________

In [2]:
# Load every accident record into one dataframe and preview the first row.
accidents_csv = '../data/US_Accidents_Dec20.csv'
acc_df = pd.read_csv(accidents_csv)
acc_df.head(n=1)

Unnamed: 0,ID,Source,TMC,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-1,MapQuest,201.0,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,,,...,False,False,False,False,False,False,Night,Night,Night,Night


In [3]:
# Normalise the headers: lowercase, with spaces swapped for underscores.
acc_df.columns = [header.lower().replace(' ', '_') for header in acc_df.columns]

In [4]:
# Remove columns that are unnecessary for this analysis or that contain many nulls.
unused_columns = [
    'id', 'source', 'end_lat', 'end_lng',
    'description', 'number', 'street',
    'country', 'timezone', 'weather_timestamp',
    'airport_code', 'zipcode', 'tmc',
]
acc_df = acc_df.drop(columns=unused_columns)

__________________
**Reading in Population Density Data**
_______________

In [5]:
# Load county population density, using the first CSV column as the index.
pop_density_csv = '../data/pop_density_county.csv'
pop_dense_cnty = pd.read_csv(pop_density_csv, index_col=0)

In [6]:
# Normalise the headers: lowercase, with spaces swapped for underscores.
pop_dense_cnty.columns = [header.lower().replace(' ', '_') for header in pop_dense_cnty.columns]

In [7]:
# Preview the first row of the population density table.
pop_dense_cnty.head(n=1)

Unnamed: 0_level_0,county_name,density_mi
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1
1,New York,69468


_________
**Read in License Data**, Format into DF, Prepare for Concatenation
________

In [8]:
# Load the per-state driver's licence counts.
license_csv = '../data/licensed_drivers_by_state.csv'
license_df = pd.read_csv(license_csv)

In [9]:
# Preview the first row of the licence table.
license_df.head(n=1)

Unnamed: 0,STATE,UNDER,20-24,25-29,30-34,35-39,40-44,45-49,50-54,55-59,60-64,65-69,70-74,75-79,80-84,OVER,total_drivers
0,Alabama,219648,327163,347679,320583,307331,292967,311675,309970,339239,327915,286986,242004,168934,113483,110574,4026151


In [10]:
# Normalise license_df headers: lowercase, with spaces swapped for underscores.
license_df.columns = [header.lower().replace(' ', '_') for header in license_df.columns]

In [11]:
# Strip the comma separators from total_drivers, then store the column as integers.
drivers_without_commas = license_df['total_drivers'].str.replace(',', '')
license_df['total_drivers'] = drivers_without_commas.astype(int)

In [12]:
# Keep just the state name and the total number of licensed drivers.
license_df = license_df[['state', 'total_drivers']]

In [13]:
# Lowercase the state names so they can be matched against the other dataframes.
license_df = license_df.assign(state=license_df['state'].str.lower())

In [14]:
# Rename the abbreviated D.C. entry to 'district of columbia' so the state names line up
# with the other dataframes when they are concatenated.
# Whitespace is stripped first so the match does not depend on a trailing space in the raw
# label, and the result is assigned back to the column: calling replace(inplace=True) on a
# column selection can operate on a temporary copy (e.g. under pandas copy-on-write) and
# silently leave license_df unchanged.
license_df['state'] = (
    license_df['state']
    .str.strip()
    .replace('dist. of col.', 'district of columbia')
)

In [15]:
# Index by state and remove Alaska and Hawaii, which are not in the accident data.
license_df = license_df.set_index('state').drop(index=['alaska', 'hawaii'])

In [16]:
# Sanity check: (rows, columns) after dropping Alaska and Hawaii.
license_df.shape

(49, 1)

In [17]:
# Order the states alphabetically, then make sure every label is trimmed and lowercase.
license_df = license_df.sort_index()
license_df.index = license_df.index.str.strip().str.lower()

______
**Read in Shape Files**, Format into DF and Prepare for Concatenation
______

In [18]:
# Load the state boundary shapefile into a GeoDataFrame.
state_shapefile = '../data/cb_2018_us_state_500k/cb_2018_us_state_500k.shp'
national_shape = gpd.read_file(state_shapefile)

In [19]:
# Keep only the states that appear in the accident data by removing Alaska, Hawaii and the
# island territories. Rows are selected by postal code rather than by positional row label,
# so the cell does not depend on the row order of the shapefile and can be re-run safely.
# The postal-code column is 'STUSPS' in the raw file and 'stusps' once the headers have been
# lowercased, so both spellings are accepted.
excluded_postal_codes = ['AK', 'HI', 'AS', 'GU', 'MP', 'PR', 'VI']
postal_col = 'STUSPS' if 'STUSPS' in national_shape.columns else 'stusps'
national_shape = national_shape[~national_shape[postal_col].isin(excluded_postal_codes)]

In [20]:
# Normalise the headers: lowercase, with spaces swapped for underscores.
national_shape.columns = [header.lower().replace(' ', '_') for header in national_shape.columns]

In [21]:
# Lowercase the state names so they match the other dataframes.
lowercase_names = national_shape['name'].str.lower()
national_shape['name'] = lowercase_names

In [22]:
# Sanity check: (rows, columns) after removing the excluded regions.
national_shape.shape

(49, 10)

In [23]:
# Index by state name so the frame lines up with the others during concatenation.
national_shape = national_shape.set_index('name')

In [24]:
# Order the states alphabetically, then make sure every label is trimmed and lowercase.
national_shape = national_shape.sort_index()
national_shape.index = national_shape.index.str.strip().str.lower()

________
**Create State Accident Counts**, Form into DF and Prepare for Concatenation
________

In [25]:
# Count accidents per state. count() tallies the non-null severity values in each group;
# the severity column is renamed to 'count' in a later cell.
state_count = acc_df.groupby('state')[['severity']].count()
state_count.head(n=5)

Unnamed: 0_level_0,severity
state,Unnamed: 1_level_1
AL,56989
AR,5089
AZ,93038
CA,971856
CO,54028


In [26]:
# Move the state abbreviations out of the index into a regular column so they can be
# mapped to full state names.
state_count.reset_index(inplace=True)

In [27]:
# Confirm the index is now a plain numeric range.
state_count.index

RangeIndex(start=0, stop=49, step=1)

In [28]:
# Lookup from two-letter postal abbreviation to lowercase full state name.
state_remap = {
    'AL': 'alabama',
    'AR': 'arkansas',
    'AZ': 'arizona',
    'CA': 'california',
    'CO': 'colorado',
    'CT': 'connecticut',
    'DC': 'district of columbia',
    'DE': 'delaware',
    'FL': 'florida',
    'GA': 'georgia',
    'IA': 'iowa',
    'ID': 'idaho',
    'IL': 'illinois',
    'IN': 'indiana',
    'KS': 'kansas',
    'KY': 'kentucky',
    'LA': 'louisiana',
    'MA': 'massachusetts',
    'MD': 'maryland',
    'ME': 'maine',
    'MI': 'michigan',
    'MN': 'minnesota',
    'MO': 'missouri',
    'MS': 'mississippi',
    'MT': 'montana',
    'NC': 'north carolina',
    'ND': 'north dakota',
    'NE': 'nebraska',
    'NH': 'new hampshire',
    'NJ': 'new jersey',
    'NM': 'new mexico',
    'NV': 'nevada',
    'NY': 'new york',
    'OH': 'ohio',
    'OK': 'oklahoma',
    'OR': 'oregon',
    'PA': 'pennsylvania',
    'RI': 'rhode island',
    'SC': 'south carolina',
    'SD': 'south dakota',
    'TN': 'tennessee',
    'TX': 'texas',
    'UT': 'utah',
    'VA': 'virginia',
    'VT': 'vermont',
    'WA': 'washington',
    'WI': 'wisconsin',
    'WV': 'west virginia',
    'WY': 'wyoming',
}

In [29]:
# Replace each state abbreviation with its full lowercase state name.
full_state_names = state_count['state'].map(state_remap)
state_count['state'] = full_state_names

In [30]:
# Rename the severity column to 'count', since it now holds the accident tally per state.
state_count = state_count.rename(columns={'severity': 'count'})

In [31]:
# Preview the first row of the per-state counts.
state_count.head(n=1)

Unnamed: 0,state,count
0,alabama,56989


In [32]:
# Index by state so the counts can be concatenated with the licence data.
state_count = state_count.set_index('state')

In [33]:
# Order the states alphabetically, then make sure every label is trimmed and lowercase.
state_count = state_count.sort_index()
state_count.index = state_count.index.str.strip().str.lower()

In [34]:
# Sanity check: (rows, columns) of the per-state accident counts.
state_count.shape

(49, 1)

______
**Concatenating Grouped DF's**
_______

In [35]:
# Join the accident counts, state shapes and licence totals side by side on the state index.
# verify_integrity=True raises if the combined index contains duplicate labels.
frames_by_state = [state_count, national_shape, license_df]
national_group = pd.concat(frames_by_state, axis='columns', verify_integrity=True)
national_group.shape

(49, 11)

In [36]:
# List the columns contributed by the three concatenated frames.
national_group.columns

Index(['count', 'statefp', 'statens', 'affgeoid', 'geoid', 'stusps', 'lsad',
       'aland', 'awater', 'geometry', 'total_drivers'],
      dtype='object')

In [37]:
# Preview the first row of the combined frame.
national_group.head(n=1)

Unnamed: 0,count,statefp,statens,affgeoid,geoid,stusps,lsad,aland,awater,geometry,total_drivers
alabama,56989,1,1779775,0400000US01,1,AL,0,131174048583,4593327154,"MULTIPOLYGON (((-88.05338 30.50699, -88.05109 ...",4026151


In [38]:
# Accidents per licensed driver in each state.
national_group['acc_per_cap'] = national_group['count'].div(national_group['total_drivers'])

In [39]:
# Preview the first five rows, including the new acc_per_cap column.
national_group.head(n=5)

Unnamed: 0,count,statefp,statens,affgeoid,geoid,stusps,lsad,aland,awater,geometry,total_drivers,acc_per_cap
alabama,56989,1,1779775,0400000US01,1,AL,0,131174048583,4593327154,"MULTIPOLYGON (((-88.05338 30.50699, -88.05109 ...",4026151,0.014155
arizona,93038,4,1779777,0400000US04,4,AZ,0,294198551143,1027337603,"POLYGON ((-114.81629 32.50804, -114.81432 32.5...",5369210,0.017328
arkansas,5089,5,68085,0400000US05,5,AR,0,134768872727,2962859592,"POLYGON ((-94.61783 36.49941, -94.61765 36.499...",2153929,0.002363
california,971856,6,1779778,0400000US06,6,CA,0,403503931312,20463871877,"MULTIPOLYGON (((-118.60442 33.47855, -118.5987...",27213650,0.035712
colorado,54028,8,1779779,0400000US08,8,CO,0,268422891711,1181621593,"POLYGON ((-109.06025 38.59933, -109.05954 38.7...",4235384,0.012756


___________
**Convert acc_df to a Geopandas Dataframe**
___________


In [40]:
# Remove the weather and daylight columns, which are not used in the plots.
weather_and_daylight_cols = [
    'temperature(f)', 'wind_chill(f)', 'humidity(%)', 'pressure(in)', 'visibility(mi)',
    'wind_direction', 'wind_speed(mph)', 'precipitation(in)', 'weather_condition',
    'sunrise_sunset', 'civil_twilight', 'nautical_twilight', 'astronomical_twilight',
]
acc_df = acc_df.drop(columns=weather_and_daylight_cols)

In [41]:
# Convert acc_df to a GeoPandas dataframe (geopandas is imported as gpd).
# gpd.GeoDataFrame wraps the dataframe together with a geometry column. acc_df has no
# geometry column yet, so gpd.points_from_xy() builds one from the coordinate columns.
# points_from_xy expects x = longitude and y = latitude; passing them the other way round
# transposes every point, so the arguments are given by keyword here.
# The geometry column is what the geospatial plots use.

acc_gdf = gpd.GeoDataFrame(
    acc_df, geometry = gpd.points_from_xy(x = acc_df['start_lng'], y = acc_df['start_lat']))

__________________
**Splitting Up acc_gdf into smaller portions**
__________________

In [42]:
# Split acc_gdf into 10 consecutive pieces of (at most) len_new rows each.
# len_new is the ceiling of len(acc_gdf) / 10. Rounding to the nearest integer can round
# down, in which case ten slices of that length stop short of the end of the dataframe and
# the trailing rows are silently lost. With the ceiling every row lands in exactly one
# piece; the last piece may simply be shorter than the others.
n_splits = 10
len_new = -(-len(acc_gdf) // n_splits)  # ceiling division without importing math

# iloc slices past the end of the frame are truncated, so no bounds check is needed.
(acc_gdf1, acc_gdf2, acc_gdf3, acc_gdf4, acc_gdf5,
 acc_gdf6, acc_gdf7, acc_gdf8, acc_gdf9, acc_gdf10) = [
    acc_gdf.iloc[(len_new * part):(len_new * (part + 1)), :]
    for part in range(n_splits)
]

In [44]:
# Write each split dataframe to its own CSV so it can be loaded into Kepler on the web.
split_frames = [acc_gdf1, acc_gdf2, acc_gdf3, acc_gdf4, acc_gdf5,
                acc_gdf6, acc_gdf7, acc_gdf8, acc_gdf9, acc_gdf10]
for part_number, part in enumerate(split_frames, start=1):
    part.to_csv(f'../data/acc_gdf{part_number}.csv')

_________
**Plotting**
__________

_____________________________
# Kepler.gl Plot
Created a basic national map with accident count, licensed drivers and percentage of accidents per capita (derived by dividing accident count by licensed drivers per state)


In [43]:
# Create the Kepler.gl map widget (400 px tall) that will hold the accident data.
map_acc = KeplerGl(height = 400)

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


In [None]:
# Add the split dataframes to Kepler with add_data().
# The dataframe was split (see above) into manageable pieces, and Kepler allows add_data()
# to be called as many times as necessary. Each piece needs its own dataset name: Kepler
# keys datasets by name, so re-using a single name would replace the previously added piece
# and leave only the last one on the map.

accident_parts = [acc_gdf1, acc_gdf2, acc_gdf3, acc_gdf4, acc_gdf5,
                  acc_gdf6, acc_gdf7, acc_gdf8, acc_gdf9, acc_gdf10]
for part_number, part in enumerate(accident_parts, start = 1):
    map_acc.add_data(data = part, name = f'Accidents {part_number}')

In [None]:
# Render the interactive Kepler.gl map in this cell's output.
# Datasets added to map_acc with add_data() (above) show up in this widget as they are added.
display(map_acc)

**Keppler plot of just the State Counts and Per Capita Counts**

In [None]:
# Wrap the combined per-state frame as a GeoDataFrame so it can be mapped.
ng_gdf = gpd.GeoDataFrame(national_group)

In [None]:
# Keep only the columns shown on the state-level map.
map_columns = ['count', 'geometry', 'total_drivers', 'acc_per_cap']
ng_gdf = ng_gdf[map_columns]

In [None]:
# Instantiate a second KeplerGl widget (400 px tall) for the state-level data.
map_1 = KeplerGl(height = 400)

In [None]:
# Add the state-level GeoDataFrame to map_1 under the dataset name 'NationalAccidents'.
map_1.add_data(data = ng_gdf, name = 'NationalAccidents')

In [None]:
# display(map_1)

### Other GeoSpatial Plots
https://geopandas.org/gallery/create_geopandas_from_pandas.html



In [None]:
# Static point plot adapted from the GeoPandas gallery example linked above.
# The pasted example referenced names that do not exist in this notebook (`geopandas` is
# imported as `gpd`, and there is no `df`) and plotted South America, so it could not run.
# It now uses the accident coordinates and the state outlines already loaded above.
gdf = gpd.GeoDataFrame(
    acc_df, geometry=gpd.points_from_xy(x=acc_df['start_lng'], y=acc_df['start_lat']))

# Draw the state outlines first so the accident points are layered on top of them.
ax = national_shape.plot(color='white', edgecolor='black', figsize=(20, 12))

# Small markers keep individual points distinguishable on a dataset of this size.
gdf.plot(ax=ax, color='red', markersize=1)

plt.show()


_______________
## Bar Plots of Accidents Per State, County and City

___________
**Accidents Per State**
____________

In [None]:
# Put accident counts and licensed-driver totals side by side, one row per state.
acc_per_state = pd.concat([state_count, license_df], axis='columns')
acc_per_state.head(n=1)

In [None]:
# Accidents per licensed driver in each state.
acc_per_state['acc_per_cap'] = acc_per_state['count'].div(acc_per_state['total_drivers'])
acc_per_state.head(n=1)

In [None]:
# Move the state index into a regular column, leaving a numeric index behind.
acc_per_state.reset_index(inplace=True)

______________
**Plot of Accidents by State**
______________

In [None]:
# Rank the states from most to fewest accidents.
acc_count = acc_per_state.sort_values(by='count', ascending=False)

# Bar labels, bar lengths and the vertical position of each bar.
bars = acc_count['state'].tolist()
x = acc_count['count'].tolist()
y_pos = np.arange(len(bars))

# Horizontal bar chart with black tick labels, axis labels and title.
plt.figure(figsize=(12, 9))
plt.barh(y_pos, x, color='red')
plt.yticks(y_pos, bars, color='black')
plt.xticks(color='black')
plt.title('Accidents by State', color='black')
plt.xlabel('Accident Count', color='black')
plt.ylabel('State', color='black')
plt.show()

**Analysis of Plot of Accidents by State**
__________________

* Not surprisingly, California, Texas and Florida rank highest in number of accidents per state. After all, they are the three largest states. **What was surprising, however, is North and South Carolina in the top five.** What factors contribute to such a high number of accidents in these two states?

* Looking at the bottom tier of states appears to confirm the relationship between population and number of accidents. Again, this is not surprising. 
____________________

______________
**Plot of Largest Per Capita Accidents Rates By State**
_______________________________

In [None]:
# Rank the states from highest to lowest per-capita accident rate.
acc_pcapita = acc_per_state.sort_values(by='acc_per_cap', ascending=False)

# Bar labels, bar lengths and the vertical position of each bar.
bars = acc_pcapita['state'].tolist()
x = acc_pcapita['acc_per_cap'].tolist()
y_pos = np.arange(len(bars))

# Horizontal bar chart with black tick labels, axis labels and title.
plt.figure(figsize=(12, 9))
plt.barh(y_pos, x, color='red')
plt.yticks(y_pos, bars, color='black')
plt.xticks(color='black')
plt.title('Per Capita Accident Rate by State', color='black')
plt.xlabel('Per Capita Accident Rate', color='black')
plt.ylabel('State', color='black')
plt.show()

**Analysis of Largest Per Capita Accidents By State Plot**
__________________
* Based on the chart of total number of accidents in each state we would expect to see South Carolina and North Carolina in the top 10. Interestingly, California remains in the top 3 on a per-capita basis. In light of this, the data appears to confirm the anecdotal evidence that California is a relatively challenging state to drive in.

* North Dakota and South Dakota remain at the bottom of the charts when accident totals are adjusted on a per-capita basis. **This highlights a potential relationship between population density and accidents.** 

* Possible exceptions to the population density relationship, when considered at the state level, are Oregon and Utah, which are both in the top ten on a per-capita basis and have a diverse mix of rural and urban populations. **It may be worth looking into these states further to identify per-capita proportions of accident totals on a rural county or city level.**
_____________________

____________________________
**Plot of Top 25 Accidents Per County**
_______________________

In [None]:
# Accident tally per county: count the non-null severity values in each county group and
# label the resulting column 'count'.
acc_per_county = (
    acc_df.groupby('county')[['severity']]
    .count()
    .rename(columns={'severity': 'count'})
)
acc_per_county.head(n=1)

In [None]:
# Select the 25 counties with the most accidents.
# The per-county frame built in the previous cell is named acc_per_county; the earlier
# reference to `acc_county_count` pointed at a name that is never defined in this notebook.
acc_county_count_25 = acc_per_county.nlargest(25, 'count')

# creating variables to house plot coordinates
x = list(acc_county_count_25['count'])
bars = list(acc_county_count_25.index)
y_pos = np.arange(len(bars))

# plot
plt.figure(figsize = (12,9))
plt.barh(y_pos, x, color = 'red')
plt.yticks(y_pos, bars, color = 'black')
plt.xticks(color = 'black')
plt.xlabel('Accident Count', color = 'black')
plt.ylabel('County', color = 'black')
plt.title('Per County Accident Count', color = 'black')
plt.show()

**Analysis of Plot of Accident Count Per County**
* Los Angeles County, a driver-centric metropolis, tops this list, almost tripling the accident count of Harris County, Texas. Harris County, the third largest county in the United States, is home to Houston and is a sprawling county growing at a 15% clip annually. These values are to be expected at the top of the chart. 

* I would like to do more analysis on population density and accident rates. Many of the top counties by accident count are not in the top 50 by population density. It almost seems that once population density passes a certain point, motor vehicle accidents start to decrease. This is likely because fewer people drive and more take mass transit. For instance, Los Angeles County is one of the largest counties by population in the United States but doesn't even break the top 50 counties by population density. Neither does number two, Harris County, and number three, Orange County, ranks 32nd. In fact, Cook County (ranked 19th in population density) is the only county in the top 25 by total accident count that cracks the top 20 by population density. 
* This lack of extreme dense population in the highest accident count counties suggests that larger-by-land-mass, sprawling cities that are more dependent on personal motor vehicles for travel are certainly more risky than densely populated metropolis' that have a good mix of public and private transportation options. However, further research is needed here. 
* This article published on PBS illustrates some of the reason higher-density areas have less crash rates. In a four-year long study at the University of Pennsylvania co-author of the study, Erick Guerra explains that "Fewer roads and slower traffic speeds in Philly explain some of the difference in crash rates. The region’s densest census tracts house 28% of the Delaware Valley’s population, but just 6% of all roadways and far fewer high-speed boulevards or highways. Across the five-county region, roads with average speeds of 45 miles-per-hour witnessed 10 times more deaths on average than roads with 25 miles-per-hour speed limits, the study found."
* Another study found that severity of crashes increases as areas become more rural. This is partly due to people wearing their seatbelts less in rural areas, higher speeds in which people travel in rural areas, increased per capital impaired driving rates and less proximity to trauma centers that can triage severe accidents. There was also some mention of people in rural areas tending to have lower wages and thus driving older cars with outdated or less existent safety features. 


**Sources**

List of most populous counties in the United States
* https://en.wikipedia.org/wiki/List_of_the_most_populous_counties_in_the_United_States

List of most densely populated counties in the United States. 

* https://en.wikipedia.org/wiki/County_statistics_of_the_United_States#Most_densely_populated

City driving often safer than the burbs
* https://whyy.org/articles/study-city-driving-often-safer-than-the-burbs/

Car crash death rates highest in remotest rural areas
* https://www.reuters.com/article/us-health-rural-autos-crash/car-crash-death-rates-highest-in-remotest-rural-areas-idUSKBN1CA2EW



___________
**Plot of Top 25 Accidents Per City**
____________

In [None]:
# Accident tally per city: count the non-null severity values in each city group and
# label the resulting column 'count'.
acc_per_city = (
    acc_df.groupby('city')[['severity']]
    .count()
    .rename(columns={'severity': 'count'})
)

In [None]:
# Select the 25 cities with the most accidents.
acc_city_count_25 = acc_per_city.nlargest(n=25, columns='count')

# Bar labels, bar lengths and the vertical position of each bar.
bars = acc_city_count_25.index.tolist()
x = acc_city_count_25['count'].tolist()
y_pos = np.arange(len(bars))

# Horizontal bar chart with black tick labels, axis labels and title.
plt.figure(figsize=(12, 9))
plt.barh(y_pos, x, color='red')
plt.yticks(y_pos, bars, color='black')
plt.xticks(color='black')
plt.title('Per City Accident Count', color='black')
plt.xlabel('Accident Count', color='black')
plt.ylabel('City', color='black')
plt.show()

____________
### Plot of Crossing and Severity
A crossing refers to any crossing across roads for pedestrians, cyclists, etc. 

In [None]:
# Number of records flagged as a crossing, and their percentage of all records.
total_crossing = acc_df['crossing'].sum()
crossing_share = total_crossing / len(acc_df) * 100
perc_crossing = round(crossing_share, 2)
print(f"There are {total_crossing} reported crossings in this dataset comprising {perc_crossing}% of the samples.")

In [None]:
# Encode the boolean crossing flag as 1 / 0.
flag_to_int = {True: 1, False: 0}
acc_df['crossing'] = acc_df['crossing'].map(flag_to_int)

In [None]:
# Severity distribution, split by whether the accident was flagged as at a crossing.
plt.figure(figsize=(8, 6))
sns.set(font_scale=2, palette=['orange', 'red'])
sns.countplot(x='severity', hue='crossing', data=acc_df)
plt.legend(['No Crossing', 'Crossing'])
plt.title('Crossing and Severity')
plt.xlabel('Severity')
plt.ylabel('Frequency')
plt.show()

**Analysis of Crossing and Severity Plot**
* The vast majority of all accidents that happened at a crossing had a severity of level 2. 
___________________

______
### Plot of Junction and Severity
A junction refers to any highway ramp, exit or entrance. 

In [None]:
# Number of records flagged as a junction, and their percentage of all records.
total_junction = acc_df['junction'].sum()
junction_share = total_junction / len(acc_df) * 100
perc_junction = round(junction_share, 2)
print(f"There are {total_junction} reported junctions in this dataset comprising {perc_junction}% of the samples.")

In [None]:
# Encode the boolean junction flag as 1 / 0.
flag_to_int = {True: 1, False: 0}
acc_df['junction'] = acc_df['junction'].map(flag_to_int)

In [None]:
# Severity distribution, split by whether the accident was flagged as at a junction.
plt.figure(figsize=(8, 6))
sns.set(font_scale=2, palette=['orange', 'red'])
sns.countplot(x='severity', hue='junction', data=acc_df)
plt.legend(['No Junction', 'Junction'])
plt.title('Junction and Severity')
plt.xlabel('Severity')
plt.ylabel('Frequency')
plt.show()

**Analysis of Junction and Severity**
* Of all the categorical features plotted to this point junction comprises the largest percentage of values accounting for around 8% of the dataset. It appears that the majority of accidents that happen near a highway ramp, exit or entrance have a severity of level 2 and many have a severity of level 3. Very few, if any have level 1 or 4. 

______
### Plot of Station and Severity
Refers to a public transportation station (bus, metro, etc.)

In [None]:
# Number of records flagged as near a station, and their percentage of all records.
total_station = acc_df['station'].sum()
station_share = total_station / len(acc_df) * 100
perc_station = round(station_share, 2)
print(f"There are {total_station} reported stations in this dataset comprising {perc_station}% of the samples.")

In [None]:
# Encode the boolean station flag as 1 / 0.
flag_to_int = {True: 1, False: 0}
acc_df['station'] = acc_df['station'].map(flag_to_int)

In [None]:
# Severity distribution, split by whether the accident was flagged as near a station.
plt.figure(figsize=(8, 6))
sns.set(font_scale=2, palette=['orange', 'red'])
sns.countplot(x='severity', hue='station', data=acc_df)
plt.legend(['No Station', 'Station'])
plt.title('Station and Severity')
plt.xlabel('Severity')
plt.ylabel('Frequency')
plt.show()

**Analysis of Plot of Station and Severity**
* It appears that mostly all of the accidents that happen near a public transportation station have a severity level of 2. 
________

___________
### Plot of Stop and Severity
Refers to a stop sign at scene of accident. 

In [None]:
# Number of records flagged with a stop sign, and their percentage of all records.
total_stop = acc_df['stop'].sum()
stop_share = total_stop / len(acc_df) * 100
perc_stop = round(stop_share, 2)
print(f"There are {total_stop} reported stop signs in this dataset comprising {perc_stop}% of the samples.")

In [None]:
# Encode the boolean stop-sign flag as 1 / 0.
flag_to_int = {True: 1, False: 0}
acc_df['stop'] = acc_df['stop'].map(flag_to_int)

In [None]:
# Severity distribution, split by whether the accident was flagged with a stop sign.
plt.figure(figsize=(8,6))
sns.set(font_scale=2, palette= ['orange', 'red'])
sns.countplot(data = acc_df, x = 'severity', hue = 'stop')
# Title corrected: this chart compares stop signs with severity, not stations.
plt.title('Stop Sign and Severity')
plt.legend(['No Stop Sign', 'Stop Sign'])
plt.ylabel('Frequency')
plt.xlabel('Severity')
plt.show();

**Analysis of Stop Sign and Severity**
* It appears that nearly all accidents that happen at a stop sign have a severity of level 2. I feel the amount of incidents reported at stop signs seems low. This may have to do with the integrity of the recording process but it may also be right. No way to know for sure at this point. 
________

___________
### Plot of Traffic Signal and Severity
Refers to traffic signal on intersections present at accident site. 

In [None]:
# Number of records flagged with a traffic signal, and their percentage of all records.
total_signal = acc_df['traffic_signal'].sum()
signal_share = total_signal / len(acc_df) * 100
perc_signal = round(signal_share, 2)
print(f"There are {total_signal} reported traffic signals in this dataset comprising {perc_signal}% of the samples.")

In [None]:
# Encode the boolean traffic-signal flag as 1 / 0.
# The column is named 'traffic_signal' (it is the one summed and plotted in the neighbouring
# cells); there is no 'signal' column, so indexing by that name raised a KeyError.
acc_df['traffic_signal'] = acc_df['traffic_signal'].map({True: 1, False: 0})

In [None]:
# Severity distribution, split by whether the accident was flagged with a traffic signal.
plt.figure(figsize=(8,6))
sns.set(font_scale=2, palette= ['orange', 'red'])
sns.countplot(data = acc_df, x = 'severity', hue = 'traffic_signal')
# Title corrected: this chart compares traffic signals with severity, not stations.
plt.title('Traffic Signal and Severity')
plt.legend(['No Traffic Signal', 'Traffic Signal'])
plt.ylabel('Frequency')
plt.xlabel('Severity')
plt.show();

**Analysis of Plot of Traffic Signal and Severity**

Traffic signal has been the largest of the categorical feature variables analyzed against severity to this point. The presence of a traffic signal may reduce the severity of an accident slightly as you can see the proportion of level 2 and level 3 accidents that do not have a traffic signal seems to be smaller than the proportion of level 2 over level 3 accidents that occur when a traffic signal is present. This signifies a benefit to having traffic signals present at an intersection. 

___________
### Categorical Features that do not offer enough data to analyze


**Give-Way Feature**
Give-way refers to a sign on road which shows priority of passing.  


In [None]:
# Number of records flagged with a give-way sign, and their percentage of all records.
total_giveway = acc_df['give_way'].sum()
giveway_share = total_giveway / len(acc_df) * 100
perc_giveway = round(giveway_share, 2)
print(f"There are {total_giveway} reported give-ways in this dataset comprising {perc_giveway}% of the samples.")

**Amenity Feature**
In the context of this data, an amenity refers to a particular place such as a restaurant, library, college, bar, etc. where an accident occurred.  

In [None]:
# Number of records flagged as near an amenity, and their percentage of all records.
total_amenity = acc_df['amenity'].sum()
perc_amenity = round((total_amenity / len(acc_df) * 100), 2)
# The message now reads perc_amenity (the variable defined above); the previous spelling
# `perc_ammenity` was never defined and raised a NameError. "amenities" is also spelled
# correctly in the printed text.
print(f"There are {total_amenity} reported amenities in this dataset comprising {perc_amenity}% of the samples.")

**Bump Feature**
Refers to a speed bump or hump used to reduce speed. 

In [None]:
# Number of records flagged with a speed bump, and their percentage of all records.
total_bump = acc_df['bump'].sum()
bump_share = total_bump / len(acc_df) * 100
perc_bump = round(bump_share, 2)
print(f"There are {total_bump} reported bumps in this dataset comprising {perc_bump}% of the samples.")

**No-exit Feature**
No-exit indicates there is no possibility to travel further by any transport mode along a formal path or route. 

In [None]:
# Number of records flagged as no-exit, and their percentage of all records.
total_noexit = acc_df['no_exit'].sum()
noexit_share = total_noexit / len(acc_df) * 100
perc_noexit = round(noexit_share, 2)
print(f"There are {total_noexit} reported no-exits in this dataset comprising {perc_noexit}% of the samples.")

**Railway Feature**
Indicates the presence of railways near the accident. 

In [None]:
# Number of records flagged as near a railway, and their percentage of all records.
total_rail = acc_df['railway'].sum()
rail_share = total_rail / len(acc_df) * 100
perc_rail = round(rail_share, 2)
print(f"There are {total_rail} reported railways in this dataset comprising {perc_rail}% of the samples.")

**Give-Way Feature**
Give-way is a sign on road which shows priority of passing. 


In [None]:
# Number of records flagged with a give-way sign, and their percentage of all records.
# NOTE(review): this repeats the give-way cell earlier in this section.
total_giveway = acc_df['give_way'].sum()
giveway_share = total_giveway / len(acc_df) * 100
perc_giveway = round(giveway_share, 2)
print(f"There are {total_giveway} reported give-ways in this dataset comprising {perc_giveway}% of the samples.")

**Roundabout Feature**
Refers to a circular road junction.

In [None]:
# Number of records flagged as at a roundabout, and their percentage of all records.
total_rbout = acc_df['roundabout'].sum()
rbout_share = total_rbout / len(acc_df) * 100
perc_rbout = round(rbout_share, 2)
print(f"There are {total_rbout} reported roundabout in this dataset comprising {perc_rbout}% of the samples.")

**Traffic Calming Feature** Refers to any means of slowing down traffic speed. This is an interesting feature since bump is also a feature and a bump would fall under this category. It may be more efficient to combine these two features. 

In [None]:
# Number of records flagged with traffic calming, and their percentage of all records.
total_calm = acc_df['traffic_calming'].sum()
calm_share = total_calm / len(acc_df) * 100
perc_calm = round(calm_share, 2)
print(f"There are {total_calm} reported traffic calming mechanisms in this dataset comprising {perc_calm}% of the samples.")

**Turning Loop Feature**
Indicates a widened area of a highway with a non-traversable island for turning around. 

In [None]:
# Number of records flagged with a turning loop, and their percentage of all records.
total_loop = acc_df['turning_loop'].sum()
loop_share = total_loop / len(acc_df) * 100
perc_loop = round(loop_share, 2)
print(f"There are {total_loop} reported turning loops in this dataset comprising {perc_loop}% of the samples.")

____________________

### Start_Time and Severity

In [None]:
# Build a GeoDataFrame from acc_df. The longitude (x) and latitude (y) columns are turned
# into a geometry column of shapely point objects.
# https://gis.stackexchange.com/questions/174159/converting-pandas-dataframe-to-geodataframe/258376#258376
accident_points = gpd.points_from_xy(x=acc_df['start_lng'], y=acc_df['start_lat'])
gdf = gpd.GeoDataFrame(acc_df, geometry=accident_points)

In [None]:
# Create a Kepler.gl map widget (400 px tall) for the full accident GeoDataFrame.
map_2 = KeplerGl(height = 400)

In [None]:
# Add the accident GeoDataFrame to map_2 under the dataset name 'Accidents'.
map_2.add_data(data = gdf, name = 'Accidents')

In [None]:
# Render map_2 in this cell's output.
display(map_2)

In [None]:
# Keep only the start coordinates of each accident.
coordinate_cols = ['start_lat', 'start_lng']
location_df = acc_df[coordinate_cols]

In [None]:
# Build a coordinates-only GeoDataFrame with a point geometry (x = longitude, y = latitude).
location_points = gpd.points_from_xy(x=acc_df['start_lng'], y=acc_df['start_lat'])
loc_gdf = gpd.GeoDataFrame(location_df, geometry=location_points)

In [None]:
# Create a Kepler.gl map widget (400 px tall) for the accident locations.
map_3 = KeplerGl(height = 400)

In [None]:
# Add the coordinates-only GeoDataFrame built above (loc_gdf) to map_3.
# Passing the full `gdf` here made map_3 a duplicate of map_2 and left loc_gdf unused.
map_3.add_data(data = loc_gdf, name = "Accident Location")

In [None]:
# Render map_3 in this cell's output.
display(map_3)

In [None]:
# Static plot of the state-level GeoDataFrame; the trailing semicolon hides the Axes repr.
ng_gdf.plot(figsize = (20,20));

In [None]:
# Shade the state outlines and overlay the accident points on the same axes.
# Fixes relative to the previous version of this cell:
#   * the stray leading comma in the plot() call was a syntax error;
#   * national_shape has no 'id' column, so 'geoid' (its identifier column) is used for shading;
#   * the points come from the GeoDataFrame `gdf` built earlier in this section and are drawn
#     on the same axes, instead of calling the plain-pandas acc_df.plot() in a separate figure.
ax = national_shape.plot(column='geoid', cmap='Oranges', figsize=(40, 80))
gdf.plot(ax=ax, color='red', markersize=1)
plt.show()

In [None]:
# Scatter of accident start coordinates; the very low alpha lets dense areas stand out.
# `us_acc` is never defined in this notebook and the headers were lowercased near the top,
# so the plot uses acc_df with the lowercase column names.
acc_df.plot(kind = "scatter", x = "start_lng", y = "start_lat", alpha = 0.009)

In [None]:
# Scatter of accident start coordinates, coloured by severity with the 'jet' colormap.
# `us_acc` is never defined in this notebook and the headers were lowercased near the top,
# so the plot uses acc_df with the lowercase column names. The plt.figure() call that used
# to follow the plot only opened a second, empty figure and has been removed; the figure
# size is set through `figsize` on the plot call itself.
acc_df.plot(kind = "scatter", x = "start_lng", y = "start_lat", alpha = 0.009, c = "severity",
            cmap = plt.get_cmap("jet"), colorbar = False, figsize = (15, 7))
plt.show()