In [None]:
# Dependencies and Setup
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import scipy.stats as st
import os
from config import destination_zip, col_types

In [None]:
# Load the accident records directly from the zip archive (pandas reads the CSV
# stored inside it). Only the columns named in col_types are loaded, each with
# its declared dtype, and Start_Time is parsed into datetimes.
accidents_df = pd.read_csv(
    destination_zip,
    dtype=col_types,
    usecols=col_types.keys(),
    parse_dates=['Start_Time'],
    infer_datetime_format=True,
)

accidents_df.head()

In [None]:
# Chris's code starts here

In [None]:
# Count accidents per day of week (0 = Monday ... 6 = Sunday) and list the
# counts in calendar order; value_counts() on its own orders by frequency.
accidents_df["Start_Time"].dt.dayofweek.value_counts().sort_index()

In [None]:
# Build a one-column DataFrame holding the number of accidents for each day of the week.
weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# dt.dayofweek numbers the days 0 (Monday) through 6 (Sunday). Reindexing over
# all seven numbers keeps the rows in calendar order and fills a day that has
# no accidents with 0, so attaching the seven names below can neither fail on a
# length mismatch nor put a name on the wrong day.
weekday_counts = (
    accidents_df['Start_Time'].dt.dayofweek
    .value_counts()
    .reindex(range(len(weekday_names)), fill_value=0)
)

week_days_df = weekday_counts.to_frame()
week_days_df.index = weekday_names
week_days_df

In [None]:
# Bar chart of the per-weekday accident totals held in week_days_df
plotdata = week_days_df.plot(
    kind="bar",
    rot="vertical",
    title="Number of Accidents by Day of Week",
    legend=False,
)
plotdata.set_ylabel("Number of Accidents")

In [None]:
# Build a long-format table with one row per (severity, weekday) pair and the
# number of accidents recorded for that pair.
daysofweek = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
severity_values = [1, 2, 3, 4]

# Lay out every combination: the weekday list is repeated once per severity
# level, and each severity level is repeated once per weekday.
days = daysofweek * len(severity_values)
severity = np.repeat(severity_values, len(daysofweek))
severity_df = pd.DataFrame(zip(severity, days), columns=['Severity', 'Days'])

# Neither of these series depends on the combination being counted, so they are
# derived once here rather than once per combination inside the loop.
accident_weekday = accidents_df['Start_Time'].dt.dayofweek
accident_severity = accidents_df['Severity']

# Count the non-null IDs matching each combination. A weekday name's position
# in daysofweek is the day number it is compared against (Monday -> 0).
vals = [
    accidents_df.loc[
        (accident_severity == level) & (accident_weekday == daysofweek.index(day)),
        'ID',
    ].count()
    for level, day in zip(severity, days)
]
severity_df.insert(2, 'Value', vals)
severity_df

In [None]:
# Summary table: one row per weekday (in calendar order), one column per severity level
pivot_df = (
    severity_df
    .pivot(index='Days', columns='Severity', values='Value')
    .loc[daysofweek]
)
pivot_df

In [None]:
# Stacked bars: one bar per weekday, split into the four severity levels
severities = [1, 2, 3, 4]
pivot_df[severities].plot(kind="bar", stacked=True, figsize=(10, 7))
plt.title("Severity of Crashes by Day of Week")
plt.ylabel("Number of Crashes")

In [None]:
# Chris's code ends here

In [None]:
# Ken's code starts here

# Grouping data into seasons
# Each accident is assigned a meteorological season (northern hemisphere)
# according to the calendar month of its start time:
#   Spring: March 1 - May 31
#   Summer: June 1 - August 31
#   Fall (Autumn): September 1 - November 30
#   Winter: December 1 - end of February


month_seasons = {1: 'winter',
                 2: 'winter',
                 3: 'spring',
                 4: 'spring',
                 5: 'spring',
                 6: 'summer',
                 7: 'summer',
                 8: 'summer',
                 9: 'fall',
                 10: 'fall',
                 11: 'fall',
                 12: 'winter'}

# Start_Time is already a datetime column (the weekday cells above use its .dt
# accessor), so the month is read directly without re-parsing.
accidents_df['seasons'] = accidents_df['Start_Time'].dt.month.map(month_seasons)

# A short preview is enough to confirm the new column; no need to request 50,000 rows.
accidents_df.tail()

In [None]:
# Number of accidents per season, drawn in a fixed season order

# Display order; these labels must match the values in the 'seasons' column exactly
season_order = ['fall', 'winter', 'spring', 'summer']

# Count accident IDs per season, then arrange the rows in season_order
season_counts = (
    accidents_df.groupby('seasons')['ID']
    .count()
    .loc[season_order]
)

# The chart shows accident counts, not severity, so it gets a single title that
# says so (a second title call used to relabel it "Severity Of Accidents By
# Season"). The tick rotation is set once, to its final value of 45 degrees.
season_group = season_counts.plot.bar(rot=45, legend=False)
season_group.set_title("Number of Accidents by Season", fontsize=14, fontweight="bold")
season_group.set_xlabel("Seasons", fontweight="bold")
season_group.set_ylabel("Number of Accidents", fontweight="bold")

season_group


In [None]:
# Summary table: number of accidents for each severity level (rows) in each
# season (columns). Indexing by 'ID' produced one row per accident, which
# restates the raw data instead of summarising it.
# NOTE(review): Severity x season is assumed to be the intended summary - confirm.
Analysis_Pivot_Table = pd.pivot_table(
    accidents_df,
    index='Severity',
    columns='seasons',
    values='ID',
    aggfunc='count',
)

Analysis_Pivot_Table

In [None]:
# Ken's code starts here
# Grouping data into seasons
# Accidents are labelled with the meteorological season (northern hemisphere)
# of their start month. This repeats the mapping built in the earlier seasons
# cell; re-running it is harmless because it yields the same column.
#   Spring: March 1 - May 31
#   Summer: June 1 - August 31
#   Fall (Autumn): September 1 - November 30
#   Winter: December 1 - end of February


month_seasons = {1: 'winter',
                 2: 'winter',
                 3: 'spring',
                 4: 'spring',
                 5: 'spring',
                 6: 'summer',
                 7: 'summer',
                 8: 'summer',
                 9: 'fall',
                 10: 'fall',
                 11: 'fall',
                 12: 'winter'}

# Start_Time is already a datetime column (its .dt accessor is used from the
# weekday cells onward), so the month is read directly without re-parsing.
accidents_df['seasons'] = accidents_df['Start_Time'].dt.month.map(month_seasons)

# A short preview is enough to confirm the column; no need to request 50,000 rows.
accidents_df.tail()

In [None]:
# Bar chart of the number of accidents recorded in each season.
# The bars appear in the order the groupby returns the season labels.
accidents_by_season = accidents_df.groupby(['seasons'])['ID'].count()

season_group = accidents_by_season.plot(
    kind="bar",
    rot="vertical",
    title="Number of Accidents by season",
    legend=False,
)
season_group.set_ylabel("Number of Accidents")





In [None]:
# Tabulate accidents by season and severity as a flat DataFrame with the
# columns 'seasons', 'Severity' and 'Accidents', sorted by season then severity.
#
# reset_index(name=...) names the count column explicitly. Renaming a column
# called 'Severity' only worked while value_counts() named its result after the
# counted column; where the result is named differently, the rename did nothing
# and the later pivot on 'Accidents' failed with a KeyError.
seasons = (
    accidents_df.groupby(['seasons'])['Severity']
    .value_counts()
    .sort_index()
    .reset_index(name='Accidents')
)

seasons


In [None]:
# Reshape the flat table into a season (rows) x severity (columns) grid of accident counts
season_pivot = seasons.pivot(
    values='Accidents',
    columns='Severity',
    index='seasons',
)
season_pivot

In [None]:
# Grouped (side-by-side) bars: one cluster per season, one bar per severity level

season_pivot.plot.bar()

# Labels and legend
plt.ylabel("Number of Accidents", fontweight="bold")
plt.xlabel("Seasons", fontweight="bold")
plt.title("Severity Of Accidents By Season", fontsize=14, fontweight="bold")
plt.legend(loc='best')

# Tilt the season labels
plt.xticks(rotation=45)

In [None]:
# Stacked bars: one bar per season, split into the severity levels
season_pivot.plot.bar(stacked=True)

# Labels and legend
plt.ylabel("Number of Accidents", fontweight="bold")
plt.xlabel("Seasons", fontweight="bold")
plt.title("Severity Of Accidents By Season", fontsize=14, fontweight="bold")

plt.legend(loc='best')

# Tilt the season labels
plt.xticks(rotation=45)

# Ken's code ends here

# Colleen's code starts here

In [None]:
# TIME QUESTION 1: Does the time of day (early morning, rush hour, late night) affect NUMBER of accidents?

In [None]:
# Extract the hour component of every Start_Time and store it in a new 'Hour' column
start_hours = accidents_df['Start_Time'].dt.hour
accidents_df['Hour'] = start_hours
accidents_df.head()

In [None]:
# One-column table of accident counts, indexed by hour of the day in ascending order
accidents_per_hour = (
    accidents_df['Hour']
    .value_counts()
    .sort_index()
    .to_frame()
)
accidents_per_hour

In [None]:
# TIME QUESTION 1 - VISUALIZATION:
# Bar chart of total accidents for each one-hour slot of the day.
#     Which hours see the most accidents?
#     How does this help identify time periods such as AM/PM rush hours?

plotdata1 = accidents_per_hour.plot(kind="bar", rot="vertical", legend=False)

# Background, axis labels and title
plotdata1.set_facecolor('ivory')
plotdata1.set_ylabel("# of Accidents", fontweight="bold")
plotdata1.set_xlabel("Hour of the Day (military time)", fontweight="bold")
plotdata1.set_title("Total Accidents per Hour", fontsize=14, fontweight="bold")

# Tilt the hour labels and add dashed horizontal guide lines
plt.xticks(rotation=70)
plt.grid(color='#95a5a6', linestyle='--', linewidth=1, axis='y', alpha=0.7)

In [None]:
# TIME QUESTION 2: Does the time of day (early morning, rush hour, late night) affect SEVERITY of accidents?

In [None]:
# Group accidents into time periods of the day.
# The periods follow from the hourly analysis above:
#   Early Morning: 0,1,2,3,4,5
#   AM Rush Hour:  6,7,8,9
#   Mid Day:       10,11,12,13,14
#   PM Rush Hour:  15,16,17,18
#   Late Evening:  19,20,21,22,23

# Bin edges; each period starts at its left edge and stops just before the next one
bins = [0,6,10,15,19,24]

# Names of the time period bins, in the same order as the edges
time_period = ["Early Morning", "AM Rush Hour", "Mid Day", "PM Rush Hour", "Late Evening"]

# Categorize each hour with pd.cut(). right=False makes the bins left-closed,
# i.e. [0,6), [6,10), [10,15), [15,19), [19,24), which is exactly the list
# above. With pd.cut's default right-closed bins, hour 0 fell outside every bin
# (NaN) and hours 6, 10, 15 and 19 were assigned to the preceding period.
accidents_df["Time Period"] = pd.cut(accidents_df["Hour"], bins, right=False, labels=time_period)

# Preview the new column
accidents_df.head()

In [None]:
# Sense check: number of accidents recorded at each severity level.
# Note that this rebinds accidents_per_hour, which now holds severity counts
# rather than hourly counts.
accidents_per_hour = accidents_df['Severity'].value_counts().to_frame()
accidents_per_hour

In [None]:
# Column total of the table above
accidents_per_hour.sum(axis=0)

In [None]:
# TIME QUESTION 2 - VISUALIZATION:
# Stacked bar chart of accident severity within each time period
#   (in other words: during peak accident times, i.e. rush hours, when are the most severe accidents likely to occur?)

# Tabulate accidents by time period and severity as a flat DataFrame with the
# columns 'Time Period', 'Severity' and 'Accidents', sorted by period then severity.
# reset_index(name=...) names the count column explicitly; renaming a column
# called 'Severity' only worked while value_counts() named its result after the
# counted column, and otherwise left the pivot below without an 'Accidents' column.
# Note: this rebinds time_period, which previously held the list of period labels.
time_period = (
    accidents_df.groupby(['Time Period'])['Severity']
    .value_counts()
    .sort_index()
    .reset_index(name='Accidents')
)

# Reshape into a time period (rows) x severity (columns) grid
time_period_pivot = time_period.pivot(index='Time Period', columns='Severity', values='Accidents')
severities = [1, 2, 3, 4]

# Draw one stacked bar per time period
time_period_pivot.loc[:,severities].plot.bar(stacked=True, figsize=(10,7))

# Formatting. The x axis carries the time period names, so it is labelled as
# such rather than as an hour of the day.
plt.title("Accident Severity by Time Period", fontsize=14, fontweight="bold")
plt.xlabel("Time Period", fontweight="bold")
plt.ylabel("# of Accidents by Severity", fontweight="bold")

# Tilt the period labels and add dashed horizontal guide lines
plt.xticks(rotation=70)
plt.grid(color='#95a5a6', linestyle='--', linewidth=1, axis='y', alpha=0.7)


# Colleen's code ends here

In [None]:
# Eric's code starts here

# Hypothesis Testing
***
__Our Hypothesis:__
> The number of accidents differs by season.

__Null Hypothesis:__
> Accidents are distributed equally across the seasons.

***
We start by getting the counts of accidents in each season

In [None]:
# Build one sample per season for the ANOVA cells below: the number of
# accidents recorded on each calendar day that falls in that season.
#
# DataFrame.count() on the season's rows returned one non-null count per
# *column*, so the samples previously handed to the ANOVA compared how fully
# each column is populated rather than how many accidents occurred.
# NOTE(review): the p-values quoted in the markdown cells below were produced
# by that per-column computation and need to be refreshed after re-running.
# NOTE(review): days with no recorded accident do not appear in a sample.
accident_day = accidents_df['Start_Time'].dt.normalize().rename('day')
daily_counts = accidents_df.groupby(['seasons', accident_day])['ID'].count()

fall = {'Fall': daily_counts.loc['fall']}
winter = {'Winter': daily_counts.loc['winter']}
spring = {'Spring': daily_counts.loc['spring']}
summer = {'Summer': daily_counts.loc['summer']}
all_seasons = [fall, winter, spring, summer]

## ANOVA Test
***
The ANOVA test is used as a starting point simply to see whether there are any statistically significant <br>differences between the seasons. If the $p$-value is greater than $0.05$, we can stop looking at seasonal differences.

In [None]:
# One-way ANOVA across the four seasonal samples; only the p-value is kept
anova_result = st.f_oneway(fall['Fall'], winter['Winter'], spring['Spring'], summer['Summer'])
four_sesaon_anova = anova_result.pvalue
print(f"ANOVA p-value for all four seasons: {four_sesaon_anova}")

## Further Testing
***
The $p$-value for comparing all four seasons is $1.37 \times 10^{-129}$. <br>Clearly, this is far less than $0.05$, indicating a need for further testing.
***

In [None]:
p_values = []

# Compare every unordered pair of seasons exactly once. The one-way ANOVA
# p-value does not depend on the order of its samples, so testing both (A, B)
# and (B, A) only repeated each result; the largest p-value is unaffected.
# Each entry of all_seasons is a one-item dict {season name: sample}.
for position, season in enumerate(all_seasons):
    season_name = next(iter(season))
    for otherseason in all_seasons[position + 1:]:
        otherseason_name = next(iter(otherseason))
        # run the ANOVA test on the two seasons' samples and keep the p-value
        p_values.append(st.f_oneway(season[season_name], otherseason[otherseason_name])[1])

        print(f"""ANOVA test between {season_name} and {otherseason_name}
        P Value: {p_values[-1]}""")
print(f"\nThe largest p-value between seasons is {max(p_values)}")

## Conclusion
***
The largest $p$-value from any combination of seasons is $4.60 \times 10^{-48}$, which is still far smaller than $0.05$. <br> Based on these results, we can confidently reject the null hypothesis that accidents are distributed equally across the seasons.

In [None]:
# Eric's code ends here