In [None]:
# Dependencies and Setup
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import scipy.stats as st
import os
from config import destination_zip, col_types

In [None]:
# Load the accident records directly from the zip archive (pandas reads the
# CSV contained in it). Only the columns listed in col_types are loaded, with
# the dtypes given there, and Start_Time is parsed into datetimes.
read_options = dict(
    usecols=col_types.keys(),
    dtype=col_types,
    parse_dates=['Start_Time'],
    infer_datetime_format=True,
)
accidents_df = pd.read_csv(destination_zip, **read_options)

accidents_df.head()

In [None]:
# Chris's code starts here

In [None]:
# Tally accidents per weekday number of Start_Time (0 = Monday ... 6 = Sunday);
# value_counts lists the most frequent weekday first.
start_weekday_numbers = accidents_df["Start_Time"].dt.dayofweek
start_weekday_numbers.value_counts()

In [None]:
#Create DataFrame to store number of accidents per day of week
weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday_counts = accidents_df['Start_Time'].dt.dayofweek.value_counts()

# Align on the weekday numbers 0..6 before attaching the names. Relabelling the
# sorted counts purely by position breaks when a weekday has no accidents
# (fewer than seven rows); reindexing gives such a day an explicit count of 0.
week_days_df = pd.DataFrame(weekday_counts.reindex(range(len(weekday_names)), fill_value=0))
week_days_df.index = pd.Index(weekday_names)
week_days_df

In [None]:
# Draw the per-weekday accident totals as a bar chart and label the y axis
plotdata = week_days_df.plot(
    kind="bar",
    rot="vertical",
    title="Number of Accidents by Day of Week",
    legend=False,
)
plotdata.set_ylabel("Number of Accidents")

In [None]:
#Make a DataFrame to track the severity of crashes by day of week
daysofweek = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
severity_values = [1, 2, 3, 4]

# One row per (severity, weekday) pair, severity-major: all seven days for
# severity 1, then all seven for severity 2, and so on.
days = daysofweek * len(severity_values)
severity = np.repeat(severity_values, len(daysofweek))
severity_df = pd.DataFrame(zip(severity, days), columns=['Severity', 'Days'])

# Count every (severity, weekday number) pair in a single grouped pass instead
# of re-filtering the whole frame (and re-deriving the weekday) once per pair.
# Counting the 'ID' column keeps the original rule: only rows with a non-null
# ID are counted.
start_weekday = accidents_df['Start_Time'].dt.dayofweek
pair_counts = (
    accidents_df
    .groupby([accidents_df['Severity'], start_weekday])['ID']
    .count()
    .to_dict()
)

# Pairs that never occur in the data are absent from the grouped result, so
# they default to 0 here.
vals = [
    pair_counts.get((sev, daysofweek.index(day)), 0)
    for sev, day in zip(severity, days)
]
severity_df.insert(2, 'Value', vals)
severity_df

In [None]:
# Summary table: one row per weekday (Monday first), one column per severity
# level, cells holding the accident counts.
pivot_df = (
    severity_df
    .pivot(index='Days', columns='Severity', values='Value')
    .loc[daysofweek]
)
pivot_df

In [None]:
# Stacked bar chart: one bar per weekday, segmented by severity level
severities = [1, 2, 3, 4]
severity_axes = pivot_df.loc[:, severities].plot.bar(stacked=True, figsize=(10,7))
severity_axes.set_title("Severity of Crashes by Day of Week")
severity_axes.set_ylabel("Number of Crashes")
In [None]:
# Chris's code ends here

In [None]:
# Colleen's code starts here

In [None]:
# Colleen's code ends here

In [None]:
# Ken's code starts here
# Grouping data into seasons
# Each accident is labelled with the meteorological season (northern
# hemisphere) of the calendar month in which it started:
#   Spring: March 1 - May 31
#   Summer: June 1 - August 31
#   Fall (autumn): September 1 - November 30
#   Winter: December 1 - end of February (Feb 29 included in leap years)


month_seasons = {1: 'winter',
                 2: 'winter',
                 3: 'spring',
                 4: 'spring',
                 5: 'spring',
                 6: 'summer',
                 7: 'summer',
                 8: 'summer',
                 9: 'fall',
                 10: 'fall',
                 11: 'fall',
                 12: 'winter'}

# Start_Time is parsed to datetime when the CSV is loaded (earlier cells already
# use its .dt accessor), so no second pd.to_datetime conversion is needed.
accidents_df['seasons'] = accidents_df['Start_Time'].dt.month.map(month_seasons)

# Preview only the last few rows to confirm the new column; asking for the
# last 50,000 rows adds nothing to the check.
accidents_df.tail()

In [None]:
# Count the accidents (non-null IDs) recorded in each season and show the
# totals as a bar chart.
accidents_per_season = accidents_df.groupby(['seasons'])['ID'].count()

season_group = accidents_per_season.plot.bar(
    rot="vertical",
    title="Number of Accidents by season",
    legend=False,
)
season_group.set_ylabel("Number of Accidents")





In [None]:
# Ken's code ends here

In [None]:
# Eric's code starts here

# Hypothesis Testing
***
__Our Hypothesis:__
> The number of accidents differs by season.

__Null Hypothesis:__
> Accidents are distributed equally across seasons.

***
We start by getting the counts of accidents in each season

In [None]:
# Build one sample per season for the ANOVA tests that follow.
#
# DataFrame.count() returns the non-null count of every *column*, so the
# previous samples measured how complete each column was within a season, not
# how often accidents happened. Each sample is now the number of accidents
# (non-null IDs) recorded on each calendar day of that season, so the tests
# compare daily accident frequency between seasons.
# NOTE(review): days with no recorded accidents do not appear in these samples.
# NOTE(review): the p-values quoted in the notebook's markdown came from the
# old inputs and must be re-checked after re-running.
accident_day = accidents_df['Start_Time'].dt.normalize().rename('Date')
daily_counts = accidents_df.groupby([accidents_df['seasons'], accident_day])['ID'].count()
season_of_day = daily_counts.index.get_level_values('seasons')

fall = {'Fall': daily_counts[season_of_day == 'fall']}
winter = {'Winter': daily_counts[season_of_day == 'winter']}
spring = {'Spring': daily_counts[season_of_day == 'spring']}
summer = {'Summer': daily_counts[season_of_day == 'summer']}
all_seasons = [fall, winter, spring, summer]

## ANOVA Test
***
The ANOVA test is used as a starting point simply to see whether there are any statistically significant <br>differences between the seasons. If the $p$-value is greater than $0.05$, we can stop looking at seasonal differences.

In [None]:
# One-way ANOVA across the four seasonal samples; only the p-value is kept
seasonal_samples = [fall['Fall'], winter['Winter'], spring['Spring'], summer['Summer']]
four_sesaon_anova = st.f_oneway(*seasonal_samples).pvalue
print(f"ANOVA p-value for all four seasons: {four_sesaon_anova}")

## Further Testing
***
The $p$-value for comparing all four seasons is $1.37 \times 10^{-129}$. <br>Clearly, this is far less than $0.05$, indicating a need for further testing.
***

In [None]:
p_values = []

# Compare every unordered pair of seasons exactly once. The one-way ANOVA is
# symmetric in its samples, so testing (A, B) and then (B, A) only repeats the
# same p-value and doubles the work and the printed output.
# Note: no multiple-comparison correction is applied to these p-values.
for position, season in enumerate(all_seasons):
    # Each entry is a single-key dict {season name: sample}
    season_name = next(iter(season))
    for otherseason in all_seasons[position + 1:]:
        otherseason_name = next(iter(otherseason))
        # run the ANOVA test and keep only the p-value
        p_values.append(st.f_oneway(season[season_name], otherseason[otherseason_name])[1])

        print(f"""ANOVA test between {season_name} and {otherseason_name}
        P Value: {p_values[-1]}""")
print(f"\nThe largest p-value between seasons is {max(p_values)}")

## Conclusion
***
The largest $p$-value from any combination of seasons is $4.60 \times 10^{-48}$, which is still far smaller than $0.05$. <br> Based on these results, we can confidently reject the null hypothesis that accidents are distributed equally across seasons.

In [None]:
# Eric's code ends here