In [None]:
# Dependencies and Setup
import pandas as pd
import numpy as np
from zipfile import ZipFile, ZIP_DEFLATED
import datetime
import os

# The dataset is too large to host on GitHub (100 MB limit).
# To recreate these steps, download it from the link below and place the zipped file in the Resources folder:
# https://www.kaggle.com/sobhanmoosavi/us-accidents

# Build absolute paths from the directory the notebook is run in
current_directory = os.getcwd()
resources_dir = os.path.join(current_directory, "Resources")

# Downloaded Kaggle archive and the csv member to read out of it
path = os.path.join(resources_dir, "199387_1319582_bundle_archive.zip")
file_name = "US_Accidents_June20.csv"

# Output locations: cleaned csv (inside Resources) and the zip that will hold it
destination = os.path.join(resources_dir, "US_Accidents_Cleaned.csv")
destination_zip = "US_Accidents.zip"

In [None]:
# Open the downloaded archive and read the accidents csv straight out of it.
# The context managers release both the archive and the member stream as soon
# as the frame is loaded, even if parsing fails part-way. `zin` stays bound
# after the block, and calling close() on an already-closed ZipFile does
# nothing, so a later explicit zin.close() remains safe.
with ZipFile(path) as zin:
    with zin.open(file_name) as csv_member:
        accidents_df = pd.read_csv(csv_member)

# Columns that are not kept in the cleaned dataset
drop_columns = ['Source', 'TMC', 'End_Time', 'End_Lat', 'End_Lng', 'Description', 
                'Distance(mi)', 'Number', 'Street', 'Side', 'Country', 'Timezone', 'Airport_Code', 
                'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 
                'No_Exit', 'Railway', 'Roundabout', 'Station', 'Stop', 
                'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop', 
                'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight']
accidents_df = accidents_df.drop(columns=drop_columns)

# Preview the first rows of the trimmed frame
accidents_df.head()

In [None]:
# Convert the Start_Time column to pandas datetime values
accidents_df["Start_Time"] = pd.to_datetime(accidents_df["Start_Time"])

In [None]:
# Remove accidents that started after the end of January 2020.
# The comparison is inclusive (>=) so a record stamped exactly at
# 2020-02-01 00:00:00 is removed as well; a strict > would let it through.
cutoff_date = pd.to_datetime('2020-02-01 00:00:00')
on_or_after_cutoff = accidents_df["Start_Time"] >= cutoff_date

# Negating the mask (rather than selecting "< cutoff") leaves rows with a
# missing Start_Time in place, because comparisons against NaT evaluate to False.
# Filtering by mask also does not depend on the index labels being unique.
accidents_df = accidents_df.loc[~on_or_after_cutoff]
accidents_df

In [None]:
# Save the cleaned frame as a csv in Resources, leaving out the index column
accidents_df.to_csv(path_or_buf=destination, index=False)

# Release the source archive that was opened when the data was loaded
zin.close()

In [None]:
# Zip up the cleaned csv.
# The context manager finalises and closes the archive even if the write
# fails. `arcname` stores the csv under its bare file name; without it the
# absolute directory path held in `destination` would be recreated inside
# the archive.
with ZipFile(destination_zip, "w", ZIP_DEFLATED) as zout:
    zout.write(destination, arcname=os.path.basename(destination))

In [None]:
# Delete extraneous files