# This notebook was produced as part of Coventry University coursework.
The dataset is available at https://www.kaggle.com/sobhanmoosavi/us-accidents

## This is part one of the coursework - data cleaning

Produced by Sunggu Choi

In [34]:
#Load necessary library
import pandas as pd
import gc
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [35]:
# Load the raw accident records from the CSV file into a pandas DataFrame
df = pd.read_csv(filepath_or_buffer='US_Accidents_June20.csv')

### Dropping columns
The following columns cannot be used for data processing, so they are dropped:
'Source','TMC','End_Time','End_Lat','End_Lng','Distance(mi)','Description','Number','Street','Weather_Timestamp','Side','Zipcode','Country','Timezone','Airport_Code','Wind_Direction','Wind_Speed(mph)','Weather_Condition'

In [36]:
# Build a new DataFrame without the columns that are not kept for processing
df2 = df.drop(
    columns=[
        'Source',
        'TMC',
        'End_Time',
        'End_Lat',
        'End_Lng',
        'Distance(mi)',
        'Description',
        'Number',
        'Street',
        'Weather_Timestamp',
        'Side',
        'Zipcode',
        'Country',
        'Timezone',
        'Airport_Code',
        'Wind_Direction',
        'Wind_Speed(mph)',
        'Weather_Condition',
    ]
)

In [37]:
# Give the start-time and start-coordinate columns shorter names
df3 = df2.rename(
    {
        "Start_Time": "Time",
        "Start_Lat": "Latitude",
        "Start_Lng": "Longitude",
    },
    axis="columns",
)
df3  # Display the renamed DataFrame

Unnamed: 0,ID,Severity,Time,Latitude,Longitude,City,County,State,Temperature(F),Wind_Chill(F),...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-1,3,2016-02-08 05:46:00,39.865147,-84.058723,Dayton,Montgomery,OH,36.9,,...,False,False,False,False,False,False,Night,Night,Night,Night
1,A-2,2,2016-02-08 06:07:59,39.928059,-82.831184,Reynoldsburg,Franklin,OH,37.9,,...,False,False,False,False,False,False,Night,Night,Night,Day
2,A-3,2,2016-02-08 06:49:27,39.063148,-84.032608,Williamsburg,Clermont,OH,36.0,33.3,...,False,False,False,False,True,False,Night,Night,Day,Day
3,A-4,3,2016-02-08 07:23:34,39.747753,-84.205582,Dayton,Montgomery,OH,35.1,31.0,...,False,False,False,False,False,False,Night,Day,Day,Day
4,A-5,2,2016-02-08 07:39:07,39.627781,-84.188354,Dayton,Montgomery,OH,36.0,33.3,...,False,False,False,False,True,False,Day,Day,Day,Day
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3513612,A-3513776,2,2019-08-23 18:03:25,34.002480,-117.379360,Riverside,Riverside,CA,86.0,86.0,...,False,False,False,False,False,False,Day,Day,Day,Day
3513613,A-3513777,2,2019-08-23 19:11:30,32.766960,-117.148060,San Diego,San Diego,CA,70.0,70.0,...,False,False,False,False,False,False,Day,Day,Day,Day
3513614,A-3513778,2,2019-08-23 19:00:21,33.775450,-117.847790,Orange,Orange,CA,73.0,73.0,...,False,False,False,False,False,False,Day,Day,Day,Day
3513615,A-3513779,2,2019-08-23 19:00:21,33.992460,-118.403020,Culver City,Los Angeles,CA,71.0,71.0,...,False,False,False,False,False,False,Day,Day,Day,Day


## Fixing data 


In [38]:
# Count the missing (null / NaN) entries in each column
df3.isna().sum()

ID                             0
Severity                       0
Time                           0
Latitude                       0
Longitude                      0
City                         112
County                         0
State                          0
Temperature(F)             65732
Wind_Chill(F)            1868249
Humidity(%)                69687
Pressure(in)               55882
Visibility(mi)             75856
Precipitation(in)        2025874
Amenity                        0
Bump                           0
Crossing                       0
Give_Way                       0
Junction                       0
No_Exit                        0
Railway                        0
Roundabout                     0
Station                        0
Stop                           0
Traffic_Calming                0
Traffic_Signal                 0
Turning_Loop                   0
Sunrise_Sunset               115
Civil_Twilight               115
Nautical_Twilight            115
Astronomic

## Fixing data
Some columns contain a significant number of missing values, but they cannot be dropped.

Missing values in the City column will be filled with the string 'Empty', and missing numerical values will be filled with 0.



    Missing values in these columns will be set to 0
    
    Temperature(F)             65732
    Wind_Chill(F)            1868249
    Humidity(%)                69687
    Pressure(in)               55882
    Visibility(mi)             75856
    Wind_Direction             58874
    Wind_Speed(mph)           454609
    Precipitation(in)        2025874
    Weather_Condition          76138



    Missing values in these columns will be set to Day
    
    Sunrise_Sunset               115
    Civil_Twilight               115
    Nautical_Twilight            115
    Astronomical_Twilight        115



In [44]:
# Fill the missing values of every affected column in one DataFrame-level call.
#
# Why not `df3[col].fillna(..., inplace=True)`: that form calls an in-place
# method on a selected column object (chained assignment). pandas 2.x emits a
# FutureWarning for it, and with Copy-on-Write enabled (the default from
# pandas 3.0) it leaves df3 itself unchanged, so the NaNs would silently remain.
# Passing a column -> value mapping to DataFrame.fillna avoids the problem and
# makes the cell safe to re-run.
fill_values = {
    # Missing city names get an explicit placeholder string
    'City': 'Empty',

    # Missing numeric readings are replaced with 0
    'Temperature(F)': 0,
    'Wind_Chill(F)': 0,
    'Humidity(%)': 0,
    'Pressure(in)': 0,
    'Visibility(mi)': 0,
    'Precipitation(in)': 0,

    # Missing day/night string values are replaced with 'Day'
    'Sunrise_Sunset': 'Day',
    'Civil_Twilight': 'Day',
    'Nautical_Twilight': 'Day',
    'Astronomical_Twilight': 'Day',
}
df3 = df3.fillna(value=fill_values)

In [45]:
# Re-count missing entries per column to confirm the target columns are fixed
df3.isna().sum()

ID                       0
Severity                 0
Time                     0
Latitude                 0
Longitude                0
City                     0
County                   0
State                    0
Temperature(F)           0
Wind_Chill(F)            0
Humidity(%)              0
Pressure(in)             0
Visibility(mi)           0
Precipitation(in)        0
Amenity                  0
Bump                     0
Crossing                 0
Give_Way                 0
Junction                 0
No_Exit                  0
Railway                  0
Roundabout               0
Station                  0
Stop                     0
Traffic_Calming          0
Traffic_Signal           0
Turning_Loop             0
Sunrise_Sunset           0
Civil_Twilight           0
Nautical_Twilight        0
Astronomical_Twilight    0
dtype: int64

In [48]:
# Write the cleaned dataset to a CSV file, leaving out the index column
df3.to_csv(path_or_buf='out.csv', index=False)