# EDA on NTSB

## This notebook downloads the NTSB aviation data file and performs basic exploratory data analysis (EDA) on it.

### Data Source: https://www.ntsb.gov/_layouts/ntsb.aviation/index.aspx
### File Format: pipe delimited '|'

### Note: Pay close attention to all the extraneous spaces in the column names and values.
*i.e., every field is padded with spaces like this: |(space)value(space)|*

### Import Python Libraries

In [None]:
%matplotlib inline
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from urllib.request import urlretrieve

### Download the data file from NTSB.

In [None]:
# Create the "data" sub directory; exist_ok makes this safe to re-run.
os.makedirs('./data', exist_ok=True)

data_file_uri = "http://app.ntsb.gov/aviationquery/Download.ashx?type=csv"
data_file_path = './data/AviationData.txt'

# Download only when there is no local copy yet, so a fresh checkout can run
# top to bottom while re-running the notebook does not fetch the file again.
# Delete the local file to force a refresh.
if not os.path.isfile(data_file_path):
    urlretrieve(data_file_uri, data_file_path)

### Import the downloaded data into Pandas dataframe.

In [None]:
# Parse the pipe-delimited NTSB export into a dataframe.
filename = 'data/AviationData.txt'
data = pd.read_csv(filename, sep='|')

### Show some basic stats about the dataframe.

In [None]:
# row and column count, returned as a (rows, columns) tuple
data.shape

In [None]:
# show the column names exactly as they were read from the file
# (note all the extraneous spaces in the column names)
data.columns

In [None]:
# What's up with the blank column at the end? Its name is a single space
# (presumably from a trailing '|' on each line -- verify against the raw file).
# Let's drop it. errors="ignore" keeps the cell re-runnable: a second run
# finds no " " column and leaves the frame unchanged instead of raising.
data = data.drop(columns=" ", errors="ignore")

In [None]:
# per-column dtype and non-null count, plus overall memory usage
data.info()

In [None]:
# summary statistics; include="all" also covers the non-numeric (string) columns
data.describe(include="all")

In [None]:
# show the first 10 rows (values have not been stripped of whitespace yet)
data.head(10)

### Change the index to the "Event Date" column (as datetime)

In [None]:
# what is the current index? (none was chosen when reading the file,
# so this is the default positional index)
data.index

In [None]:
# Set the index to the Event Date column, converted to datetime.
# The guard keeps the cell re-runnable: once the column has become the index
# it no longer exists as a column, and converting it again would raise a KeyError.
if ' Event Date ' in data.columns:
    data[' Event Date '] = pd.to_datetime(data[' Event Date '], format=' %m/%d/%Y ')
    data = data.set_index(' Event Date ')

# Kept for any later use; built from the index so it is defined on every run.
datetime_index = pd.DatetimeIndex(data.index)

In [None]:
# now what is the index? (expected: the Event Date values as datetimes)
data.index

In [None]:
# Order by Event Date, ascending.
# Event Date is the index at this point, not a column, so sort on the index
# directly rather than relying on sort_values() resolving an index name.
data = data.sort_index(ascending=True)
data.info()

### Strip all the whitespace around the values

In [None]:
# Strip surrounding whitespace from every text column.
# The .str accessor raises AttributeError on non-text columns, so columns that
# were already parsed as numbers (or converted by a previous run) pass through
# untouched.
data = data.apply(
    lambda column: column.str.strip()
    if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column)
    else column
)

data.head()

### Change the data types of the columns

In [None]:
# Float
# Parse the coordinates as numbers, downcast to the smallest float type that fits.
for column in (' Latitude ', ' Longitude '):
    data[column] = pd.to_numeric(data[column], downcast="float")

# Boolean
# astype(bool) on text is True for every non-empty string, so "No" would also
# become True. Compare against the affirmative value instead.
# NOTE(review): assumes the values are "Yes"/"No" -- confirm with
# data[' Amateur Built '].value_counts() before relying on this column.
# The dtype guard keeps the cell re-runnable once the column is already boolean.
if not pd.api.types.is_bool_dtype(data[' Amateur Built ']):
    data[' Amateur Built '] = data[' Amateur Built '].str.strip().str.lower().eq('yes')

# Integers
# NOTE(review): downcast="integer" can only take effect when every value is a
# whole number; columns that still hold missing values are expected to stay
# float -- check the data.info() output below.
for column in (
    ' Number of Engines ',
    ' Total Fatal Injuries ',
    ' Total Serious Injuries ',
    ' Total Minor Injuries ',
    ' Total Uninjured ',
):
    data[column] = pd.to_numeric(data[column], downcast="integer")

# DateTime
# TODO: ' Publication Date ' is still text; converting it presumably needs a
# decision on how to treat rows without a date.

data.info()

In [None]:
# Line plot of the uninjured counts against the dataframe index.
data[' Total Uninjured '].plot.line(title='# Total Uninjured')

In [None]:
# Line plot of the minor-injury counts against the dataframe index.
data[' Total Minor Injuries '].plot.line(title='# Total Minor Injuries')

In [None]:
# Line plot of the fatal-injury counts against the dataframe index.
data[' Total Fatal Injuries '].plot.line(title='# Total Fatal Injuries')

## Column Names
`Index(['Event Id ', ' Investigation Type ', ' Accident Number ',
       ' Event Date ', ' Location ', ' Country ', ' Latitude ', ' Longitude ',
       ' Airport Code ', ' Airport Name ', ' Injury Severity ',
       ' Aircraft Damage ', ' Aircraft Category ', ' Registration Number ',
       ' Make ', ' Model ', ' Amateur Built ', ' Number of Engines ',
       ' Engine Type ', ' FAR Description ', ' Schedule ',
       ' Purpose of Flight ', ' Air Carrier ', ' Total Fatal Injuries ',
       ' Total Serious Injuries ', ' Total Minor Injuries ',
       ' Total Uninjured ', ' Weather Condition ', ' Broad Phase of Flight ',
       ' Report Status ', ' Publication Date ', ' '],
      dtype='object')`