# EDA on NTSB

## This notebook is for downloading and performing basic EDA on the NTSB data file.

### Data Source: https://www.ntsb.gov/_layouts/ntsb.aviation/index.aspx
### File Format: pipe delimited '|'

### Import Python Libraries

In [1]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from urllib.request import urlretrieve

### Download the data file from NTSB.

In [2]:
# create "data" sub directory (if not exists)
if not os.path.isdir("./data"):
    os.makedirs('./data')

data_file_uri = "http://app.ntsb.gov/aviationquery/Download.ashx?type=csv"
urlretrieve(data_file_uri, './data/AviationData.txt') 

('./data/AviationData.txt', <http.client.HTTPMessage at 0x214ab5c9df0>)

### Import the downloaded data into Pandas dataframe.

In [3]:
filename = 'data/AviationData.txt' 
data = pd.read_csv(filename, delimiter='|')

### Show some basic stats about the dataframe.

In [4]:
# row and column count
data.shape

(84778, 32)

In [5]:
# show the column names (note all the extraneous spaces in the column names)
data.columns

Index(['Event Id ', ' Investigation Type ', ' Accident Number ',
       ' Event Date ', ' Location ', ' Country ', ' Latitude ', ' Longitude ',
       ' Airport Code ', ' Airport Name ', ' Injury Severity ',
       ' Aircraft Damage ', ' Aircraft Category ', ' Registration Number ',
       ' Make ', ' Model ', ' Amateur Built ', ' Number of Engines ',
       ' Engine Type ', ' FAR Description ', ' Schedule ',
       ' Purpose of Flight ', ' Air Carrier ', ' Total Fatal Injuries ',
       ' Total Serious Injuries ', ' Total Minor Injuries ',
       ' Total Uninjured ', ' Weather Condition ', ' Broad Phase of Flight ',
       ' Report Status ', ' Publication Date ', ' '],
      dtype='object')

In [6]:
# What's up with the blank column at the end? Let's drop it.
data.drop(" ", axis=1, inplace=True)

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84778 entries, 0 to 84777
Data columns (total 31 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Event Id                  84778 non-null  object
 1    Investigation Type       84778 non-null  object
 2    Accident Number          84778 non-null  object
 3    Event Date               84778 non-null  object
 4    Location                 84778 non-null  object
 5    Country                  84778 non-null  object
 6    Latitude                 84778 non-null  object
 7    Longitude                84778 non-null  object
 8    Airport Code             84778 non-null  object
 9    Airport Name             84778 non-null  object
 10   Injury Severity          84778 non-null  object
 11   Aircraft Damage          84778 non-null  object
 12   Aircraft Category        84778 non-null  object
 13   Registration Number      84778 non-null  object
 14   Make                 

In [8]:
# summary statistics (with strings too)
data.describe(include="all")

Unnamed: 0,Event Id,Investigation Type,Accident Number,Event Date,Location,Country,Latitude,Longitude,Airport Code,Airport Name,...,Purpose of Flight,Air Carrier,Total Fatal Injuries,Total Serious Injuries,Total Minor Injuries,Total Uninjured,Weather Condition,Broad Phase of Flight,Report Status,Publication Date
count,84778,84778,84778,84778,84778,84778,84778.0,84778.0,84778.0,84778.0,...,84778,84778.0,84778,84778,84778,84778,84778,84778,84778,84778.0
unique,83539,3,84778,13818,26747,184,20645.0,22257.0,10048.0,23985.0,...,23,3056.0,128,41,64,377,4,13,4,3984.0
top,20001212X19172,Accident,SEA85FA040,05/16/1982,"ANCHORAGE, AK",United States,,,,,...,Personal,,0,0,0,1,VMC,LANDING,Probable Cause,
freq,3,81222,1,25,372,79146,54233.0,54242.0,36694.0,30885.0,...,47358,80537.0,40061,42631,40028,23990,74538,20795,77645,14530.0


In [9]:
# show the first 10 rows
data.head(10)

Unnamed: 0,Event Id,Investigation Type,Accident Number,Event Date,Location,Country,Latitude,Longitude,Airport Code,Airport Name,...,Purpose of Flight,Air Carrier,Total Fatal Injuries,Total Serious Injuries,Total Minor Injuries,Total Uninjured,Weather Condition,Broad Phase of Flight,Report Status,Publication Date
0,20200410X12023,Accident,ERA20CA151,04/10/2020,"Townsend, TN",United States,35.570834,83.585,,,...,Personal,,,,,,,,Preliminary,04/14/2020
1,20200411X15400,Accident,CEN20LA145,04/10/2020,"Sidney, IA",United States,40.826111,-95.670278,,,...,Personal,,,,,2.0,VMC,,Preliminary,04/14/2020
2,20200413X03920,Accident,CEN20CA146,04/10/2020,"Forth Worth, TX",United States,32.829167,-97.535,2TE2,,...,Personal,,,,2.0,,VMC,,Preliminary,04/15/2020
3,20200410X50413,Accident,ANC20CA042,04/10/2020,"Homer, AK",United States,59.645555,-151.476389,HOM,HOMER,...,Personal,,,,,1.0,,,Preliminary,04/10/2020
4,20200408X83712,Accident,CEN20LA143,04/07/2020,"Pecan Gap, TX",United States,,,,,...,Aerial Application,,1.0,,,,VMC,MANEUVERING,Preliminary,04/15/2020
5,20200408X83126,Accident,ANC20CA041,04/07/2020,"Chickaloon, AK",United States,61.539722,-147.134444,,,...,Personal,,,,,2.0,,,Preliminary,04/08/2020
6,20200407X53436,Accident,CEN20CA144,04/06/2020,"Seymour, TX",United States,33.646666,-99.259444,,,...,Other Work Use,,,,,1.0,VMC,,Preliminary,04/15/2020
7,20200406X24210,Accident,CEN20CA138,04/02/2020,"Hessel, MI",United States,46.035555,-84.417778,5Y1,Albert J Lindberg,...,Personal,,,,,1.0,VMC,LANDING,Factual,04/15/2020
8,20200401X50400,Accident,ERA20CA146,04/01/2020,"Hollywood, FL",United States,26.001111,-80.240833,HWO,North Perry,...,Instructional,,,,,3.0,VMC,,Preliminary,04/14/2020
9,20200401X12025,Accident,ERA20CA143,04/01/2020,"Palm Coast, FL",United States,29.465278,-81.207778,FIN,Flagler Executive,...,Instructional,,,,,2.0,VMC,GO-AROUND,Factual,04/15/2020


### Change the index to the "Event Date" column (as datetime)

In [10]:
# what is the current index?
data.index

RangeIndex(start=0, stop=84778, step=1)

In [73]:
# data.reset_index(inplace=True)
datetime_index = pd.DatetimeIndex(data.index)
data.set_index(' Event Date ', inplace=True)

In [74]:
# now what is the index?
data.index

DatetimeIndex(['1982-01-01', '1982-01-01', '1982-01-01', '1982-01-01',
               '1982-01-01', '1983-01-01', '1983-01-01', '1983-01-01',
               '1983-01-01', '1983-01-01',
               ...
               '2016-12-31', '2016-12-31', '2017-12-31', '2017-12-31',
               '2018-12-31', '2019-12-31', '2019-12-31', '2019-12-31',
               '2019-12-31', '2019-12-31'],
              dtype='datetime64[ns]', name=' Event Date ', length=84778, freq=None)

In [85]:
data.index.isnull().sum()
data.index.isna().sum();

In [75]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 84778 entries, 1982-01-01 to 2019-12-31
Data columns (total 32 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0    Accident Number          84778 non-null  object        
 1    Air Carrier              84778 non-null  object        
 2    Aircraft Category        84778 non-null  object        
 3    Aircraft Damage          84778 non-null  object        
 4    Airport Code             84778 non-null  object        
 5    Airport Name             84778 non-null  object        
 6    Amateur Built            84778 non-null  bool          
 7    Broad Phase of Flight    84778 non-null  object        
 8    Country                  84778 non-null  object        
 9    Engine Type              84778 non-null  object        
 10   FAR Description          84778 non-null  object        
 11   Injury Severity          84778 non-null  object        
 12   

### Change the data types of the columns

In [42]:
# data[' Latitude '] = pd.to_numeric(data[' Latitude '], downcast="float").fillna(0)
# data[' Longitude '] = pd.to_numeric(data[' Longitude '], downcast="float").fillna(0)

# Boolean
data[' Amateur Built '] = data[' Amateur Built '].astype(bool)
# Integers
data[' Number of Engines '] = pd.to_numeric(data[' Number of Engines '], downcast="integer", errors='coerce').fillna(0)
data[' Total Fatal Injuries '] = pd.to_numeric(data[' Total Fatal Injuries '], downcast="integer", errors='coerce').fillna(0)
data[' Total Serious Injuries '] = pd.to_numeric(data[' Total Serious Injuries '], downcast="integer", errors='coerce').fillna(0)
data[' Total Minor Injuries '] = pd.to_numeric(data[' Total Minor Injuries '], downcast="integer", errors='coerce').fillna(0)
data[' Total Uninjured '] = pd.to_numeric(data[' Total Uninjured '], downcast="integer", errors='coerce').fillna(0)
# DateTime
# data[' Event Date '] = pd.to_datetime(data[' Event Date '], format='%Y/%m/%d', errors='coerce')
data[' Publication Date '] = pd.to_datetime(data[' Publication Date '], format='%Y/%m/%d', errors='coerce')

data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 84778 entries,  01/01/1982  to  12/31/2019 
Data columns (total 30 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0    Accident Number          84778 non-null  object        
 1    Air Carrier              84778 non-null  object        
 2    Aircraft Category        84778 non-null  object        
 3    Aircraft Damage          84778 non-null  object        
 4    Airport Code             84778 non-null  object        
 5    Airport Name             84778 non-null  object        
 6    Amateur Built            84778 non-null  bool          
 7    Broad Phase of Flight    84778 non-null  object        
 8    Country                  84778 non-null  object        
 9    Engine Type              84778 non-null  object        
 10   FAR Description          84778 non-null  object        
 11   Injury Severity          84778 non-null  object        
 12   Inve

In [65]:
# data.reset_index(inplace=True)

# data[' Event Date '] = data[' Event Date '].astype("datetime64")
data[' Event Date '] = pd.to_datetime(data[' Event Date '], format='%Y/%m/%d', errors='coerce')

# data.sort_index(axis=1, inplace=True)
print(data.index)

# data[' Total Uninjured '].plot(title='# Total Uninjured')

RangeIndex(start=0, stop=84778, step=1)


## Column Names
`Index(['Event Id ', ' Investigation Type ', ' Accident Number ',
       ' Event Date ', ' Location ', ' Country ', ' Latitude ', ' Longitude ',
       ' Airport Code ', ' Airport Name ', ' Injury Severity ',
       ' Aircraft Damage ', ' Aircraft Category ', ' Registration Number ',
       ' Make ', ' Model ', ' Amateur Built ', ' Number of Engines ',
       ' Engine Type ', ' FAR Description ', ' Schedule ',
       ' Purpose of Flight ', ' Air Carrier ', ' Total Fatal Injuries ',
       ' Total Serious Injuries ', ' Total Minor Injuries ',
       ' Total Uninjured ', ' Weather Condition ', ' Broad Phase of Flight ',
       ' Report Status ', ' Publication Date ', ' '],
      dtype='object')`