## This notebook downloads the NTSB aviation data file and performs basic exploratory data analysis (EDA) on it.

### Source: https://www.ntsb.gov/_layouts/ntsb.aviation/index.aspx
### File Format: Pipe Delimited '|'

### Import Python Libraries

In [1]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from urllib.request import urlretrieve

In [2]:
# Make sure the local data sub directory exists. exist_ok=True turns this into
# a single call that does nothing when the directory is already present,
# instead of a separate "check, then create" sequence.
os.makedirs("./data", exist_ok=True)

# Download the NTSB export and save it under ./data. The call is left as the
# last expression so the cell displays the (local path, response headers)
# tuple returned by urlretrieve.
data_file_uri = "http://app.ntsb.gov/aviationquery/Download.ashx?type=csv"
urlretrieve(data_file_uri, './data/AviationData.txt')

('./data/AviationData.txt', <http.client.HTTPMessage at 0x1b0a1e60bb0>)

### Import the downloaded NTSB dataset into a pandas DataFrame.

In [3]:
# Parse the pipe-separated download into a DataFrame. No whitespace stripping
# is requested, so header text is taken from the file as-is.
filename = 'data/AviationData.txt'
data = pd.read_csv(
    filepath_or_buffer=filename,
    sep='|',
)

### Show some basic stats about the dataframe.

In [4]:
# dimensions of the dataframe as a (row count, column count) tuple
data.shape

(84778, 32)

In [5]:
# column names (note all the extraneous spaces around each name, and the
# blank final column name in the output)
data.columns

Index(['Event Id ', ' Investigation Type ', ' Accident Number ',
       ' Event Date ', ' Location ', ' Country ', ' Latitude ', ' Longitude ',
       ' Airport Code ', ' Airport Name ', ' Injury Severity ',
       ' Aircraft Damage ', ' Aircraft Category ', ' Registration Number ',
       ' Make ', ' Model ', ' Amateur Built ', ' Number of Engines ',
       ' Engine Type ', ' FAR Description ', ' Schedule ',
       ' Purpose of Flight ', ' Air Carrier ', ' Total Fatal Injuries ',
       ' Total Serious Injuries ', ' Total Minor Injuries ',
       ' Total Uninjured ', ' Weather Condition ', ' Broad Phase of Flight ',
       ' Report Status ', ' Publication Date ', ' '],
      dtype='object')

In [8]:
# summary statistics per column (the output reports count, unique, top and freq)
data.describe()

Unnamed: 0,Event Id,Investigation Type,Accident Number,Event Date,Location,Country,Latitude,Longitude,Airport Code,Airport Name,...,Air Carrier,Total Fatal Injuries,Total Serious Injuries,Total Minor Injuries,Total Uninjured,Weather Condition,Broad Phase of Flight,Report Status,Publication Date,Unnamed: 21
count,84778,84778,84778,84778,84778,84778,84778.0,84778.0,84778.0,84778.0,...,84778.0,84778,84778,84778,84778,84778,84778,84778,84778.0,84778.0
unique,83539,3,84778,13818,26747,184,20644.0,22256.0,10048.0,23985.0,...,3056.0,128,41,64,377,4,13,4,3985.0,1.0
top,20001212X19172,Accident,ERA14WA442,05/16/1982,"ANCHORAGE, AK",United States,,,,,...,,0,0,0,1,VMC,LANDING,Probable Cause,,
freq,3,81222,1,25,372,79146,54234.0,54243.0,36694.0,30886.0,...,80537.0,40061,42631,40028,23991,74538,20795,77645,14530.0,84778.0


In [9]:
# preview the first 10 rows of the dataframe
data.head(10)

Unnamed: 0,Event Id,Investigation Type,Accident Number,Event Date,Location,Country,Latitude,Longitude,Airport Code,Airport Name,...,Air Carrier,Total Fatal Injuries,Total Serious Injuries,Total Minor Injuries,Total Uninjured,Weather Condition,Broad Phase of Flight,Report Status,Publication Date,Unnamed: 21
0,20200410X12023,Accident,ERA20CA151,04/10/2020,"Townsend, TN",United States,35.570834,83.585,,,...,,,,,,,,Preliminary,04/14/2020,
1,20200411X15400,Accident,CEN20LA145,04/10/2020,"Sidney, IA",United States,40.826111,-95.670278,,,...,,,,,2.0,VMC,,Preliminary,04/14/2020,
2,20200413X03920,Accident,CEN20CA146,04/10/2020,"Forth Worth, TX",United States,32.829167,-97.535,2TE2,,...,,,,2.0,,VMC,,Preliminary,04/15/2020,
3,20200410X50413,Accident,ANC20CA042,04/10/2020,"Homer, AK",United States,59.645555,-151.476389,HOM,HOMER,...,,,,,1.0,,,Preliminary,04/10/2020,
4,20200408X83712,Accident,CEN20LA143,04/07/2020,"Pecan Gap, TX",United States,,,,,...,,1.0,,,,VMC,MANEUVERING,Preliminary,04/15/2020,
5,20200408X83126,Accident,ANC20CA041,04/07/2020,"Chickaloon, AK",United States,61.539722,-147.134444,,,...,,,,,2.0,,,Preliminary,04/08/2020,
6,20200407X53436,Accident,CEN20CA144,04/06/2020,"Seymour, TX",United States,33.646666,-99.259444,,,...,,,,,1.0,VMC,,Preliminary,04/15/2020,
7,20200406X24210,Accident,CEN20CA138,04/02/2020,"Hessel, MI",United States,46.035555,-84.417778,5Y1,Albert J Lindberg,...,,,,,1.0,VMC,LANDING,Factual,04/15/2020,
8,20200401X50400,Accident,ERA20CA146,04/01/2020,"Hollywood, FL",United States,26.001111,-80.240833,HWO,North Perry,...,,,,,3.0,VMC,,Preliminary,04/14/2020,
9,20200401X12025,Accident,ERA20CA143,04/01/2020,"Palm Coast, FL",United States,29.465278,-81.207778,FIN,Flagler Executive,...,,,,,2.0,VMC,GO-AROUND,Factual,04/15/2020,
