# EDA on NTSB

## This notebook downloads the NTSB aviation data file and performs basic exploratory data analysis (EDA) on it.

### Data Source: https://www.ntsb.gov/_layouts/ntsb.aviation/index.aspx
### File Format: pipe delimited '|'

### Import Python Libraries

In [None]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from urllib.request import urlretrieve

In [None]:
# Create the data sub-directory. exist_ok=True makes this a no-op when the
# directory is already present, replacing the separate check-then-create step.
os.makedirs('./data', exist_ok=True)

# Download the NTSB export into the data directory.
# NOTE: the file is fetched again on every run of this cell.
# NOTE(review): the URL is plain http; switch to https if the server supports it.
data_file_uri = "http://app.ntsb.gov/aviationquery/Download.ashx?type=csv"
urlretrieve(data_file_uri, './data/AviationData.txt')

### Import the downloaded NTSB dataset into Pandas dataframe.

In [None]:
# Parse the pipe-separated download into a dataframe; the first line of the
# file supplies the column names.
filename = 'data/AviationData.txt'
data = pd.read_csv(filename, sep='|')

### Show some basic stats about the dataframe.

In [None]:
# (row count, column count) of the freshly loaded dataframe
data.shape

In [None]:
# Show the column names. Note the extraneous leading/trailing spaces: they are
# part of each name, so later cells have to spell them exactly
# (e.g. ' Event Date ', ' Amateur Built ').
data.columns

In [None]:
# The last column is named " " (a single space) and is blank -- presumably
# every row in the file ends with a trailing '|' delimiter (TODO confirm).
# Drop it. Reassigning with errors="ignore" (instead of inplace=True) keeps the
# cell safe to re-run: a second run is a no-op rather than a KeyError.
data = data.drop(columns=" ", errors="ignore")

In [None]:
# column dtypes and non-null counts, now that the blank column is gone
data.info()

In [None]:
# Summary statistics for every column; include="all" makes describe() report
# on the string columns too, not only the numeric ones.
data.describe(include="all")

In [None]:
# preview the first 10 rows of the dataframe
data.head(10)

### Change the index to the "Event Date" column (as datetime)

In [None]:
# What is the current index? read_csv was called without index_col, so this is
# whatever index pandas assigned by default.
data.index

In [None]:
# Make ' Event Date ' the index, converted to real datetimes as the section
# heading promises (a plain set_index would leave the values as strings).
# - Surrounding whitespace is stripped from string values before parsing.
# - errors='coerce' turns anything unparseable into NaT instead of raising.
# - The membership guard keeps the cell safe to re-run: once the column has been
#   moved into the index there is nothing left to do.
if ' Event Date ' in data.columns:
    event_dates = pd.to_datetime(
        data[' Event Date '].map(lambda v: v.strip() if isinstance(v, str) else v),
        errors='coerce',
    )
    data = data.assign(**{' Event Date ': event_dates}).set_index(' Event Date ')

In [None]:
# Now what is the index? It should be named ' Event Date ' after the cell above.
data.index

In [None]:
# re-inspect the frame now that ' Event Date ' has moved out of the columns
data.info()

### Change the data types of the columns

In [None]:
def _strip_text(series):
    """Return `series` with surrounding whitespace removed from its string values.

    Non-string values (NaN, numbers, booleans, timestamps) pass through
    unchanged, so applying this to an already-converted column is harmless and
    the cell can be re-run.
    """
    return series.map(lambda value: value.strip() if isinstance(value, str) else value)


# TODO: Latitude / Longitude are not converted yet; candidate conversion:
# data[' Latitude '] = pd.to_numeric(data[' Latitude '], downcast="float").fillna(0)
# data[' Longitude '] = pd.to_numeric(data[' Longitude '], downcast="float").fillna(0)

# Boolean
# astype(bool) on a text column marks every non-empty string as True (" No "
# included), so compare the normalised text with the affirmative spellings.
# NOTE(review): assumes the flag is spelled like Yes/No -- confirm with
# data[' Amateur Built '].value_counts() on the raw file.
data[' Amateur Built '] = (
    _strip_text(data[' Amateur Built '])
    .astype(str)
    .str.lower()
    .isin(['yes', 'y', 'true', '1'])
)

# Integers
# Missing / unparseable entries are treated as 0. The fill has to happen
# *before* the downcast: a column that still contains NaN cannot be stored as
# an integer dtype, so downcasting first would leave it as float.
count_columns = [
    ' Number of Engines ',
    ' Total Fatal Injuries ',
    ' Total Serious Injuries ',
    ' Total Minor Injuries ',
    ' Total Uninjured ',
]
for column in count_columns:
    filled = pd.to_numeric(_strip_text(data[column]), errors='coerce').fillna(0)
    data[column] = pd.to_numeric(filled, downcast='integer')

# DateTime
# No hard-coded format: combined with errors='coerce', a format that does not
# match the file would silently turn every value into NaT.
# NOTE(review): this used to pin format='%Y/%m/%d' -- confirm the real layout
# and re-pin it here if a strict format is wanted.
data[' Publication Date '] = pd.to_datetime(
    _strip_text(data[' Publication Date ']), errors='coerce'
)

data.info()