## This notebook is used for cleanup and formatting of NTSB data.

### Source: https://www.ntsb.gov/_layouts/ntsb.aviation/index.aspx
### File Format: Pipe Delimited '|'

Import the NTSB dataset

In [None]:
# import standard-library helpers and the Pandas library
from pathlib import Path

import pandas as pd
# Path of the raw NTSB export; the file is pipe-delimited text.
filename = 'data/AviationData.txt'
# Load the whole file into a DataFrame, splitting fields on '|'.
data = pd.read_csv(filename, sep='|')

In [None]:
# Display the dimensions of the loaded DataFrame as a (rows, columns) tuple.
data.shape

In [None]:
# Preview the first five rows of the loaded DataFrame.
data.head(n=5)

Drop the last column

In [None]:
# Keep all rows and every column except the final one, then rebind `data`.
data = data.loc[:, data.columns[:-1]]

In [None]:
# Display the column labels currently present in the DataFrame.
data.columns

Split out the Location field into 'City' and 'State'.

In [None]:
# Split ' Location ' on ", " into one column per part and keep only the first
# two parts (positions 0 and 1), which the next cell stores as City and State.
# reindex(columns=[0, 1]) always yields exactly those two columns: any extra
# parts are discarded and a part that is absent is filled with NaN, so the
# cell does not depend on how many comma-separated parts the data contains.
splitLocation = (
    data[' Location ']
    .str.split(", ", expand=True)
    .reindex(columns=[0, 1])
)

Add split fields to DataFrame

In [None]:
# Attach the first two parts of the split location as new columns.
# Values are aligned to `data` by row index.
data = data.assign(
    City=splitLocation[0],
    State=splitLocation[1],
)

Rename the new columns.

In [None]:
 # Map the positional labels 0 and 1 to "City" and "State" when such columns
 # exist; labels that are not present in the frame are ignored by rename.
 # Only column labels are touched, so the row index keeps its original labels
 # instead of being converted to strings.
 data = data.rename(columns={0: "City", 1: "State"})

Drop the Location column

In [None]:
# Remove the original ' Location ' column and rebind `data` to the result.
data = data.drop(labels=' Location ', axis='columns')

Change the date format to `%Y-%m-%d` (to match what Splunk expects)

In [None]:
def _as_iso_date(values, errors='raise'):
    """Parse `values` as datetimes and render them as '%Y-%m-%d' strings.

    `errors` is forwarded to pd.to_datetime: 'raise' fails on a value that
    cannot be parsed, 'coerce' turns such a value into a missing date.
    """
    parsed = pd.to_datetime(values, errors=errors)
    return parsed.dt.strftime('%Y-%m-%d')


# Event Date: unparseable values raise an error.
data[' Event Date '] = _as_iso_date(data[' Event Date '])

# Publication Date: unparseable values become missing instead of raising.
data[' Publication Date '] = _as_iso_date(data[' Publication Date '], errors='coerce')

Save the updated DataFrame as CSV

In [None]:
# Write the cleaned frame as comma-separated values without the row index.
# The parent directory is created first because DataFrame.to_csv does not
# create missing directories and would fail when "output/" is absent.
output_path = Path("output/AviationData.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
data.to_csv(output_path, sep=',', index=False)