In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw aviation CSV into `df` and report its dimensions.
# low_memory=False is passed through to pandas unchanged.
raw_csv_path = 'data/Aviation_Data.csv'
df = pd.read_csv(raw_csv_path, low_memory=False)
print('Original df.shape -> {}'.format(df.shape))

Original df.shape -> (90348, 31)


In [3]:
# Count the missing values in every column of the raw frame and print them.
missing_per_column = df.isna().sum()
print('Original df Null values:\n')
print(missing_per_column)

Original df Null values:

Event.Id                   1459
Investigation.Type            0
Accident.Number            1459
Event.Date                 1459
Location                   1511
Country                    1685
Latitude                  55966
Longitude                 55975
Airport.Code              40099
Airport.Name              37558
Injury.Severity            2459
Aircraft.damage            4653
Aircraft.Category         58061
Registration.Number        2776
Make                       1522
Model                      1551
Amateur.Built              1561
Number.of.Engines          7543
Engine.Type                8536
FAR.Description           58325
Schedule                  77766
Purpose.of.flight          7651
Air.carrier               73700
Total.Fatal.Injuries      12860
Total.Serious.Injuries    13969
Total.Minor.Injuries      13392
Total.Uninjured            7371
Weather.Condition          5951
Broad.phase.of.flight     28624
Report.Status              7840
Publication.Da

In [4]:
# Label a missing aircraft category as "Unknown" so those rows are kept by
# the filter below instead of being discarded along with non-airplanes.
df['Aircraft.Category'] = df['Aircraft.Category'].fillna('Unknown')

# Keep only rows whose category is "Airplane" or "Unknown".
kept_categories = ['Airplane', 'Unknown']
is_kept_category = df['Aircraft.Category'].isin(kept_categories)
df = df[is_kept_category]
print(f'After dropping non-airplanes, df.shape -> {df.shape}')

After dropping non-airplanes, df.shape -> (85692, 31)


In [5]:
# First pass: rows that have no Event.Id.
missing_event_id = df['Event.Id'].isna()
null_rows = df[missing_event_id]
df = df.drop(index=null_rows.index)

# Second pass (on what is left): rows whose Injury.Severity is the literal
# string 'Unavailable'.
severity_unavailable = df['Injury.Severity'] == 'Unavailable'
null_rows_2 = df[severity_unavailable]
df = df.drop(index=null_rows_2.index)

# Total number of rows removed by the two passes above.
dropped_count = len(null_rows) + len(null_rows_2)
print(f'After dropping {dropped_count} null rows, df.shape -> {df.shape}')

After dropping 1552 null rows, df.shape -> (84140, 31)


In [6]:
# Discard rows that have no event date.
df = df.dropna(subset=['Event.Date'])

# The year is taken as the first four characters of the Event.Date string
# and stored as an integer column.
event_year = df['Event.Date'].str.slice(0, 4).astype(int)
df['year'] = event_year

# Keep only events from 2003 onward (the analysis's 20-year window).
df = df[df['year'].ge(2003)]

In [7]:
# Columns that are not used in the analysis.
unused_columns = [
    'Event.Id', 'Accident.Number', 'Latitude', 'Longitude',
    'Airport.Code', 'Airport.Name', 'Registration.Number',
    'Amateur.Built', 'FAR.Description', 'Schedule', 'Air.carrier',
    'Report.Status', 'Publication.Date',
]
df = df.drop(columns=unused_columns)

# Normalise the remaining column names: trimmed, lower-case, dots -> underscores.
df = df.rename(columns=lambda col: col.strip().lower().replace('.', '_'))
df.shape

(30700, 19)

In [8]:
# Numerical columns: a missing value becomes 0 and the column is cast to int.
count_columns = ['number_of_engines', 'total_fatal_injuries', 'total_serious_injuries',
                 'total_minor_injuries', 'total_uninjured']
df = (
    df.fillna({col: 0 for col in count_columns})
      .astype({col: int for col in count_columns})
)

# Every remaining (non-numerical) null becomes the placeholder 'Unknown'.
df = df.fillna('Unknown')

# Confirm that no nulls are left.
remaining_nulls = df.isna().sum()
print('Cleaned df Null values:\n')
print(remaining_nulls)

Cleaned df Null values:

investigation_type        0
event_date                0
location                  0
country                   0
injury_severity           0
aircraft_damage           0
aircraft_category         0
make                      0
model                     0
number_of_engines         0
engine_type               0
purpose_of_flight         0
total_fatal_injuries      0
total_serious_injuries    0
total_minor_injuries      0
total_uninjured           0
weather_condition         0
broad_phase_of_flight     0
year                      0
dtype: int64


In [9]:
# add state column
#
# The state is taken as the last two characters of `location`, after trimming
# surrounding whitespace so a trailing space cannot end up in the code.
# Locations that were missing were filled with the placeholder 'Unknown' in an
# earlier cell; slicing that placeholder would yield the bogus state 'wn', so
# those rows get 'Unknown' as their state instead.
# NOTE(review): locations that do not end in a two-letter state code (e.g.
# non-US entries) still produce their last two characters — confirm whether
# those rows should be mapped to 'Unknown' as well.
location_clean = df['location'].str.strip()
df['state'] = location_clean.str[-2:].where(location_clean != 'Unknown', 'Unknown')

In [10]:
# Condense the "make" and "model" columns.

# make: keep at most the first two whitespace-separated words, then trim
# surrounding whitespace and convert to title case.
make_words = df['make'].str.split()
df['make'] = make_words.str[:2].str.join(' ').str.strip().str.title()

# model: trim surrounding whitespace, convert to title case, then remove dashes.
df['model'] = df['model'].str.strip().str.title().str.replace('-', '')

# make_model: "<make> <model>" label identifying each specific airplane.
df['make_model'] = df['make'].str.cat(df['model'], sep=' ')

In [11]:
# Overwrite injury_severity with a two-valued label derived from the fatality
# count: 'fatal' when total_fatal_injuries > 0, otherwise 'non-fatal'.
has_fatality = df['total_fatal_injuries'].gt(0)
df['injury_severity'] = has_fatality.map({True: 'fatal', False: 'non-fatal'})

In [12]:
# Report the dimensions of the cleaned frame.
n_rows, n_cols = df.shape
print(f'Final df.shape -> {(n_rows, n_cols)}')

Final df.shape -> (30700, 21)


In [13]:
# Write the cleaned frame to disk. The row index is written too (the pandas
# default, made explicit here), so the file's first column holds the index.
output_csv_path = 'data/final.csv'
df.to_csv(output_csv_path, index=True)