# Aviation Exploration [placeholder title]

## Goals

In [106]:
# All imports
# reminder to add ignore warning code at the end

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

### Data

In [107]:
# Open data file

# Had to specify encoding='latin-1' for the file to load.
# low_memory=False makes pandas parse the file in a single pass instead of
# in chunks, so each column's dtype is inferred from all of its values
# (avoids mixed-type inference between chunks).

ad = pd.read_csv('Data/AviationData.csv', encoding='latin-1', low_memory=False)

In [108]:
# Get initial info: number of entries, column names, non-null counts, dtypes

ad.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88889 entries, 0 to 88888
Data columns (total 31 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Event.Id                88889 non-null  object 
 1   Investigation.Type      88889 non-null  object 
 2   Accident.Number         88889 non-null  object 
 3   Event.Date              88889 non-null  object 
 4   Location                88837 non-null  object 
 5   Country                 88663 non-null  object 
 6   Latitude                34382 non-null  object 
 7   Longitude               34373 non-null  object 
 8   Airport.Code            50249 non-null  object 
 9   Airport.Name            52790 non-null  object 
 10  Injury.Severity         87889 non-null  object 
 11  Aircraft.damage         85695 non-null  object 
 12  Aircraft.Category       32287 non-null  object 
 13  Registration.Number     87572 non-null  object 
 14  Make                    88826 non-null

In [109]:
# Peek at 10 random rows. random_state is fixed so the same rows are shown
# on every Restart & Run All, keeping the notes below tied to a stable sample.
ad.sample(n=10, random_state=42)

Unnamed: 0,Event.Id,Investigation.Type,Accident.Number,Event.Date,Location,Country,Latitude,Longitude,Airport.Code,Airport.Name,...,Purpose.of.flight,Air.carrier,Total.Fatal.Injuries,Total.Serious.Injuries,Total.Minor.Injuries,Total.Uninjured,Weather.Condition,Broad.phase.of.flight,Report.Status,Publication.Date
69698,20110321X93124,Accident,WPR11LA175,2011-03-21,"Spanish Fork, UT",United States,040840N,1113956W,U77,Spanish Fork,...,Instructional,Amehigh LLC,0.0,0.0,1.0,0.0,VMC,,The student pilot's inadequate compensation fo...,25-09-2020
11725,20001214X36368,Accident,ATL85LA172,1985-05-21,"MILLEDGEVILLE, GA",United States,,,MLJ,BALDWIN CO.,...,Personal,,0.0,0.0,0.0,1.0,VMC,Takeoff,Probable Cause,
81741,20180616X95948,Accident,CEN18LA228,2018-06-16,"Greenwood, IL",United States,422410N,0882230W,10C,Galt Field Airport,...,Personal,Pilot,0.0,2.0,0.0,0.0,VMC,,The pilot's decision to take off with insuffic...,25-09-2020
39855,20001208X06000,Accident,FTW96TA261,1996-06-18,"PAGOSA SPRINGS, CO",United States,,,,,...,Public Aircraft,,0.0,0.0,1.0,0.0,VMC,Maneuvering,Probable Cause,31-03-1998
62,20020917X02247,Accident,LAX82DVG13,1982-01-09,"CALISTOGA, CA",United States,,,,CALISTOGA,...,Personal,,0.0,0.0,0.0,1.0,VMC,Landing,Probable Cause,09-01-1983
50836,20010628X01278,Accident,MIA01LA170,2001-06-23,"LABELLE, FL",United States,,,,,...,Personal,,2.0,,,,VMC,Approach,Probable Cause,25-04-2002
87341,20220210104623,Accident,GAA22WA097,2022-01-19,"Florianopolis, OF",Brazil,272536S,0048288W,,,...,,,0.0,3.0,0.0,0.0,,,,09-08-2022
74657,20140114X32510,Accident,WPR14LA096,2014-01-14,"Pismo Beach, CA",United States,035620N,1203927W,,,...,Personal,,2.0,0.0,0.0,0.0,VMC,,The in-flight separation of a wing for reasons...,25-09-2020
67192,20090918X20230,Accident,ANC09CA101,2009-09-17,"Tok, AK",United States,063190N,1425722W,,,...,Personal,KIM B WRIGHT,0.0,0.0,0.0,1.0,VMC,,The pilot's selection of an unsuitable landing...,25-09-2020
78639,20160721X13557,Accident,CEN16FA278,2016-07-21,"Fairmont, OK",United States,362220N,0973939W,,,...,Personal,,2.0,0.0,0.0,0.0,VMC,,The pilot's delay in recovering from an aeroba...,25-09-2020


Initial observations:

- 88889 rows (index 0 to 88888) and 31 columns
- 5 floats and 26 objects
- Column specifics denoted by a '.' followed by specification ('Airport.Code' & 'Airport.Name')
- Columns that may need to be adjusted to different dtype
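    - Accident.Number -> int or float (note: sampled values such as 'WPR11LA175' are alphanumeric, so this column may need to stay a string)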
    - Event.Date -> date type
    - Publication.Date -> date type
- Columns that will need capitalization adjusted
    - Location
    - Air.carrier
    - 

In [154]:
# Assess the missingness: number of NaN values in each column
# NOTE(review): this cell's execution count is later than the cleaning cells
# further down, so the displayed counts appear to reflect the frame after
# rows were dropped / values filled -- re-run top to bottom to confirm.

raw_na = ad.isna().sum()
raw_na

Event.Id                      0
Investigation.Type            0
Accident.Number               0
Event.Date                    0
Location                      0
Country                       0
Latitude                  53338
Longitude                 53347
Airport.Code              37496
Airport.Name              34991
Injury.Severity               0
Aircraft.damage            2639
Aircraft.Category         56103
Registration.Number        1178
Make                          0
Model                         0
Amateur.Built                 0
Number.of.Engines          5212
Engine.Type                6136
FAR.Description           56309
Schedule                  75464
Purpose.of.flight             0
Air.carrier               71390
Total.Fatal.Injuries      11299
Total.Serious.Injuries    12378
Total.Minor.Injuries      11797
Total.Uninjured            5813
Weather.Condition          3480
Broad.phase.of.flight     25991
Report.Status              5442
Publication.Date          13415
dtype: i

In [157]:
# Assess missingness by percentage

# NaN count per column divided by the number of rows, scaled to percent.
# (The original line had an unbalanced ')' and raised a SyntaxError.)
perc_na = (ad.isna().sum() / len(ad)) * 100
perc_na

Event.Id                   0.000000
Investigation.Type         0.000000
Accident.Number            0.000000
Event.Date                 0.000000
Location                   0.000000
Country                    0.000000
Latitude                  61.008613
Longitude                 61.018907
Airport.Code              42.888353
Airport.Name              40.023105
Injury.Severity            0.000000
Aircraft.damage            3.018518
Aircraft.Category         64.171251
Registration.Number        1.347410
Make                       0.000000
Model                      0.000000
Amateur.Built              0.000000
Number.of.Engines          5.961545
Engine.Type                7.018427
FAR.Description           64.406877
Schedule                  86.316584
Purpose.of.flight          0.000000
Air.carrier               81.656696
Total.Fatal.Injuries      12.923925
Total.Serious.Injuries    14.158098
Total.Minor.Injuries      13.493543
Total.Uninjured            6.648976
Weather.Condition          3

#### Initial thoughts
- Might be able to create subset of injuries into one column
Total.Fatal.Injuries      11401
Total.Serious.Injuries    12510
Total.Minor.Injuries      11933
- Drop missing rows from
    - Location
    - Country
    - Make
    - Model
    - Amateur.Built
    - Injury.Severity
- Columns to drop that do not contribute to insights regarding aircraft
    - Latitude
    - Longitude
    - Airport.Code & Airport.Name
        - Could include in Next Steps (which airports to focus on and pull separate data on that)
    - Schedule
    - Purpose.of.flight (is this something that can be helpful for next stage of business development?)
- Create subset analysis
    - Weather.Condition (how do aircraft weather harsh conditions?)

#### Explore columns before deciding how to handle
- Aircraft.Category
- Registration.Number
- Number.of.Engines
- Engine.Type
- FAR.Description
- Schedule
- Air.carrier
- Broad.phase.of.flight
- Report.Status
- Publication.Date (vs Event.Date which has 0 na)

In [111]:
# Drop rows that are missing any of the key descriptive fields.
# These columns have only a few missing values, so dropping them
# will not impact the overall analysis.
# The result is assigned back instead of using inplace=True, so the step is
# an explicit rebinding of `ad` rather than a hidden mutation.

required_cols = ['Location', 'Country', 'Make', 'Model', 'Amateur.Built', 'Injury.Severity']
ad = ad.dropna(subset=required_cols)
ad.isna().sum()

Event.Id                      0
Investigation.Type            0
Accident.Number               0
Event.Date                    0
Location                      0
Country                       0
Latitude                  53338
Longitude                 53347
Airport.Code              37496
Airport.Name              34991
Injury.Severity               0
Aircraft.damage            2639
Aircraft.Category         56103
Registration.Number        1178
Make                          0
Model                         0
Amateur.Built                 0
Number.of.Engines          5212
Engine.Type                6136
FAR.Description           56309
Schedule                  75464
Purpose.of.flight          5132
Air.carrier               71390
Total.Fatal.Injuries      11299
Total.Serious.Injuries    12378
Total.Minor.Injuries      11797
Total.Uninjured            5813
Weather.Condition          3480
Broad.phase.of.flight     25991
Report.Status              5442
Publication.Date          13415
dtype: i

In [112]:
# (rows, columns) of the working frame
ad.shape

(87427, 31)

In [113]:
# Preview the first five rows of the working frame
ad.head()

Unnamed: 0,Event.Id,Investigation.Type,Accident.Number,Event.Date,Location,Country,Latitude,Longitude,Airport.Code,Airport.Name,...,Purpose.of.flight,Air.carrier,Total.Fatal.Injuries,Total.Serious.Injuries,Total.Minor.Injuries,Total.Uninjured,Weather.Condition,Broad.phase.of.flight,Report.Status,Publication.Date
0,20001218X45444,Accident,SEA87LA080,1948-10-24,"MOOSE CREEK, ID",United States,,,,,...,Personal,,2.0,0.0,0.0,0.0,UNK,Cruise,Probable Cause,
1,20001218X45447,Accident,LAX94LA336,1962-07-19,"BRIDGEPORT, CA",United States,,,,,...,Personal,,4.0,0.0,0.0,0.0,UNK,Unknown,Probable Cause,19-09-1996
2,20061025X01555,Accident,NYC07LA005,1974-08-30,"Saltville, VA",United States,36.922223,-81.878056,,,...,Personal,,3.0,,,,IMC,Cruise,Probable Cause,26-02-2007
3,20001218X45448,Accident,LAX96LA321,1977-06-19,"EUREKA, CA",United States,,,,,...,Personal,,2.0,0.0,0.0,0.0,IMC,Cruise,Probable Cause,12-09-2000
4,20041105X01764,Accident,CHI79FA064,1979-08-02,"Canton, OH",United States,,,,,...,Personal,,1.0,2.0,,0.0,VMC,Approach,Probable Cause,16-04-1980


#### Exploration of Column Values

In [114]:
# Relevant to analysis --> Keep

"""
An accident is when damage is done to an aircraft or person, whereas
an incident is an event that could have caused an accident.
"""

ad['Investigation.Type'].value_counts()

Accident    84313
Incident     3114
Name: Investigation.Type, dtype: int64

In [115]:
# Not relevant to analysis --> Drop
# Frequency of each code in the Schedule column (NaN is excluded by value_counts)

ad['Schedule'].value_counts()

NSCH    4314
UNK     4082
SCHD    3567
Name: Schedule, dtype: int64

In [116]:
# Lowercase all letters: the same manufacturer shows up under different
# capitalizations (e.g. 'Cessna' and 'CESSNA'), which splits its counts

ad['Make'].value_counts()

Cessna              22152
Piper               11987
CESSNA               4838
Beech                4313
PIPER                2813
                    ...  
Brault                  1
Baldwin                 1
Kirchner                1
1977 Colfer-chan        1
ROYSE RALPH L           1
Name: Make, Length: 8192, dtype: int64

In [117]:
# Can impute NaN as 'Unknown' & adjust capitalization --> Keep

ad['Purpose.of.flight'].value_counts()

Personal                     49317
Instructional                10575
Unknown                       6644
Aerial Application            4709
Business                      3996
Positioning                   1626
Other Work Use                1259
Ferry                          804
Aerial Observation             784
Public Aircraft                717
Executive/corporate            546
Flight Test                    398
Skydiving                      182
External Load                  123
Public Aircraft - Federal      104
Banner Tow                     101
Air Race show                   99
Public Aircraft - Local         74
Public Aircraft - State         63
Air Race/show                   59
Glider Tow                      53
Firefighting                    40
Air Drop                        11
ASHO                             6
PUBS                             4
PUBL                             1
Name: Purpose.of.flight, dtype: int64

In [159]:
# Can impute NaN as 'Unknown' & adjust capitalization --> Keep

ad['Aircraft.damage'].value_counts()

Substantial    63763
Destroyed      18397
Minor           2536
Unknown           92
Name: Aircraft.damage, dtype: int64

#### Adjust dtype, Capitalization, and Impute NaN as a Separate Category

In [163]:
# Give missing values their own 'Unknown' category.
# The filled columns are assigned back to the frame. Calling
# fillna(inplace=True) on a column selection (ad[col].fillna(...)) is a
# chained in-place update: it is deprecated in recent pandas and does not
# modify the parent frame when Copy-on-Write is enabled.

unknown_fill_cols = ['Purpose.of.flight', 'Aircraft.damage']
ad[unknown_fill_cols] = ad[unknown_fill_cols].fillna('Unknown')

# Both counts should print 0 once the fill has been applied
print(ad['Purpose.of.flight'].isna().sum())
print(ad['Aircraft.damage'].isna().sum())

0
0


In [120]:
# Adjust capitalization to lowercase

# Create function to lowercase selected columns

def lowercase(df):
    """Return a copy of ``df`` with every text column converted to lowercase.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose string columns should be lowercased, typically a
        column subset such as ``ad[['Make', 'Model']]``.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns. Text columns are
        lowercased (missing values stay missing); non-text columns are
        returned unchanged instead of raising ``AttributeError`` from the
        ``.str`` accessor. The input frame is not modified.
    """
    def _lower_if_text(column):
        # Only object/string columns expose the .str accessor.
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            return column.str.lower()
        return column

    return df.apply(_lower_if_text)

# Apply the lowercase helper to the relevant text columns of the working
# frame, then display Make to check the result

cols_to_lower = ['Purpose.of.flight', 'Make']
ad[cols_to_lower] = lowercase(ad[cols_to_lower])
ad['Make']

0                            stinson
1                              piper
2                             cessna
3                           rockwell
4                             cessna
                    ...             
88882    grumman american avn. corp.
88883                    air tractor
88884                          piper
88886     american champion aircraft
88888                          piper
Name: Make, Length: 87427, dtype: object

In [122]:
# NOTE(review): this repeats the display at the end of the lowercase cell
# above -- candidate for removal.
ad['Make']

0                            stinson
1                              piper
2                             cessna
3                           rockwell
4                             cessna
                    ...             
88882    grumman american avn. corp.
88883                    air tractor
88884                          piper
88886     american champion aircraft
88888                          piper
Name: Make, Length: 87427, dtype: object

#### Drop Irrelevant Columns

In [123]:
# Create copy of original data for column dropping

"""
These columns do not add value to the analysis and are missing a lot of data
    - Latitude
    - Longitude
    - Airport.Code
    - Airport.Name
    - Schedule

Could include in Next Steps (which airports to focus on and pull separate data on that)
"""

# NOTE(review): Latitude and Longitude are listed above but are not in the
# drop list below -- confirm whether they are dropped elsewhere or were
# left out by mistake.
# drop() returns a new DataFrame, so `ad` itself keeps all of its columns.
ad_1 = ad.drop(['Airport.Code', 'Airport.Name', 'Schedule'],axis=1)