# Phase 1 Project

![TakeOff](./images/take_off.jpg)

## 1.  Importing and loading data sets, initial inspection

In [77]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('Data\AviationData.csv', encoding='latin-1', low_memory=False)
states_df = pd.read_csv('Data\\USState_Codes.csv')

In [78]:
# code the state names and abbreviations into the main df
states_dict = {}
for index, row in states_df.iterrows():
    states_dict[row[1]] = row[0]

    
def state_lookup(abbrev):
    if abbrev in states_dict.keys():
        return states_dict[abbrev]
    else:
        return None

df['State.Name'] = df['Location']
df['State.Abbrev'] = df['Location'].map(lambda x: str(x)[-2:])
df['State.Name'] = df['State.Abbrev'].map(lambda x: state_lookup(x) if state_lookup(x) != None else x)



In [79]:
df.head(3)

Unnamed: 0,Event.Id,Investigation.Type,Accident.Number,Event.Date,Location,Country,Latitude,Longitude,Airport.Code,Airport.Name,...,Total.Fatal.Injuries,Total.Serious.Injuries,Total.Minor.Injuries,Total.Uninjured,Weather.Condition,Broad.phase.of.flight,Report.Status,Publication.Date,State.Name,State.Abbrev
0,20001218X45444,Accident,SEA87LA080,1948-10-24,"MOOSE CREEK, ID",United States,,,,,...,2.0,0.0,0.0,0.0,UNK,Cruise,Probable Cause,,Idaho,ID
1,20001218X45447,Accident,LAX94LA336,1962-07-19,"BRIDGEPORT, CA",United States,,,,,...,4.0,0.0,0.0,0.0,UNK,Unknown,Probable Cause,19-09-1996,California,CA
2,20061025X01555,Accident,NYC07LA005,1974-08-30,"Saltville, VA",United States,36.922223,-81.878056,,,...,3.0,,,,IMC,Cruise,Probable Cause,26-02-2007,Virginia,VA


In [80]:
df['State.Name'].value_counts().tail()

YA    1
3Q    1
8,    1
15    1
sk    1
Name: State.Name, dtype: int64

In [81]:
# take a look at the value counts
for column in df.columns:
    print(f'Column Name: {column}')
    print(f'{df[column].value_counts()} \n')

Column Name: Event.Id
20001212X19172    3
20001214X45071    3
20001214X36116    2
20040616X00799    2
20001208X05358    2
                 ..
20001214X40843    1
20001213X27342    1
20170821X14831    1
20070214X00186    1
20010831X01848    1
Name: Event.Id, Length: 87951, dtype: int64 

Column Name: Investigation.Type
Accident    85015
Incident     3874
Name: Investigation.Type, dtype: int64 

Column Name: Accident.Number
DCA22WA167    2
ERA22LA103    2
DCA22LA201    2
DCA23WA071    2
ERA22LA119    2
             ..
CHI85LA008    1
ERA17LA281    1
ERA11LA357    1
CHI95LA099    1
NYC92FA090    1
Name: Accident.Number, Length: 88863, dtype: int64 

Column Name: Event.Date
1982-05-16    25
1984-06-30    25
2000-07-08    25
1984-08-25    24
1983-06-05    24
              ..
2004-11-26     1
2015-02-26     1
2018-11-15     1
1974-08-30     1
2011-10-31     1
Name: Event.Date, Length: 14782, dtype: int64 

Column Name: Location
ANCHORAGE, AK                         434
MIAMI, FL             

## 1. Data Cleaning

In [82]:
#Dropping these columns due to Not enough data, not enough relevance
drop_columns = ['Latitude',          
                'Longitude',         
                'Schedule',          
                'Air.carrier',       
                'Airport.Code',      
                'FAR.Description',   
                'Publication.Date',  
                'Report.Status'      
               ]
df = df.drop(columns=drop_columns)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88889 entries, 0 to 88888
Data columns (total 25 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Event.Id                88889 non-null  object 
 1   Investigation.Type      88889 non-null  object 
 2   Accident.Number         88889 non-null  object 
 3   Event.Date              88889 non-null  object 
 4   Location                88837 non-null  object 
 5   Country                 88663 non-null  object 
 6   Airport.Name            52790 non-null  object 
 7   Injury.Severity         87889 non-null  object 
 8   Aircraft.damage         85695 non-null  object 
 9   Aircraft.Category       32287 non-null  object 
 10  Registration.Number     87572 non-null  object 
 11  Make                    88826 non-null  object 
 12  Model                   88797 non-null  object 
 13  Amateur.Built           88787 non-null  object 
 14  Number.of.Engines       82805 non-null

In [83]:
#Take a look at amateur built values
df['Amateur.Built'].value_counts()


No     80312
Yes     8475
Name: Amateur.Built, dtype: int64

In [84]:
#Take a look at amateur built NaN
df['Amateur.Built'].isna().sum()

102

In [85]:
# Stakeholders will not want to purchase airplanes built by amateurs for liability reasons
# dropping 'Amateur.Built' == 'Yes' or NaN 
df = df[(df['Amateur.Built'] == 'No') & (df['Amateur.Built'].notna())]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 80312 entries, 0 to 88888
Data columns (total 25 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Event.Id                80312 non-null  object 
 1   Investigation.Type      80312 non-null  object 
 2   Accident.Number         80312 non-null  object 
 3   Event.Date              80312 non-null  object 
 4   Location                80265 non-null  object 
 5   Country                 80092 non-null  object 
 6   Airport.Name            47400 non-null  object 
 7   Injury.Severity         79313 non-null  object 
 8   Aircraft.damage         77165 non-null  object 
 9   Aircraft.Category       28721 non-null  object 
 10  Registration.Number     79101 non-null  object 
 11  Make                    80266 non-null  object 
 12  Model                   80245 non-null  object 
 13  Amateur.Built           80312 non-null  object 
 14  Number.of.Engines       74606 non-null

In [86]:
# Take a closer look at country
df['Country'].value_counts()


United States                    73906
Brazil                             367
Mexico                             348
Canada                             346
United Kingdom                     327
                                 ...  
Guernsey                             1
Eritrea                              1
Belarus                              1
St Vincent And The Grenadines        1
Wallis and Futuna                    1
Name: Country, Length: 217, dtype: int64

In [87]:
# Vast majority of this data set is from US crashes
# we shouldn't assume the model fits outside of US, removing non-US
df = df[df['Country'] == 'United States']
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73906 entries, 0 to 88888
Data columns (total 25 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Event.Id                73906 non-null  object 
 1   Investigation.Type      73906 non-null  object 
 2   Accident.Number         73906 non-null  object 
 3   Event.Date              73906 non-null  object 
 4   Location                73896 non-null  object 
 5   Country                 73906 non-null  object 
 6   Airport.Name            46285 non-null  object 
 7   Injury.Severity         73798 non-null  object 
 8   Aircraft.damage         71949 non-null  object 
 9   Aircraft.Category       24697 non-null  object 
 10  Registration.Number     73856 non-null  object 
 11  Make                    73897 non-null  object 
 12  Model                   73891 non-null  object 
 13  Amateur.Built           73906 non-null  object 
 14  Number.of.Engines       72237 non-null

In [88]:
# A lot of missing data in these catagorical columns but may be relevant data,
# so recoding the NaN as 'DATA MISSING' for now
missing_data_dict = {'Airport.Name': 'DATA MISSING',
                     'Aircraft.Category': 'DATA MISSING',
                     'Broad.phase.of.flight': 'DATA MISSING'}
df = df.fillna(missing_data_dict)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73906 entries, 0 to 88888
Data columns (total 25 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Event.Id                73906 non-null  object 
 1   Investigation.Type      73906 non-null  object 
 2   Accident.Number         73906 non-null  object 
 3   Event.Date              73906 non-null  object 
 4   Location                73896 non-null  object 
 5   Country                 73906 non-null  object 
 6   Airport.Name            73906 non-null  object 
 7   Injury.Severity         73798 non-null  object 
 8   Aircraft.damage         71949 non-null  object 
 9   Aircraft.Category       73906 non-null  object 
 10  Registration.Number     73856 non-null  object 
 11  Make                    73897 non-null  object 
 12  Model                   73891 non-null  object 
 13  Amateur.Built           73906 non-null  object 
 14  Number.of.Engines       72237 non-null

In [89]:
# Take a closer look at 'Aircraft.Catagory' due to relevance
df['Aircraft.Category'].value_counts()


DATA MISSING         49209
Airplane             21121
Helicopter            2593
Glider                 472
Balloon                227
Weight-Shift           139
Powered Parachute       82
Gyrocraft               31
Ultralight              13
WSFT                     9
Blimp                    4
Powered-Lift             3
Unknown                  2
Rocket                   1
Name: Aircraft.Category, dtype: int64

In [90]:
# Not much data outside of Airplane and Helicopter, and stakeholders want high volume
# for revenue, so dropping all other 'Aircraft.Category'
df = df[(df['Aircraft.Category'] == 'Airplane') | 
        (df['Aircraft.Category'] == 'Helicopter') | 
        (df['Aircraft.Category'] == 'DATA MISSING')]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 72923 entries, 0 to 88888
Data columns (total 25 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Event.Id                72923 non-null  object 
 1   Investigation.Type      72923 non-null  object 
 2   Accident.Number         72923 non-null  object 
 3   Event.Date              72923 non-null  object 
 4   Location                72913 non-null  object 
 5   Country                 72923 non-null  object 
 6   Airport.Name            72923 non-null  object 
 7   Injury.Severity         72815 non-null  object 
 8   Aircraft.damage         71069 non-null  object 
 9   Aircraft.Category       72923 non-null  object 
 10  Registration.Number     72873 non-null  object 
 11  Make                    72914 non-null  object 
 12  Model                   72908 non-null  object 
 13  Amateur.Built           72923 non-null  object 
 14  Number.of.Engines       71400 non-null

In [91]:
# Take a closer look at 'Broad.phase.of.flight' due to relevance
df['Broad.phase.of.flight'].value_counts()


DATA MISSING    17542
Landing         14367
Takeoff         10919
Cruise           9164
Maneuvering      7090
Approach         5762
Taxi             1858
Climb            1798
Descent          1726
Go-around        1268
Standing          902
Unknown           428
Other              99
Name: Broad.phase.of.flight, dtype: int64

In [92]:
# 'Broad.phase.of.flight' catagories look ok
df['Broad.phase.of.flight'].value_counts()

DATA MISSING    17542
Landing         14367
Takeoff         10919
Cruise           9164
Maneuvering      7090
Approach         5762
Taxi             1858
Climb            1798
Descent          1726
Go-around        1268
Standing          902
Unknown           428
Other              99
Name: Broad.phase.of.flight, dtype: int64

In [93]:
# check out the data chronologically
df['Event.Date'] = pd.to_datetime(df['Event.Date'])
df['Event.Date'].dt.year.value_counts()


1982    3245
1983    3236
1984    3190
1985    2882
1986    2644
1987    2617
1988    2494
1989    2323
1990    2278
1991    2219
1992    2052
1993    2041
1995    1993
1994    1981
1996    1900
1999    1845
1997    1839
1998    1830
2000    1798
2003    1710
2001    1676
2002    1646
2005    1589
2007    1555
2004    1551
2008    1422
2006    1416
2011    1328
2010    1300
2009    1286
2012    1282
2016    1123
2013    1116
2018    1102
2017    1101
2019    1087
2015    1084
2022    1083
2014    1070
2021    1034
2020     948
1979       2
1977       1
1948       1
1981       1
1962       1
1974       1
Name: Event.Date, dtype: int64

In [94]:
#drop the very old one-offs, questionable data
df = df[df['Event.Date'].dt.year > 1981]
df['Event.Date'].dt.year.value_counts()

1982    3245
1983    3236
1984    3190
1985    2882
1986    2644
1987    2617
1988    2494
1989    2323
1990    2278
1991    2219
1992    2052
1993    2041
1995    1993
1994    1981
1996    1900
1999    1845
1997    1839
1998    1830
2000    1798
2003    1710
2001    1676
2002    1646
2005    1589
2007    1555
2004    1551
2008    1422
2006    1416
2011    1328
2010    1300
2009    1286
2012    1282
2016    1123
2013    1116
2018    1102
2017    1101
2019    1087
2015    1084
2022    1083
2014    1070
2021    1034
2020     948
Name: Event.Date, dtype: int64

In [95]:
# Get a better idea of how much is missing now
df.isna().sum()

Event.Id                     0
Investigation.Type           0
Accident.Number              0
Event.Date                   0
Location                    10
Country                      0
Airport.Name                 0
Injury.Severity            108
Aircraft.damage           1854
Aircraft.Category            0
Registration.Number         50
Make                         9
Model                       15
Amateur.Built                0
Number.of.Engines         1522
Engine.Type               2181
Purpose.of.flight         2419
Total.Fatal.Injuries      9343
Total.Serious.Injuries    9959
Total.Minor.Injuries      9358
Total.Uninjured           4038
Weather.Condition          597
Broad.phase.of.flight        0
State.Name                   0
State.Abbrev                 0
dtype: int64

## Now look at the injuries data

In [96]:
for col in ['Total.Fatal.Injuries', 'Total.Serious.Injuries', 'Total.Minor.Injuries', 'Total.Uninjured']:
    print(f'{col} Value Counts:')
    print(df[col].value_counts())
    print(f'\n{col} Descibed:')
    print(df[col].describe())
    print('\n')


Total.Fatal.Injuries Value Counts:
0.0      50975
1.0       6124
2.0       3863
3.0       1243
4.0        837
5.0        220
6.0        132
7.0         43
8.0         32
10.0        21
9.0         15
14.0         7
11.0         6
12.0         5
18.0         3
13.0         3
17.0         3
25.0         3
82.0         2
49.0         2
20.0         2
23.0         2
34.0         2
230.0        1
29.0         1
132.0        1
135.0        1
265.0        1
16.0         1
65.0         1
88.0         1
111.0        1
19.0         1
37.0         1
78.0         1
68.0         1
153.0        1
28.0         1
110.0        1
31.0         1
44.0         1
27.0         1
70.0         1
73.0         1
43.0         1
228.0        1
64.0         1
15.0         1
21.0         1
92.0         1
156.0        1
Name: Total.Fatal.Injuries, dtype: int64

Total.Fatal.Injuries Descibed:
count    63573.000000
mean         0.424834
std          2.570359
min          0.000000
25%          0.000000
50%          0.00

In [97]:
count_am_built_yes = len(df[df['Amateur.Built'] == 'Yes'])
count_am_built_no = len(df[df['Amateur.Built'] == 'No'])

uninjured_counts = list(df.groupby('Amateur.Built')['Total.Uninjured'].sum())
uninjured_counts[0] /= count_am_built_no
uninjured_counts[1] /= count_am_built_yes
uninjured_counts

IndexError: list index out of range

# Normalized by count, uninjured is higher for non-amateur built

In [98]:
df.to_csv('Data\AviationDataReduced.csv')