# Phase 1 Project



## Importing and loading data sets, initial inspection

In [14]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns


df = pd.read_csv('Data\AviationData.csv', encoding='latin-1', low_memory=False)

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88889 entries, 0 to 88888
Data columns (total 31 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Event.Id                88889 non-null  object 
 1   Investigation.Type      88889 non-null  object 
 2   Accident.Number         88889 non-null  object 
 3   Event.Date              88889 non-null  object 
 4   Location                88837 non-null  object 
 5   Country                 88663 non-null  object 
 6   Latitude                34382 non-null  object 
 7   Longitude               34373 non-null  object 
 8   Airport.Code            50249 non-null  object 
 9   Airport.Name            52790 non-null  object 
 10  Injury.Severity         87889 non-null  object 
 11  Aircraft.damage         85695 non-null  object 
 12  Aircraft.Category       32287 non-null  object 
 13  Registration.Number     87572 non-null  object 
 14  Make                    88826 non-null

## Take a look at each column, see what stands out

In [16]:
# take a look at the value counts
for column in df.columns:
    print(column)
    print(f'{df[column].value_counts()} \n')

Event.Id
20001212X19172    3
20001214X45071    3
20220406104897    2
20020917X04070    2
20001208X07015    2
                 ..
20070615X00732    1
20070927X01456    1
20001212X19518    1
20001207X04208    1
20001211X13886    1
Name: Event.Id, Length: 87951, dtype: int64 

Investigation.Type
Accident    85015
Incident     3874
Name: Investigation.Type, dtype: int64 

Accident.Number
WPR22LA143    2
DCA22LA201    2
DCA23WA071    2
WPR23LA045    2
CEN22LA149    2
             ..
CHI83LA201    1
FTW90FA051    1
ANC83LA159    1
ATL89DEK01    1
NYC95LA168    1
Name: Accident.Number, Length: 88863, dtype: int64 

Event.Date
1984-06-30    25
2000-07-08    25
1982-05-16    25
1983-06-05    24
1983-08-05    24
              ..
2005-12-02     1
2012-11-08     1
2018-12-26     1
1987-07-01     1
1989-02-20     1
Name: Event.Date, Length: 14782, dtype: int64 

Location
ANCHORAGE, AK        434
MIAMI, FL            200
ALBUQUERQUE, NM      196
HOUSTON, TX          193
CHICAGO, IL          184
    

Landing        15428
Takeoff        12493
Cruise         10269
Maneuvering     8144
Approach        6546
Climb           2034
Taxi            1958
Descent         1887
Go-around       1353
Standing         945
Unknown          548
Other            119
Name: Broad.phase.of.flight, dtype: int64 

Report.Status
Probable Cause                                                                                                                                                                                                                                                                                                                                   61754
Foreign                                                                                                                                                                                                                                                                                                                                           1999
<br 

## Dropping these columns due to missingness and irrelevance to the analysis 

In [17]:
drop_columns = ['Latitude', 'Longitude', 'Schedule', 'Air.carrier', 'Airport.Code',
                'Airport.Name', 'FAR.Description', 'Publication.Date', 'Report.Status']
df = df.drop(columns=drop_columns)
df

Unnamed: 0,Event.Id,Investigation.Type,Accident.Number,Event.Date,Location,Country,Injury.Severity,Aircraft.damage,Aircraft.Category,Registration.Number,...,Amateur.Built,Number.of.Engines,Engine.Type,Purpose.of.flight,Total.Fatal.Injuries,Total.Serious.Injuries,Total.Minor.Injuries,Total.Uninjured,Weather.Condition,Broad.phase.of.flight
0,20001218X45444,Accident,SEA87LA080,1948-10-24,"MOOSE CREEK, ID",United States,Fatal(2),Destroyed,,NC6404,...,No,1.0,Reciprocating,Personal,2.0,0.0,0.0,0.0,UNK,Cruise
1,20001218X45447,Accident,LAX94LA336,1962-07-19,"BRIDGEPORT, CA",United States,Fatal(4),Destroyed,,N5069P,...,No,1.0,Reciprocating,Personal,4.0,0.0,0.0,0.0,UNK,Unknown
2,20061025X01555,Accident,NYC07LA005,1974-08-30,"Saltville, VA",United States,Fatal(3),Destroyed,,N5142R,...,No,1.0,Reciprocating,Personal,3.0,,,,IMC,Cruise
3,20001218X45448,Accident,LAX96LA321,1977-06-19,"EUREKA, CA",United States,Fatal(2),Destroyed,,N1168J,...,No,1.0,Reciprocating,Personal,2.0,0.0,0.0,0.0,IMC,Cruise
4,20041105X01764,Accident,CHI79FA064,1979-08-02,"Canton, OH",United States,Fatal(1),Destroyed,,N15NY,...,No,,,Personal,1.0,2.0,,0.0,VMC,Approach
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88884,20221227106491,Accident,ERA23LA093,2022-12-26,"Annapolis, MD",United States,Minor,,,N1867H,...,No,,,Personal,0.0,1.0,0.0,0.0,,
88885,20221227106494,Accident,ERA23LA095,2022-12-26,"Hampton, NH",United States,,,,N2895Z,...,No,,,,0.0,0.0,0.0,0.0,,
88886,20221227106497,Accident,WPR23LA075,2022-12-26,"Payson, AZ",United States,Non-Fatal,Substantial,Airplane,N749PJ,...,No,1.0,,Personal,0.0,0.0,0.0,1.0,VMC,
88887,20221227106498,Accident,WPR23LA076,2022-12-26,"Morgan, UT",United States,,,,N210CU,...,No,,,Personal,0.0,0.0,0.0,0.0,,


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88889 entries, 0 to 88888
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Event.Id                88889 non-null  object 
 1   Investigation.Type      88889 non-null  object 
 2   Accident.Number         88889 non-null  object 
 3   Event.Date              88889 non-null  object 
 4   Location                88837 non-null  object 
 5   Country                 88663 non-null  object 
 6   Injury.Severity         87889 non-null  object 
 7   Aircraft.damage         85695 non-null  object 
 8   Aircraft.Category       32287 non-null  object 
 9   Registration.Number     87572 non-null  object 
 10  Make                    88826 non-null  object 
 11  Model                   88797 non-null  object 
 12  Amateur.Built           88787 non-null  object 
 13  Number.of.Engines       82805 non-null  float64
 14  Engine.Type             81812 non-null

## Might want to keep these catagorical data due to potential relevance,
## recoding the NaN as 'NA'

In [19]:
# take a look at the value counts
for column in ['Aircraft.Category', 'Broad.phase.of.flight']:
    print(column)
    print(f'{df[column].value_counts()} \n')


Aircraft.Category
Airplane             27617
Helicopter            3440
Glider                 508
Balloon                231
Gyrocraft              173
Weight-Shift           161
Powered Parachute       91
Ultralight              30
Unknown                 14
WSFT                     9
Powered-Lift             5
Blimp                    4
UNK                      2
Rocket                   1
ULTR                     1
Name: Aircraft.Category, dtype: int64 

Broad.phase.of.flight
Landing        15428
Takeoff        12493
Cruise         10269
Maneuvering     8144
Approach        6546
Climb           2034
Taxi            1958
Descent         1887
Go-around       1353
Standing         945
Unknown          548
Other            119
Name: Broad.phase.of.flight, dtype: int64 



In [20]:
#re-coding missing data for these catagories as 'NA' just in case we want them for later
df = df.fillna({'Aircraft.Category': 'NA', 'Broad.phase.of.flight': 'NA'})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88889 entries, 0 to 88888
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Event.Id                88889 non-null  object 
 1   Investigation.Type      88889 non-null  object 
 2   Accident.Number         88889 non-null  object 
 3   Event.Date              88889 non-null  object 
 4   Location                88837 non-null  object 
 5   Country                 88663 non-null  object 
 6   Injury.Severity         87889 non-null  object 
 7   Aircraft.damage         85695 non-null  object 
 8   Aircraft.Category       88889 non-null  object 
 9   Registration.Number     87572 non-null  object 
 10  Make                    88826 non-null  object 
 11  Model                   88797 non-null  object 
 12  Amateur.Built           88787 non-null  object 
 13  Number.of.Engines       82805 non-null  float64
 14  Engine.Type             81812 non-null

## Get a better idea of how much is missing now

In [21]:
df.isna().sum()

Event.Id                      0
Investigation.Type            0
Accident.Number               0
Event.Date                    0
Location                     52
Country                     226
Injury.Severity            1000
Aircraft.damage            3194
Aircraft.Category             0
Registration.Number        1317
Make                         63
Model                        92
Amateur.Built               102
Number.of.Engines          6084
Engine.Type                7077
Purpose.of.flight          6192
Total.Fatal.Injuries      11401
Total.Serious.Injuries    12510
Total.Minor.Injuries      11933
Total.Uninjured            5912
Weather.Condition          4492
Broad.phase.of.flight         0
dtype: int64

## Now look at the injuries data

In [43]:
for col in ['Total.Fatal.Injuries', 'Total.Serious.Injuries', 'Total.Minor.Injuries', 'Total.Injuries']:
    print(f'{col} Value Counts:)
    print(df[col].value_counts())
    print('\n')
    print(df[col].describe())
    print('\n')
#print(df['Total.Fatal.Injuries'].value_counts())
#print(df['Total.Fatal.Injuries'].mean())
#print(df['Total.Fatal.Injuries'].median())
#print(df['Total.Fatal.Injuries'].mode())

SyntaxError: EOL while scanning string literal (<ipython-input-43-4b3f01573219>, line 2)

In [30]:
print(df['Total.Serious.Injuries'].value_counts())
print(df['Total.Serious.Injuries'].mean())
print(df['Total.Serious.Injuries'].median())
print(df['Total.Serious.Injuries'].mode())

0.0      63289
1.0       9125
2.0       2815
3.0        629
4.0        258
5.0         78
6.0         41
7.0         27
9.0         16
8.0         13
10.0        13
13.0         9
11.0         6
26.0         5
14.0         5
12.0         5
25.0         3
20.0         3
28.0         3
17.0         2
50.0         2
59.0         2
21.0         2
47.0         2
55.0         1
88.0         1
41.0         1
67.0         1
33.0         1
18.0         1
161.0        1
81.0         1
39.0         1
137.0        1
27.0         1
15.0         1
45.0         1
125.0        1
23.0         1
44.0         1
106.0        1
22.0         1
34.0         1
16.0         1
35.0         1
53.0         1
43.0         1
63.0         1
19.0         1
60.0         1
Name: Total.Serious.Injuries, dtype: int64
0.27988059545162935
0.0
0    0.0
dtype: float64


In [31]:
print(df['Total.Minor.Injuries'].value_counts())
print(df['Total.Minor.Injuries'].mean())
print(df['Total.Minor.Injuries'].median())
print(df['Total.Minor.Injuries'].mode())

0.0      61454
1.0      10320
2.0       3576
3.0        784
4.0        372
5.0        129
6.0         67
7.0         59
9.0         22
8.0         20
13.0        14
12.0        11
10.0        11
14.0        10
11.0         9
17.0         8
18.0         6
19.0         6
22.0         5
24.0         5
15.0         4
33.0         4
16.0         4
25.0         4
23.0         3
21.0         3
32.0         3
20.0         3
27.0         3
26.0         3
30.0         2
36.0         2
42.0         2
28.0         2
38.0         2
50.0         2
31.0         2
43.0         1
39.0         1
65.0         1
47.0         1
57.0         1
58.0         1
29.0         1
45.0         1
62.0         1
71.0         1
200.0        1
125.0        1
96.0         1
69.0         1
380.0        1
68.0         1
171.0        1
35.0         1
40.0         1
84.0         1
Name: Total.Minor.Injuries, dtype: int64
0.3570611778158948
0.0
0    0.0
dtype: float64
