In [277]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [278]:
plt.rcParams['figure.figsize'] = (20.0, 10.0)
plt.rcParams.update({'font.size': 22})

In [279]:
#set data frame from file
df = pd.read_csv('/Users/evarubin/Dev/thinkful/data/plane_crashes_data.csv')

In [280]:
df.head()

Unnamed: 0,month,hour,year,location,operator,route,type,aboard,fatalities,summary
0,1,12,1950,"Near Vacas, Bolivia",Military - Bolivian Air Force,Valle grande - Cochabamba,Douglas C-47,32,32,Crashed while en route in the Andes mountains ...
1,3,20,1950,"Minneapolis, Minnesota",Northwest Orient Airlines,Rochester Minn. - Minneapolis,Martin 202,13,13,"Crashed into a flag pole, well marked by red n..."
2,3,14,1950,"Llandow Airport, Cardiff, Wales",Fairflight Ltd.,Llandow - Dublin,Avro 689 Tudor 5,83,80,During the approach to Runway 28 at Llandow Ai...
3,4,23,1950,"Near Atsugi, Japan",Military - U.S. Air Force,Philippines - Japan,Douglas C-54D,35,35,Flew off its prescribed course and crashed int...
4,5,20,1950,"Lagens Air Force Base, Azores",Military - U.S. Air Force,Bermuda - England,Boeing B-29,16,16,Crashed while attempting to land after being d...


In [281]:
# What is the mean for number of people killed in plane crashes for the years 1980 to 2009
#(inclusive of both years, rounded to the nearest 100th decimal place)? 

In [282]:
mean_numbers_1980_2009 = df.loc[(df['year'] >= 1980)]

In [283]:
mean_numbers_1980_2009.mean()

month            6.567733
year          1995.346494
aboard          36.870660
fatalities      24.932607
dtype: float64

In [284]:
#For the entire time period (1950 to 2009), which month has seen the most plane crashes? *

In [285]:
df['month'].value_counts()

1     255
12    252
3     251
8     239
7     234
11    230
9     228
10    224
2     209
4     204
6     202
5     181
Name: month, dtype: int64

In [286]:
#What is the highest number of fatalities in a single crash?

In [287]:
df['fatalities'].max()

583

In [288]:
#What percentage of crashed flights have had 1 or more fatalities?

In [289]:
one_or_more_fatalities = df.loc[(df['fatalities'] > 0)]

In [290]:
one_or_more_fatalities.count()

month         2660
hour          2660
year          2660
location      2657
operator      2658
route         2279
type          2659
aboard        2660
fatalities    2660
summary       2647
dtype: int64

In [291]:
df.loc[(df['fatalities'])].count()
#could also do: df.count()

month         2709
hour          2709
year          2709
location      2709
operator      2709
route         2238
type          2709
aboard        2709
fatalities    2709
summary       2701
dtype: int64

# Get average per event or incident

In [292]:
#What is the percent of passengers (inclusive of crew)
#killed per plane crash for the entire period?
# crashes: 2709
# fatalities: 69,937
# aboard: 97,029
#(number of fatalities in plane crash / aboard)  = percentage of people who die in plane crash
# add up percentages for each crash and divide by count

In [293]:
#step 1: create another column called "avg" that calculates percentage of fatalities per crash
df['avg'] = df['fatalities']/df['aboard']

In [294]:
# step 2: Get the mean of the column
df['avg'].mean()

0.8231319223510564

In [295]:
# long way around: add avg fatalities for each crash (sum) and divide by number of crashes (count)
df['avg'].sum()/df['avg'].count()

0.8231319223510564

# Check for outliers

In [296]:
#find outliers
df.describe()

Unnamed: 0,month,year,aboard,fatalities,avg
count,2709.0,2709.0,2709.0,2709.0,2708.0
mean,6.554079,1981.927279,35.817276,25.816537,0.823132
std,3.532468,16.709136,54.306779,42.599461,0.308131
min,1.0,1950.0,0.0,0.0,0.0
25%,3.0,1968.0,5.0,3.0,0.75
50%,7.0,1983.0,15.0,9.0,1.0
75%,10.0,1997.0,44.0,29.0,1.0
max,12.0,2009.0,644.0,583.0,1.0


In [297]:
#continuing to check for outliers
df['fatalities'].value_counts()

2      315
3      247
4      179
1      162
5      134
6      108
10      68
7       64
8       63
9       54
11      50
0       49
13      43
18      42
16      40
21      38
14      37
20      36
23      35
17      34
12      31
15      28
25      28
19      26
24      25
28      24
30      23
26      23
29      22
34      22
      ... 
171      1
181      1
187      1
189      1
191      1
213      1
259      1
261      1
120      1
520      1
140      1
144      1
146      1
158      1
166      1
170      1
178      1
180      1
188      1
196      1
200      1
228      1
230      1
234      1
256      1
217      1
264      1
290      1
346      1
583      1
Name: fatalities, Length: 189, dtype: int64

# Remove Outliers using std

In [298]:
# remove all rows of the dataframe if the value 
# in the column <feature1> lies too far (!) from corresponding median (the median computed for the column <feature1>)
#filtering_rule_1  = (df.fatalities.median() - df.fatalities).abs( ) > 3 
filtered_df = df[df['fatalities'] > df['fatalities'].mean() + 3 * df['fatalities'].std()]

# get fatalities without outliers (further than 3 standard devs away from mean)
filtered_df

Unnamed: 0,month,hour,year,location,operator,route,type,aboard,fatalities,summary,avg
690,5,16,1968,"Kham Duc, Vietnam",Military - U.S. Air Force,,Lockheed C-130B Hercules,155,155,Shot down by enemy fire while attempting to ev...,1.000000
842,7,14,1971,"Near Morioko, Japan",All Nippon Airways / Japanese Air Force,Sapporo - Tokyo,Boeing B-727-281 / Air Force F86F,164,163,A Japan Air Self Defense Force F-86F fighter c...,0.993902
894,8,17,1972,"Near Konigs Wusterausen, East Germany",Interflug,Berlin - Birgas,Ilyushin IL-62,156,156,"In cargo bay 6, hot air, leaking from an air c...",1.000000
901,10,21,1972,"Near Krasnaya Polyana, USSR",Aeroflot,Leningrad - Moscow,Ilyushin IL-62,174,174,The aircraft crashed at the outer marker while...,1.000000
910,12,6,1972,"Tenerife, Canary Islands",Spantax,Tenerife - Munchen,Convair CV-990-30A-5 Coronado,155,155,Reached a height of 300 feet when the plane s...,1.000000
920,1,9,1973,"Kano, Nigeria",Alia Royal Jordanian Airlines,Jeddah - Lagos,Boeing B-707-3D3C,202,176,The landing gear collapsed after hitting a dep...,0.871287
978,3,11,1974,"Near Ermenonville, France",Turkish Airlines (THY),Paris - London,McDonnell Douglas DC-10-10,346,346,The aircraft crashed shortly after takeoff fro...,1.000000
1013,12,22,1974,"Near Maskeliya, Sri Lanka",Martinair Holland NV,Surabaya - Jeddah,McDonnell Douglas DC-8-55F,191,191,The aircraft impacted high ground during appro...,1.000000
1028,4,16,1975,"Saigon, Vietnam",Military - U.S. Air Force,Tan Son AFB - Clark AFB,Lockheed C-5A Galaxy,330,155,The aircraft reported it was returning to Tan ...,0.469697
1038,8,4,1975,"Near Immouzer, Morocco",Alia Royal Jordanian Airlines,Paris - Agadir,Boeing B-707-321C,188,188,The aircraft was on approach when the right wi...,1.000000


In [299]:
filtered_df.describe()

Unnamed: 0,month,year,aboard,fatalities,avg
count,63.0,63.0,63.0,63.0,63.0
mean,7.555556,1987.68254,222.460317,215.396825,0.975045
std,3.1458,10.896619,85.194431,80.000109,0.079618
min,1.0,1968.0,154.0,154.0,0.469697
25%,5.5,1979.0,167.0,164.5,0.996951
50%,8.0,1987.0,191.0,183.0,1.0
75%,10.0,1996.0,258.0,245.0,1.0
max,12.0,2009.0,644.0,583.0,1.0
