# Wrangling

In [1]:
# deps
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
# I just cloned the full repository, 
  # for about 100 mb I get a ton of data-sets to play with
!git clone https://github.com/fivethirtyeight/data

fatal: destination path 'data' already exists and is not an empty directory.


In [3]:
df = pd.read_csv("data/airline-safety/airline-safety.csv")

In [4]:
df.head(5)

Unnamed: 0,airline,avail_seat_km_per_week,incidents_85_99,fatal_accidents_85_99,fatalities_85_99,incidents_00_14,fatal_accidents_00_14,fatalities_00_14
0,Aer Lingus,320906734,2,0,0,0,0,0
1,Aeroflot*,1197672318,76,14,128,6,1,88
2,Aerolineas Argentinas,385803648,6,0,0,1,0,0
3,Aeromexico*,596871813,3,1,64,5,0,0
4,Air Canada,1865253802,2,0,0,2,0,0


# Preprocessing

In [5]:
df.isnull().sum()

airline                   0
avail_seat_km_per_week    0
incidents_85_99           0
fatal_accidents_85_99     0
fatalities_85_99          0
incidents_00_14           0
fatal_accidents_00_14     0
fatalities_00_14          0
dtype: int64

# Analysis/Processing

In [6]:
len(df)

56

In [7]:
[5, 10] + [5, 10]

[5, 10, 5, 10]

In [8]:
len(df) * .75 * .75

31.5

In [9]:
len(df) * .75 * .25

10.5

In [10]:
len(df) * .25

14.0

In [11]:
features = ["incidents", "fatal_accidents", "fatalities"]

In [12]:
z = [list(df[feature + "_85_99"]) + list(df[feature + "_00_14"])for feature in features]

In [13]:
len(z)

3

In [14]:
len(z[1])

112

In [15]:
features = pd.DataFrame({"incidents" : z[0], "fatal_accidents" : z[1], 
                   "fatalities" : z[2]})

In [16]:
len(features)

112

In [17]:
def yearcategory(x):
    z = []
    for count, i in enumerate(x):
        if count > len(df):
            z.append(2014)
        else:
            z.append(1985)
    return(z)

In [18]:
years = yearcategory(features["fatalities"])

In [19]:
print(len(years))

112


In [20]:
features["years"] = years

In [21]:
features.head()

Unnamed: 0,incidents,fatal_accidents,fatalities,years
0,2,0,0,1985
1,76,14,128,1985
2,6,0,0,1985
3,3,1,64,1985
4,2,0,0,1985


In [22]:
target = 'years'
train, test = train_test_split(features)
trainX = train.drop(target, axis = 1)
trainy = train[target]
testX = test.drop(target, axis = 1)
testy = test[target]

In [23]:
trainX

Unnamed: 0,incidents,fatal_accidents,fatalities
77,1,0,0
50,8,3,64
12,1,0,0
29,2,0,0
94,3,0,0
...,...,...,...
102,1,1,3
16,12,6,535
48,0,0,0
101,3,0,0


In [24]:
trainy

77     2014
50     1985
12     1985
29     1985
94     2014
       ... 
102    2014
16     1985
48     1985
101    2014
44     1985
Name: years, Length: 84, dtype: int64

# Modeling

In [25]:
from sklearn.linear_model import LogisticRegression

In [26]:
model = LogisticRegression()

In [27]:
model.fit(trainX, trainy)

LogisticRegression()

In [28]:
yhat = model.predict(testX)

In [29]:
from sklearn.metrics import accuracy_score

In [30]:
accuracy_score(testy, yhat)

0.4642857142857143

# The Real Analysis

In [31]:
features

Unnamed: 0,incidents,fatal_accidents,fatalities,years
0,2,0,0,1985
1,76,14,128,1985
2,6,0,0,1985
3,3,1,64,1985
4,2,0,0,1985
...,...,...,...,...
107,14,2,109,2014
108,11,2,23,2014
109,1,0,0,2014
110,0,0,0,2014


In [32]:
features["fatal_accidents"].describe()

count    112.000000
mean       1.419643
std        2.236625
min        0.000000
25%        0.000000
50%        1.000000
75%        2.000000
max       14.000000
Name: fatal_accidents, dtype: float64

In [33]:
features["incidents"].describe()

count    112.000000
mean       5.651786
std        8.540006
min        0.000000
25%        1.750000
50%        3.000000
75%        7.000000
max       76.000000
Name: incidents, dtype: float64

# The Real Processing

In [34]:
from scipy.stats import ttest_ind

In [35]:
mask = [val > np.mean(features["fatal_accidents"]) for val in features["fatal_accidents"]]

In [36]:
mask

[False,
 True,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 True,
 False,
 True,
 False,
 True,
 False,
 False,
 True,
 False,
 False,
 True,
 True,
 False,
 True,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 False,
 False,
 True,
 True,
 False,
 True,
 False,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 True,
 False,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 True,
 True,
 True,
 False,
 False,
 False]

In [37]:
onlyabovemu = features[mask]

In [38]:
onlyabovemu

Unnamed: 0,incidents,fatal_accidents,fatalities,years
1,76,14,128,1985
5,14,4,79,1985
9,7,2,50,1985
11,21,5,101,1985
13,5,3,323,1985
16,12,6,535,1985
19,24,12,407,1985
20,8,3,282,1985
22,25,5,167,1985
24,10,3,260,1985


In [39]:
len(onlyabovemu)

35

In [40]:
ttest_ind(features["years"], onlyabovemu["years"])

Ttest_indResult(statistic=1.538691906112917, pvalue=0.12605892096661567)

In [41]:
mask = [val > np.mean(features["incidents"]) for val in features["incidents"]]

In [42]:
onlyabovemu = features[mask]

In [43]:
ttest_ind(features["years"], onlyabovemu["years"])

Ttest_indResult(statistic=1.1895326000264201, pvalue=0.23614792418482572)

In [44]:
features

Unnamed: 0,incidents,fatal_accidents,fatalities,years
0,2,0,0,1985
1,76,14,128,1985
2,6,0,0,1985
3,3,1,64,1985
4,2,0,0,1985
...,...,...,...,...
107,14,2,109,2014
108,11,2,23,2014
109,1,0,0,2014
110,0,0,0,2014


# Feature Selection

In [45]:
from sklearn.preprocessing import quantile_transform

In [46]:
features = features.drop("fatalities", axis = 1)

In [47]:
from scipy import stats

In [48]:
z_scores = stats.zscore(features)

In [49]:
z_scores

array([[-0.4295311 , -0.63757805, -0.98229949],
       [ 8.2745123 ,  5.64998407, -0.98229949],
       [ 0.04095773, -0.63757805, -0.98229949],
       [-0.31190889, -0.18846647, -0.98229949],
       [-0.4295311 , -0.63757805, -0.98229949],
       [ 0.9819354 ,  1.15886827, -0.98229949],
       [-0.4295311 , -0.18846647, -0.98229949],
       [-0.31190889, -0.63757805, -0.98229949],
       [-0.07666447, -0.63757805, -0.98229949],
       [ 0.15857994,  0.26064511, -0.98229949],
       [-0.31190889, -0.18846647, -0.98229949],
       [ 1.80529086,  1.60797985, -0.98229949],
       [-0.54715331, -0.63757805, -0.98229949],
       [-0.07666447,  0.70975669, -0.98229949],
       [-0.19428668, -0.63757805, -0.98229949],
       [-0.66477552, -0.63757805, -0.98229949],
       [ 0.74669098,  2.05709143, -0.98229949],
       [-0.4295311 , -0.18846647, -0.98229949],
       [-0.31190889, -0.18846647, -0.98229949],
       [ 2.15815748,  4.75176091, -0.98229949],
       [ 0.27620215,  0.70975669, -0.982

In [55]:
abs_z_scores = np.abs(z_scores)
filtered_entries = (abs_z_scores < 3).all(axis=1)
features = features[filtered_entries]

ValueError: Item wrong length 112 instead of 110.

In [65]:
target = 'years'
train, test = train_test_split(features)
trainX = train.drop(target, axis = 1)
trainy = train[target]
testX = test.drop(target, axis = 1)
testy = test[target]

In [66]:
model.fit(trainX, trainy)

LogisticRegression()

In [67]:
yhat = model.predict(testX)

In [68]:
accuracy_score(testy, yhat)

0.7857142857142857

# Better Model