## Using categorical values to predict Aircraft Damage

In [1]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
sb.set() # set the default Seaborn style for graphics

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
aviation = pd.read_csv('../Data/AviationFinal.csv')
aviation

Unnamed: 0,EventID,WeatherCondition,BroadPhaseOfFlight,AircraftDamage,InvestigationType,PurposeOfFlight,EngineType,TotalFatalInjuries,TotalSeriousInjuries,TotalMinorInjuries,TotalUninjured,TotalInjuries,EventDate,Month,Year
0,20161117X64217,UNK,MANEUVERING,Substantial,Accident,Unknown,Turbo Shaft,0.0,0.0,0.0,1.0,0.0,2016-11-14,11,2016
1,20161116X13203,VMC,LANDING,Substantial,Accident,Unknown,Unknown,0.0,0.0,0.0,2.0,0.0,2016-11-14,11,2016
2,20161116X62135,VMC,TAKEOFF,Substantial,Accident,Personal,Reciprocating,0.0,0.0,0.0,1.0,0.0,2016-11-13,11,2016
3,20161114X41943,VMC,MANEUVERING,Substantial,Accident,Instructional,Reciprocating,0.0,0.0,0.0,2.0,0.0,2016-12-11,12,2016
4,20161116X04207,VMC,UNKNOWN,Substantial,Accident,Personal,Reciprocating,0.0,1.0,0.0,1.0,1.0,2016-12-11,12,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79136,20041105X01764,VMC,APPROACH,Destroyed,Accident,Personal,Unknown,1.0,2.0,0.0,0.0,3.0,1979-02-08,2,1979
79137,20001218X45448,IMC,CRUISE,Destroyed,Accident,Personal,Reciprocating,2.0,0.0,0.0,0.0,2.0,1977-06-19,6,1977
79138,20061025X01555,IMC,CRUISE,Destroyed,Accident,Personal,Reciprocating,3.0,0.0,0.0,0.0,3.0,1974-08-30,8,1974
79139,20001218X45447,UNK,UNKNOWN,Destroyed,Accident,Personal,Reciprocating,4.0,0.0,0.0,0.0,4.0,1962-07-19,7,1962


### Splitting data

In [3]:
pd.set_option('display.max_columns', None)  # Unlimited columns.
pd.options.mode.use_inf_as_na = True  

aircraftDamage = pd.DataFrame(aviation['AircraftDamage'])
aviationcategorical = pd.DataFrame(aviation[['Month', 'EngineType', 'WeatherCondition', "BroadPhaseOfFlight", "PurposeOfFlight"]])


X_train_original, X_test_original, y_train_original, y_test_original = train_test_split(aviationcategorical, aircraftDamage, test_size = 0.2)
y_train_original.head()

Unnamed: 0,AircraftDamage
25651,Substantial
33597,Substantial
14919,Substantial
28349,Substantial
5794,Substantial


### See how many categories AircraftDamage has
> More than half is just one category - Substantial

In [4]:
pd.value_counts(y_train_original['AircraftDamage'], normalize = True)

Substantial    0.719548
Destroyed      0.218300
Minor          0.031590
Unknown        0.030563
Name: AircraftDamage, dtype: float64

### Majority class prediction
>It is to gauge how good our future prediction scores should be. Gives baseline
that we want to cross with our next model

In [5]:
y_pred = ['functional'] * len(X_test_original)
y_pred = pd.DataFrame(data = y_pred,
                      index = X_test_original.index.values,
                      columns = ['AircraftDamage'])
y_pred.head()

Unnamed: 0,AircraftDamage
3561,functional
27193,functional
56613,functional
21574,functional
51906,functional


### Predict with just the numerical features

In [6]:
X_train_numerical = X_train_original.select_dtypes(
                         include = np.number).copy()

X_train_numerical.head()

Unnamed: 0,Month
25651,9
33597,1
14919,3
28349,6
5794,2


In [7]:
X_train_numerical_indices = X_train_numerical.index.values
y_train_numerical = y_train_original[y_train_original.index.isin(X_train_numerical_indices)]

### Logistic Regression

In [8]:
cv_score = cross_val_score(LogisticRegression(), 
                            X_train_numerical, y_train_numerical,
                            scoring = 'accuracy',
                            cv = 3,
                            n_jobs = -1,
                            verbose = 1)
cv_score

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    4.5s finished


array([0.71954513, 0.71953184, 0.71956594])

### Decision Tree

In [9]:
clf = DecisionTreeClassifier()
cv_score = cross_val_score(clf, 
                            X_train_numerical, y_train_numerical,
                            scoring = 'accuracy',
                            cv = 3,
                            n_jobs = -1,
                            verbose = 1)
cv_score

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    3.0s finished


array([0.71954513, 0.71953184, 0.71956594])

### Decision Tree Classifier

In [10]:
X_train_original.isnull().sum()
X_non_nulls = X_train_original.dropna(axis = 1)
X_non_nulls.nunique().sort_values(ascending = True)

X_selected = X_non_nulls.loc[:, X_non_nulls.nunique().sort_values()< 50]
cat_cols = list(X_selected.select_dtypes(['object']).columns.values)
X_categorical = X_selected[cat_cols].apply(lambda x: x.astype('category').cat.codes)
X_train_selected = X_train_numerical.join(X_categorical)
clf = DecisionTreeClassifier()
cv_score = cross_val_score(clf, 
                            X_train_selected, y_train_original,
                            scoring = 'accuracy',
                            cv = 3,
                            n_jobs = -1,
                            verbose = 1)
cv_score

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.6s finished


array([0.73669747, 0.74128127, 0.73913662])

### Random Forest Classifier

In [17]:
X_train, X_test, y_train, y_test = train_test_split(
    X_train_selected, y_train_original, test_size=0.2)

clf = RandomForestClassifier()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

  """


array([0.16221942, 0.22061433, 0.24641926, 0.37074699])

#### Search for the best Random Forest Classifier using a grid search

In [12]:
param_grid = {
    'n_estimators': [10, 20, 30],
    'max_depth': [6, 10, 20, 30]
}
gridsearch = GridSearchCV(RandomForestClassifier(n_jobs = -1), 
                          param_grid=param_grid, 
                          scoring='accuracy', cv=3, 
                          return_train_score=True, verbose=10)
grid_result = gridsearch.fit(X_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] max_depth=6, n_estimators=10 ....................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=6, n_estimators=10, score=(train=0.747, test=0.746), total=   0.4s
[CV] max_depth=6, n_estimators=10 ....................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s
  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=6, n_estimators=10, score=(train=0.749, test=0.744), total=   0.3s
[CV] max_depth=6, n_estimators=10 ....................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.9s remaining:    0.0s
  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=6, n_estimators=10, score=(train=0.747, test=0.746), total=   0.3s
[CV] max_depth=6, n_estimators=20 ....................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.3s remaining:    0.0s
  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=6, n_estimators=20, score=(train=0.748, test=0.746), total=   0.3s
[CV] max_depth=6, n_estimators=20 ....................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    1.8s remaining:    0.0s
  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=6, n_estimators=20, score=(train=0.748, test=0.744), total=   0.4s
[CV] max_depth=6, n_estimators=20 ....................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.3s remaining:    0.0s
  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=6, n_estimators=20, score=(train=0.747, test=0.746), total=   0.4s
[CV] max_depth=6, n_estimators=30 ....................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    2.8s remaining:    0.0s
  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=6, n_estimators=30, score=(train=0.748, test=0.747), total=   0.5s
[CV] max_depth=6, n_estimators=30 ....................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    3.5s remaining:    0.0s
  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=6, n_estimators=30, score=(train=0.749, test=0.744), total=   0.4s
[CV] max_depth=6, n_estimators=30 ....................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    4.1s remaining:    0.0s
  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=6, n_estimators=30, score=(train=0.748, test=0.746), total=   0.4s
[CV] max_depth=10, n_estimators=10 ...................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    4.7s remaining:    0.0s
  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=10, n_estimators=10, score=(train=0.758, test=0.743), total=   0.3s
[CV] max_depth=10, n_estimators=10 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=10, n_estimators=10, score=(train=0.758, test=0.741), total=   0.3s
[CV] max_depth=10, n_estimators=10 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=10, n_estimators=10, score=(train=0.757, test=0.745), total=   0.3s
[CV] max_depth=10, n_estimators=20 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=10, n_estimators=20, score=(train=0.759, test=0.742), total=   0.4s
[CV] max_depth=10, n_estimators=20 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=10, n_estimators=20, score=(train=0.758, test=0.743), total=   0.4s
[CV] max_depth=10, n_estimators=20 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=10, n_estimators=20, score=(train=0.758, test=0.744), total=   0.4s
[CV] max_depth=10, n_estimators=30 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=10, n_estimators=30, score=(train=0.759, test=0.742), total=   0.5s
[CV] max_depth=10, n_estimators=30 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=10, n_estimators=30, score=(train=0.758, test=0.743), total=   0.6s
[CV] max_depth=10, n_estimators=30 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=10, n_estimators=30, score=(train=0.758, test=0.745), total=   0.6s
[CV] max_depth=20, n_estimators=10 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=20, n_estimators=10, score=(train=0.763, test=0.737), total=   0.3s
[CV] max_depth=20, n_estimators=10 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=20, n_estimators=10, score=(train=0.762, test=0.738), total=   0.3s
[CV] max_depth=20, n_estimators=10 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=20, n_estimators=10, score=(train=0.762, test=0.738), total=   0.3s
[CV] max_depth=20, n_estimators=20 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=20, n_estimators=20, score=(train=0.763, test=0.739), total=   0.4s
[CV] max_depth=20, n_estimators=20 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=20, n_estimators=20, score=(train=0.762, test=0.738), total=   0.4s
[CV] max_depth=20, n_estimators=20 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=20, n_estimators=20, score=(train=0.763, test=0.739), total=   0.4s
[CV] max_depth=20, n_estimators=30 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=20, n_estimators=30, score=(train=0.764, test=0.739), total=   0.6s
[CV] max_depth=20, n_estimators=30 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=20, n_estimators=30, score=(train=0.763, test=0.738), total=   0.5s
[CV] max_depth=20, n_estimators=30 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=20, n_estimators=30, score=(train=0.762, test=0.740), total=   0.5s
[CV] max_depth=30, n_estimators=10 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=30, n_estimators=10, score=(train=0.763, test=0.737), total=   0.3s
[CV] max_depth=30, n_estimators=10 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=30, n_estimators=10, score=(train=0.762, test=0.738), total=   0.3s
[CV] max_depth=30, n_estimators=10 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=30, n_estimators=10, score=(train=0.762, test=0.738), total=   0.3s
[CV] max_depth=30, n_estimators=20 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=30, n_estimators=20, score=(train=0.764, test=0.739), total=   0.4s
[CV] max_depth=30, n_estimators=20 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=30, n_estimators=20, score=(train=0.762, test=0.739), total=   0.4s
[CV] max_depth=30, n_estimators=20 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=30, n_estimators=20, score=(train=0.762, test=0.741), total=   0.4s
[CV] max_depth=30, n_estimators=30 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=30, n_estimators=30, score=(train=0.764, test=0.739), total=   0.6s
[CV] max_depth=30, n_estimators=30 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=30, n_estimators=30, score=(train=0.763, test=0.737), total=   0.5s
[CV] max_depth=30, n_estimators=30 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=30, n_estimators=30, score=(train=0.762, test=0.741), total=   0.5s


[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:   20.3s finished
  self.best_estimator_.fit(X, y, **fit_params)


{'max_depth': 6, 'n_estimators': 30}


### Shows Scores of different parameters
> Highest mean test score is 0.744

In [13]:
pd.DataFrame(gridsearch.cv_results_).sort_values( \
                                         by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
2,0.327124,0.019151,0.128325,0.003389,6,30,"{'max_depth': 6, 'n_estimators': 30}",0.746817,0.743944,0.746461,0.74574,0.001279,1,0.747631,0.748712,0.747631,0.747991,0.00051
0,0.162899,0.049896,0.131979,0.007566,6,10,"{'max_depth': 6, 'n_estimators': 10}",0.746224,0.744299,0.745987,0.745503,0.000857,2,0.746712,0.748564,0.746742,0.74734,0.000866
1,0.21476,0.017958,0.130984,0.002486,6,20,"{'max_depth': 6, 'n_estimators': 20}",0.745751,0.744299,0.745809,0.745286,0.000698,3,0.74769,0.748208,0.746891,0.747596,0.000542
5,0.43517,0.042347,0.130983,0.004701,10,30,"{'max_depth': 10, 'n_estimators': 30}",0.742316,0.743292,0.745157,0.743588,0.001179,4,0.7593,0.758396,0.758085,0.758593,0.000515
4,0.27327,0.004887,0.131314,0.00188,10,20,"{'max_depth': 10, 'n_estimators': 20}",0.74202,0.743055,0.744446,0.743174,0.000994,5,0.7593,0.758337,0.758055,0.758564,0.000533
3,0.13198,0.005722,0.13198,0.005299,10,10,"{'max_depth': 10, 'n_estimators': 10}",0.743382,0.740567,0.744624,0.742858,0.001697,6,0.757849,0.757893,0.757492,0.757744,0.000179
10,0.254984,0.010597,0.130318,0.000469,30,20,"{'max_depth': 30, 'n_estimators': 20}",0.738525,0.738909,0.741425,0.73962,0.001286,7,0.763683,0.762365,0.762467,0.762838,0.000599
8,0.412232,0.014666,0.133975,0.00418,20,30,"{'max_depth': 20, 'n_estimators': 30}",0.738821,0.738435,0.739648,0.738968,0.000506,8,0.763772,0.762542,0.762497,0.762937,0.000591
11,0.421871,0.016225,0.13165,0.001408,30,30,"{'max_depth': 30, 'n_estimators': 30}",0.738821,0.73731,0.740655,0.738929,0.001368,9,0.763713,0.762631,0.762467,0.762937,0.000553
7,0.263628,0.01324,0.132323,0.004024,20,20,"{'max_depth': 20, 'n_estimators': 20}",0.738762,0.737665,0.739293,0.738573,0.000678,10,0.763417,0.762394,0.762527,0.762779,0.000454


In [16]:
print (gridsearch.best_params_)
importances = gridsearch.feature_importances_
importances

{'max_depth': 6, 'n_estimators': 30}


AttributeError: 'GridSearchCV' object has no attribute 'feature_importances_'

In [15]:
#import train_test_split function
from sklearn.model_selection import train_test_split

aviation = pd.read_csv('../Data/AviationFinal.csv')


x = aviation[['year','Life Ladder', 'Log GDP per capita','Social support','Healthy life expectancy at birth','Freedom to make life choices','Generosity','Perceptions of corruption','Positive affect','Negative affect','Confidence in national government','Democratic Quality','Delivery Quality','Standard deviation of ladder by country-year','Standard deviation/Mean of ladder by country-year','GINI index (World Bank estimate), average 2000-15']]
y = aviation['AircraftDamage'] #labels

#split dataset into training set and test set
#split 70% training and 30% testing
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.3)

KeyError: "None of [Index(['year', 'Life Ladder', 'Log GDP per capita', 'Social support',\n       'Healthy life expectancy at birth', 'Freedom to make life choices',\n       'Generosity', 'Perceptions of corruption', 'Positive affect',\n       'Negative affect', 'Confidence in national government',\n       'Democratic Quality', 'Delivery Quality',\n       'Standard deviation of ladder by country-year',\n       'Standard deviation/Mean of ladder by country-year',\n       'GINI index (World Bank estimate), average 2000-15'],\n      dtype='object')] are in the [columns]"