## Using categorical values to predict Aircraft Damage

In [18]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
sb.set() # set the default Seaborn style for graphics

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [19]:
aviation = pd.read_csv('../Data/AviationFinal.csv')
aviation

Unnamed: 0,WeatherCondition,BroadPhaseOfFlight,AircraftDamage,InvestigationType,PurposeOfFlight,EngineType,TotalFatalInjuries,TotalSeriousInjuries,TotalMinorInjuries,TotalUninjured,TotalInjuries,EventDate,Month,Year
0,UNK,MANEUVERING,Substantial,Accident,Unknown,Turbo Shaft,0.0,0.0,0.0,1.0,0.0,2016-11-14,11,2016
1,VMC,LANDING,Substantial,Accident,Unknown,Unknown,0.0,0.0,0.0,2.0,0.0,2016-11-14,11,2016
2,VMC,TAKEOFF,Substantial,Accident,Personal,Reciprocating,0.0,0.0,0.0,1.0,0.0,2016-11-13,11,2016
3,VMC,MANEUVERING,Substantial,Accident,Instructional,Reciprocating,0.0,0.0,0.0,2.0,0.0,2016-12-11,12,2016
4,VMC,UNKNOWN,Substantial,Accident,Personal,Reciprocating,0.0,1.0,0.0,1.0,1.0,2016-12-11,12,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79136,VMC,APPROACH,Destroyed,Accident,Personal,Unknown,1.0,2.0,0.0,0.0,3.0,1979-02-08,2,1979
79137,IMC,CRUISE,Destroyed,Accident,Personal,Reciprocating,2.0,0.0,0.0,0.0,2.0,1977-06-19,6,1977
79138,IMC,CRUISE,Destroyed,Accident,Personal,Reciprocating,3.0,0.0,0.0,0.0,3.0,1974-08-30,8,1974
79139,UNK,UNKNOWN,Destroyed,Accident,Personal,Reciprocating,4.0,0.0,0.0,0.0,4.0,1962-07-19,7,1962


### Splitting data

In [20]:
pd.set_option('display.max_columns', None)  # Unlimited columns.
pd.options.mode.use_inf_as_na = True  

aircraftDamage = pd.DataFrame(aviation['AircraftDamage'])
aviationcategorical = pd.DataFrame(aviation[['Month', 'EngineType', 'WeatherCondition', "BroadPhaseOfFlight"]])


X_train_original, X_test_original, y_train_original, y_test_original = train_test_split(aviationcategorical, aircraftDamage, test_size = 0.2)
y_train_original.head()

Unnamed: 0,AircraftDamage
55083,Substantial
34590,Minor
33892,Substantial
44819,Substantial
46003,Destroyed


### See how many categories AircraftDamage has
> More than half is just one category - Substantial

In [21]:
pd.value_counts(y_train_original['AircraftDamage'], normalize = True)

Substantial    0.719169
Destroyed      0.218711
Minor          0.031969
Unknown        0.030152
Name: AircraftDamage, dtype: float64

### Majority class prediction
>It is to gauge how good our future prediction scores should be. Gives baseline
that we want to cross with our next model

In [22]:
y_pred = ['functional'] * len(X_test_original)
y_pred = pd.DataFrame(data = y_pred,
                      index = X_test_original.index.values,
                      columns = ['AircraftDamage'])
y_pred.head()

Unnamed: 0,AircraftDamage
31801,functional
7739,functional
53138,functional
24482,functional
73841,functional


### Predict with just the numerical features

In [23]:
X_train_numerical = X_train_original.select_dtypes(
                         include = np.number).copy()

X_train_numerical.head()

Unnamed: 0,Month
55083,9
34590,7
33892,11
44819,12
46003,1


In [24]:
X_train_numerical_indices = X_train_numerical.index.values
y_train_numerical = y_train_original[y_train_original.index.isin(X_train_numerical_indices)]

### Logistic Regression

In [25]:
cv_score = cross_val_score(LogisticRegression(), 
                            X_train_numerical, y_train_numerical,
                            scoring = 'accuracy',
                            cv = 3,
                            n_jobs = -1,
                            verbose = 1)
cv_score

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.9s finished


array([0.719132  , 0.71915277, 0.71922093])

### Decision Tree

In [26]:
clf = DecisionTreeClassifier()
cv_score = cross_val_score(clf, 
                            X_train_numerical, y_train_numerical,
                            scoring = 'accuracy',
                            cv = 3,
                            n_jobs = -1,
                            verbose = 1)
cv_score

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.3s finished


array([0.719132  , 0.71915277, 0.71922093])

### Decision Tree Classifier

In [28]:
X_train_original.isnull().sum()
X_non_nulls = X_train_original.dropna(axis = 1)
X_non_nulls.nunique().sort_values(ascending = True)

X_selected = X_non_nulls.loc[:, X_non_nulls.nunique().sort_values()< 50]
cat_cols = list(X_selected.select_dtypes(['object']).columns.values)
X_categorical = X_selected[cat_cols].apply(lambda x: x.astype('category').cat.codes)
X_train_selected = X_train_numerical.join(X_categorical)
clf = DecisionTreeClassifier()
cv_score = cross_val_score(clf, 
                            X_train_selected, y_train_original,
                            scoring = 'accuracy',
                            cv = 3,
                            n_jobs = -1,
                            verbose = 1)
cv_score

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished


array([0.74059509, 0.73763268, 0.7372287 ])

### Random Forest Classifier

In [29]:
X_train, X_test, y_train, y_test = train_test_split(
    X_train_selected, y_train_original, test_size=0.2)

clf = RandomForestClassifier()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

  """


0.7437416094132512

#### Search for the best Random Forest Classifier using a grid search

In [30]:
param_grid = {
    'n_estimators': [10, 20, 30],
    'max_depth': [6, 10, 20, 30]
}
gridsearch = GridSearchCV(RandomForestClassifier(n_jobs = -1), 
                          param_grid=param_grid, 
                          scoring='accuracy', cv=3, 
                          return_train_score=True, verbose=10)
gridsearch.fit(X_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] max_depth=6, n_estimators=10 ....................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=6, n_estimators=10, score=(train=0.745, test=0.744), total=   0.3s
[CV] max_depth=6, n_estimators=10 ....................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s
  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=6, n_estimators=10, score=(train=0.748, test=0.742), total=   0.2s
[CV] max_depth=6, n_estimators=10 ....................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.7s remaining:    0.0s
  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=6, n_estimators=10, score=(train=0.746, test=0.743), total=   0.2s
[CV] max_depth=6, n_estimators=20 ....................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.1s remaining:    0.0s
  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=6, n_estimators=20, score=(train=0.745, test=0.746), total=   0.3s
[CV] max_depth=6, n_estimators=20 ....................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    1.5s remaining:    0.0s
  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=6, n_estimators=20, score=(train=0.748, test=0.742), total=   0.3s
[CV] max_depth=6, n_estimators=20 ....................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.0s remaining:    0.0s
  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=6, n_estimators=20, score=(train=0.747, test=0.744), total=   0.3s
[CV] max_depth=6, n_estimators=30 ....................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    2.5s remaining:    0.0s
  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=6, n_estimators=30, score=(train=0.745, test=0.746), total=   0.4s
[CV] max_depth=6, n_estimators=30 ....................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    3.0s remaining:    0.0s
  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=6, n_estimators=30, score=(train=0.747, test=0.741), total=   0.4s
[CV] max_depth=6, n_estimators=30 ....................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    3.6s remaining:    0.0s
  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=6, n_estimators=30, score=(train=0.747, test=0.744), total=   0.4s
[CV] max_depth=10, n_estimators=10 ...................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    4.2s remaining:    0.0s
  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=10, n_estimators=10, score=(train=0.756, test=0.743), total=   0.2s
[CV] max_depth=10, n_estimators=10 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=10, n_estimators=10, score=(train=0.758, test=0.740), total=   0.3s
[CV] max_depth=10, n_estimators=10 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=10, n_estimators=10, score=(train=0.756, test=0.744), total=   0.3s
[CV] max_depth=10, n_estimators=20 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=10, n_estimators=20, score=(train=0.756, test=0.743), total=   0.3s
[CV] max_depth=10, n_estimators=20 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=10, n_estimators=20, score=(train=0.758, test=0.740), total=   0.3s
[CV] max_depth=10, n_estimators=20 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=10, n_estimators=20, score=(train=0.757, test=0.743), total=   0.4s
[CV] max_depth=10, n_estimators=30 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=10, n_estimators=30, score=(train=0.757, test=0.744), total=   0.5s
[CV] max_depth=10, n_estimators=30 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=10, n_estimators=30, score=(train=0.759, test=0.741), total=   0.5s
[CV] max_depth=10, n_estimators=30 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=10, n_estimators=30, score=(train=0.757, test=0.743), total=   0.5s
[CV] max_depth=20, n_estimators=10 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=20, n_estimators=10, score=(train=0.760, test=0.738), total=   0.2s
[CV] max_depth=20, n_estimators=10 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=20, n_estimators=10, score=(train=0.762, test=0.736), total=   0.3s
[CV] max_depth=20, n_estimators=10 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=20, n_estimators=10, score=(train=0.760, test=0.740), total=   0.2s
[CV] max_depth=20, n_estimators=20 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=20, n_estimators=20, score=(train=0.760, test=0.739), total=   0.4s
[CV] max_depth=20, n_estimators=20 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=20, n_estimators=20, score=(train=0.763, test=0.737), total=   0.4s
[CV] max_depth=20, n_estimators=20 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=20, n_estimators=20, score=(train=0.761, test=0.741), total=   0.4s
[CV] max_depth=20, n_estimators=30 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=20, n_estimators=30, score=(train=0.760, test=0.739), total=   0.5s
[CV] max_depth=20, n_estimators=30 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=20, n_estimators=30, score=(train=0.763, test=0.736), total=   0.6s
[CV] max_depth=20, n_estimators=30 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=20, n_estimators=30, score=(train=0.761, test=0.740), total=   0.5s
[CV] max_depth=30, n_estimators=10 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=30, n_estimators=10, score=(train=0.760, test=0.740), total=   0.2s
[CV] max_depth=30, n_estimators=10 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=30, n_estimators=10, score=(train=0.762, test=0.736), total=   0.3s
[CV] max_depth=30, n_estimators=10 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=30, n_estimators=10, score=(train=0.760, test=0.740), total=   0.2s
[CV] max_depth=30, n_estimators=20 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=30, n_estimators=20, score=(train=0.760, test=0.740), total=   0.4s
[CV] max_depth=30, n_estimators=20 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=30, n_estimators=20, score=(train=0.763, test=0.736), total=   0.4s
[CV] max_depth=30, n_estimators=20 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=30, n_estimators=20, score=(train=0.761, test=0.739), total=   0.4s
[CV] max_depth=30, n_estimators=30 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=30, n_estimators=30, score=(train=0.760, test=0.739), total=   0.5s
[CV] max_depth=30, n_estimators=30 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=30, n_estimators=30, score=(train=0.763, test=0.737), total=   0.5s
[CV] max_depth=30, n_estimators=30 ...................................


  estimator.fit(X_train, y_train, **fit_params)


[CV]  max_depth=30, n_estimators=30, score=(train=0.761, test=0.740), total=   0.5s


[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:   18.5s finished
  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=-1,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid='

### Shows Scores of different parameters
> Highest mean test score is 0.744

In [31]:
pd.DataFrame(gridsearch.cv_results_).sort_values( \
                                         by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
1,0.192796,0.004799,0.119779,0.000767,6,20,"{'max_depth': 6, 'n_estimators': 20}",0.745795,0.742048,0.744165,0.744003,0.001534,1,0.745387,0.747646,0.747002,0.746678,0.00095
2,0.286571,0.010118,0.134878,0.011183,6,30,"{'max_depth': 6, 'n_estimators': 30}",0.746446,0.74116,0.743751,0.743786,0.002158,2,0.745446,0.747201,0.746794,0.746481,0.00075
0,0.099402,0.008691,0.139129,0.014661,6,10,"{'max_depth': 6, 'n_estimators': 10}",0.744373,0.741574,0.743099,0.743016,0.001144,3,0.744617,0.747942,0.745935,0.746165,0.001367
5,0.351397,0.017476,0.134166,0.010476,10,30,"{'max_depth': 10, 'n_estimators': 30}",0.744433,0.741397,0.74304,0.742956,0.001241,4,0.756612,0.759284,0.757071,0.757656,0.001167
4,0.232155,0.014313,0.120309,0.000909,10,20,"{'max_depth': 10, 'n_estimators': 20}",0.74313,0.740212,0.743336,0.742226,0.001427,5,0.756079,0.758396,0.756922,0.757132,0.000958
3,0.106281,0.007229,0.138798,0.014699,10,10,"{'max_depth': 10, 'n_estimators': 10}",0.742656,0.740331,0.743632,0.742206,0.001385,6,0.755546,0.757715,0.756152,0.756471,0.000914
7,0.276261,0.026549,0.143068,0.007015,20,20,"{'max_depth': 20, 'n_estimators': 20}",0.739457,0.736836,0.740671,0.738988,0.0016,7,0.760166,0.762838,0.760802,0.761269,0.00114
11,0.366029,0.003741,0.122401,0.000774,30,30,"{'max_depth': 30, 'n_estimators': 30}",0.739457,0.736717,0.740374,0.73885,0.001554,8,0.760432,0.763046,0.760891,0.761456,0.001139
9,0.118686,0.008251,0.128979,0.009509,30,10,"{'max_depth': 30, 'n_estimators': 10}",0.739872,0.735829,0.740137,0.738613,0.001971,9,0.759692,0.762157,0.760269,0.760706,0.001053
8,0.388293,0.016678,0.139137,0.014469,20,30,"{'max_depth': 20, 'n_estimators': 30}",0.739043,0.736007,0.740019,0.738356,0.001708,10,0.760492,0.763135,0.76098,0.761535,0.001148
