In [1]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
sb.set() # set the default Seaborn style for graphics

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import MinMaxScaler

In [2]:
aviation = pd.read_csv('../Data/AviationFinal.csv')

In [3]:
# Use Get dummies to encode categorical values 
weatherCondition = pd.get_dummies(aviation["WeatherCondition"], prefix_sep='_', drop_first=True)
phase = pd.get_dummies(aviation["BroadPhaseOfFlight"], prefix_sep='_', drop_first=True)
aircraftDamage = pd.get_dummies(aviation["AircraftDamage"], prefix_sep='_', drop_first=True)
investigationType = pd.get_dummies(aviation["InvestigationType"], prefix_sep='_', drop_first=True)
purposeOfFlight = pd.get_dummies(aviation["PurposeOfFlight"], prefix_sep='_', drop_first=True)
engineType = pd.get_dummies(aviation["EngineType"], prefix_sep='_', drop_first=True)

In [4]:
weatherCondition.reset_index(drop=True, inplace=True)
phase.reset_index(drop=True, inplace=True)
investigationType.reset_index(drop=True, inplace=True)
purposeOfFlight.reset_index(drop=True, inplace=True)
engineType.reset_index(drop=True, inplace=True)

train = pd.concat([weatherCondition,phase,investigationType,purposeOfFlight, engineType],axis=1)
train_y = aviation["AircraftDamage"]
X_train, X_test, y_train, y_test = train_test_split(train, train_y, test_size=0.30)

In [5]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(solver='lbfgs', max_iter = 2828, multi_class="multinomial")
logreg.fit(X_train,y_train)
predictions = logreg.predict(X_test)

In [7]:
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

   Destroyed       0.62      0.23      0.34      5142
       Minor       0.68      0.50      0.58       792
 Substantial       0.79      0.96      0.87     17112
     Unknown       0.50      0.32      0.39       697

    accuracy                           0.77     23743
   macro avg       0.65      0.50      0.54     23743
weighted avg       0.74      0.77      0.73     23743



### Recursive Feature Elimination

In [8]:
from sklearn.feature_selection import RFE

logreg = LogisticRegression(solver='lbfgs', max_iter = 2828, multi_class="multinomial")
rfe = RFE(logreg, 20)
rfe = rfe.fit(train, train_y.values.ravel())
print(rfe.ranking_)

[18  1  4  3  1 19  1 16  1  1 21  1 20  1  9 24  1  5 22 12  1 10 11  8
  1 29  1 26 25 23  1  1  2  1  1 14  1 27 17 28  6  1 15  1  1 13  7  1]


In [13]:
cols = train.columns
print(cols)
print("Columns that are kept: ",np.where(rfe.support_ == True)[0])

# ranks = {}

# def ranking(ranks, names, order=1):
#     minmax = MinMaxScaler()
#     ranks = minmax.fit_transform(order*np.array([ranks]).T).T[0]
#     ranks = map(lambda x: round(x,2), ranks)
#     return dict(zip(names, ranks))

# colnames = train.columns
# ranks["RFE"] = ranking(list(map(float, rfe.ranking_)), colnames, order=-1)



# train_cleaned = np.delete(train, np.where(rfe.support_ == False)[0], axis=1)

Index(['UNK', 'VMC', 'CLIMB', 'CRUISE', 'DESCENT', 'GO-AROUND', 'LANDING',
       'MANEUVERING', 'OTHER', 'STANDING', 'TAKEOFF', 'TAXI', 'UNKNOWN',
       'Incident', 'Aerial Observation', 'Air Drop', 'Air Race/Show',
       'Banner Tow', 'Business', 'Executive/Corporate', 'External Load',
       'Ferry', 'Firefighting', 'Flight Test', 'Glider Tow', 'Instructional',
       'Other Work Use', 'Personal', 'Positioning', 'Public Aircraft',
       'Public Aircraft - Federal', 'Public Aircraft - Local',
       'Public Aircraft - State', 'Skydiving', 'Unknown', 'Hybrid Rocket',
       'None', 'REC, ELEC', 'REC, TJ, REC, TJ', 'REC, TJ, TJ', 'Reciprocating',
       'TF, TJ', 'TJ, REC, REC, TJ', 'Turbo Fan', 'Turbo Jet', 'Turbo Prop',
       'Turbo Shaft', 'Unknown'],
      dtype='object')
Columns that are kept:  [ 1  4  6  8  9 11 13 16 20 24 26 30 31 33 34 36 41 43 44 47]


> Important features: VMC, DESCENT, 

In [None]:
print(train_cleaned.columns.values)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(train_cleaned, train_y, test_size=0.30)

logRegv2 = LogisticRegression(solver='lbfgs', max_iter = 2828, multi_class="multinomial")
logRegv2.fit(X_train, y_train)

In [None]:
prediction = logRegv2.predict(X_test)
score = logRegv2.score(X_test, y_test)
print(score)