In [1]:
#required imports

import os
import sys
import numpy as np
import pandas as pd

from numpy import loadtxt

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from combo.models.classifier_comb import SimpleClassifierAggregator
from combo.utils.data import evaluate_print

import warnings

# Suppress every warning for the rest of the kernel session: the "ignore" action is
# registered without a category or module filter, so it applies to all warnings.
# NOTE(review): this presumably also hides library warnings emitted while the models
# below are fitted (e.g. convergence notices) -- confirm that is intended.
warnings.filterwarnings("ignore")

In [2]:
#NOTE: This notebook uses a 'test' version of the dataset, because working with the
#original dataset as-is produced many errors when trimming out data that is not needed
#for the prediction, namely the columns for description and correction of problems as
#well as the columns containing dates. In addition, the data is slightly modified from
#the original version. The only modification is that every empty element was replaced
#by an element that reads '0', and every element that said "Yes" was changed to "1",
#to prevent necessary elements from being removed simply for being recognized as empty.

# Load the prepared CSV into a DataFrame.
df = pd.read_csv('HtM_MAF Data_Final_Test.csv')
# Report the dimensions as (rows, columns).
print(df.shape)
# Summarise every column. include='all' is necessary so that non-numeric columns are
# summarised as well; the result is flipped so each dataset column becomes one row.
column_summary = df.describe(include='all')
column_summary.T

(362281, 16)


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Job Code,362281,362281.0,3U21BEHPD4105599,1.0,,,,,,,
Aircraft,362281,,,,22.795,13.0312,1.0,12.0,24.0,34.0,45.0
Transaction Code,362281,,,,12.617,5.95366,11.0,11.0,11.0,11.0,47.0
Malfunction Code,362281,227.0,0,202916.0,,,,,,,
Action Taken Code,362281,14.0,0,193362.0,,,,,,,
Description of Problem,362281,1040.0,Perform a periodic inspection,144394.0,,,,,,,
Correction of Problem,362281,24.0,Completed the inspection,151702.0,,,,,,,
Received Date,362281,2664.0,5/21/2017,552.0,,,,,,,
Completion Date,362281,2739.0,2/21/2019,448.0,,,,,,,
Corrosion,362281,,,,0.0507258,0.219438,0.0,0.0,0.0,0.0,1.0


In [3]:
# Build the modelling frame: a copy of `df` without its first nine columns, selected by
# position. In the recorded summary these are the job/transaction codes, the free-text
# description and correction fields, and the two date columns.
leading_columns = df.columns[list(range(9))]
df1 = df.drop(columns=leading_columns)
# Report the dimensions of the trimmed frame, then summarise it one column per row.
print(df1.shape)
df1.describe(include='all').T

(362281, 7)


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Corrosion,362281.0,0.050726,0.219438,0.0,0.0,0.0,0.0,1.0
Bare Metal,362281.0,0.040422,0.196946,0.0,0.0,0.0,0.0,1.0
Corrosion Prevention Treatment,362281.0,0.027145,0.162505,0.0,0.0,0.0,0.0,1.0
Routine Maintenance,362281.0,0.533246,0.498894,0.0,0.0,1.0,1.0,1.0
Unscheduled Maintenance,362281.0,0.308493,0.461872,0.0,0.0,0.0,1.0,1.0
Mission-Related Maintenance,362281.0,0.045583,0.20858,0.0,0.0,0.0,0.0,1.0
Failure,362281.0,0.023462,0.151367,0.0,0.0,0.0,0.0,1.0


In [4]:
# Target vector: the 'Failure' column is the value the models will predict.
labels = df1['Failure'].to_numpy(copy=True)
# Everything except the target stays behind as the feature columns.
df1 = df1.drop(columns='Failure')
# Remember the feature names, since the array conversion below discards them.
column_list = df1.columns.tolist()
# From here on `df1` is a plain NumPy feature matrix, not a DataFrame.
# NOTE: because `df1` is rebound, this cell cannot be re-run on its own; re-run the
# cell that creates `df1` first.
df1 = df1.to_numpy(copy=True)

In [5]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
train_features, test_features, train_labels, test_labels = train_test_split(
    df1,
    labels,
    test_size=0.25,
    random_state=42,
)

# Show the shape of each of the four resulting arrays.
for caption, subset in (
    ('Training Features Shape:', train_features),
    ('Training Labels Shape:', train_labels),
    ('Testing Features Shape:', test_features),
    ('Testing Labels Shape:', test_labels),
):
    print(caption, subset.shape)

Training Features Shape: (271710, 6)
Training Labels Shape: (271710,)
Testing Features Shape: (90571, 6)
Testing Labels Shape: (90571,)


In [6]:
# Baseline model: a seeded 20-tree random forest, trained on the training split
# (fit() returns the estimator itself, so construction and training can be chained).
classifier = RandomForestClassifier(n_estimators=20, random_state=42).fit(train_features, train_labels)
# Class predictions for the held-out rows.
label_predict = classifier.predict(test_features)

In [7]:
# Score the baseline predictions against the held-out labels: confusion matrix first,
# then the per-class precision/recall/F1 report, then the overall accuracy.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(test_labels, label_predict))

[[88397     0]
 [  811  1363]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     88397
           1       1.00      0.63      0.77      2174

    accuracy                           0.99     90571
   macro avg       1.00      0.81      0.88     90571
weighted avg       0.99      0.99      0.99     90571

0.9910456989544114


In [8]:
## Everything above builds a single Random Forest baseline. This cell first scores five
# stand-alone classifiers and then combines five fresh copies of them with combo's
# SimpleClassifierAggregator, printing one evaluate_print line per model.
#
# Each model predicts on the test set exactly once; that single prediction is what gets
# scored (previously several branches predicted twice and discarded the first result).

random_state = 42

# Stand-alone models, listed in the order they are reported.
baseline_models = [
    ('Decision Tree        |', DecisionTreeClassifier(random_state = random_state)),
    ('Logistic Regression  |', LogisticRegression(random_state = random_state)),
    ('K Neighbors          |', KNeighborsClassifier()),
    ('Gradient Boosting    |', GradientBoostingClassifier(random_state = random_state)),
    #another Random Forest for posterity sake (also because of combo example)
    ('Random Forest        |', RandomForestClassifier(random_state = random_state)),
]

for banner, clf in baseline_models:
    clf.fit(train_features, train_labels)
    evaluate_print(banner, test_labels, clf.predict(test_features))

print()

# Fresh, unfitted estimators handed to every aggregator below. Note the order differs
# from the report above (Random Forest comes before Gradient Boosting here).
classifiers = [DecisionTreeClassifier(random_state = random_state),
               LogisticRegression(random_state = random_state),
               KNeighborsClassifier(),
               RandomForestClassifier(random_state = random_state),
               GradientBoostingClassifier(random_state = random_state)]

# One weight per entry of `classifiers` -- presumably matched by position; confirm
# against the combo documentation.
clf_weights = np.array([0.1,0.4,0.1,0.2,0.2])

# (report label, aggregator keyword arguments), in reporting order.
combination_settings = [
    ('Combination by avg   |', {'method': 'average'}),
    ('Combination by w_avg |', {'method': 'average', 'weights': clf_weights}),
    ('Combination by max   |', {'method': 'maximization'}),
    ('Combination by w_vote|', {'method': 'majority_vote', 'weights': clf_weights}),
    ('Combination by median|', {'method': 'median'}),
]

# NOTE(review): every aggregator is fitted on the same `classifiers` instances; if
# fit() retrains the base estimators each time, that training is repeated five
# times -- check whether combo offers a way to reuse already-fitted estimators.
for banner, aggregator_options in combination_settings:
    clf = SimpleClassifierAggregator(classifiers, **aggregator_options)
    clf.fit(train_features, train_labels)
    # Predict once and reuse the result for scoring.
    label_predict = clf.predict(test_features)
    evaluate_print(banner, test_labels, label_predict)



Decision Tree        | Accuracy:0.991, ROC:0.8135, F1:0.7707
Logistic Regression  | Accuracy:0.991, ROC:0.8135, F1:0.7707
K Neighbors          | Accuracy:0.991, ROC:0.8135, F1:0.7707
Gradient Boosting    | Accuracy:0.991, ROC:0.8135, F1:0.7707
Random Forest        | Accuracy:0.991, ROC:0.8135, F1:0.7707

Combination by avg   | Accuracy:0.991, ROC:0.8135, F1:0.7707
[[0.5 2.  0.5 1.  1. ]]
Combination by w_avg | Accuracy:0.991, ROC:0.8135, F1:0.7707
Combination by max   | Accuracy:0.991, ROC:0.8135, F1:0.7707
[[0.5 2.  0.5 1.  1. ]]
Combination by w_vote| Accuracy:0.991, ROC:0.8135, F1:0.7707
Combination by median| Accuracy:0.991, ROC:0.8135, F1:0.7707
