In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen

Reading in MSP data, using ZULU time variable to create a date and time stamp

In [2]:
HTM_url = urlopen("https://www.hackthemachine.ai/s/HTM_MSP_Finalcsv.zip")

zipfile = ZipFile(BytesIO(HTM_url.read()))

msp_data = pd.read_csv(zipfile.open('HTM_MSP_Final.csv'), parse_dates=True)

msp_data.rename(columns = {'AIRCRAFT': 'Aircraft'}, inplace = True)

msp_data['Fault Date'] = pd.to_datetime(msp_data['ZULU_TIME'].str.split(" ", n = 2, expand = True)[1])

In [3]:
msp_data.head()

Unnamed: 0,Aircraft,SQUADRON,LOT,MSP,ZULU_TIME,FLIGHT_MODE,Fault Date
0,1,HH,100,ZJMZTL,01-APR-2014 00:25:39:00,EngineTurn,2014-04-01
1,1,HH,100,JDJDMQ,01-APR-2014 00:25:39:00,EngineTurn,2014-04-01
2,1,HH,100,RTVBHP,01-APR-2014 00:25:39:00,EngineTurn,2014-04-01
3,1,HH,100,RTUXHP,01-APR-2014 00:25:39:00,EngineTurn,2014-04-01
4,1,HH,100,RTUXMZ,01-APR-2014 00:25:39:00,EngineTurn,2014-04-01


Reading in MAF data, converting recevied and completed dates to datetime objects

In [4]:
HTM_url = urlopen("https://www.hackthemachine.ai/s/HtM_MAF-Data_Finalcsv.zip")

zipfile = ZipFile(BytesIO(HTM_url.read()))

maf_data = pd.read_csv(zipfile.open('HtM_MAF Data_Final.csv'), parse_dates=['Received Date', 'Completion Date'])

In [5]:
maf_data.head()

Unnamed: 0,Job Code,Aircraft,Transaction Code,Malfunction Code,Action Taken Code,Description of Problem,Correction of Problem,Received Date,Completion Date,Corrosion,Bare Metal,Corrosion Prevention Treatment,Routine Maintenance,Unscheduled Maintenance,Mission-Related Maintenance,Failure
0,0NGHY44WC8118573,42,11,0,N,Perform system or component checks,Completed the component or system test,2012-04-26,2014-02-27,,,,Yes,,,
1,0NGHY45WC8118591,42,11,0,N,Perform system or component checks,Completed the component or system test,2012-04-26,2014-02-27,,,,Yes,,,
2,0NGHY46WC8118581,42,11,0,N,Perform a periodic inspection,Completed the inspection,2012-04-26,2014-02-27,,,,Yes,,,
3,0NGHY47WC8118577,42,11,0,N,Perform a periodic inspection,Completed the inspection,2012-04-26,2014-02-27,,,,Yes,,,
4,0NGHY48WC8118586,42,11,0,N,Perform a periodic inspection,Completed the inspection,2012-04-26,2014-02-27,,,,Yes,,,


Drop rountine mainentance

In [6]:
maf_data = maf_data[maf_data['Routine Maintenance'] != 'Yes']

Adding field that indicates whether repair pertained to corrosion, grouping by aircraft and received date

In [7]:
maf_data['corr_action'] = maf_data.groupby(['Aircraft', 'Received Date'])['Corrosion'].transform(lambda x: any(x == 'Yes'))

In [11]:
maf_data[['Aircraft', 'Received Date', 'corr_action']].drop_duplicates()['corr_action'].value_counts()

False    41521
True      2625
Name: corr_action, dtype: int64

based on the value counts of the corr_action variable, we have a highly imbalanced dataset.  We will use upsampling/downsampling and adjust our evaluation metric (precison, F1) to deal with this imbalance

Adding msp frequency to repair data

In [41]:
maf_corr = maf_data[['Aircraft', 'Received Date', 'corr_action']].drop_duplicates()

maf_corr.head()

For each Aircraft repair, we will add the frequncy of the MSP codes in the 30 days prior

In [65]:
msp_corr = []

for repair in maf_corr.iterrows():
    
    # matching aircraft
    
    msp_subset = msp_data[['Aircraft', 'MSP', 'Fault Date']][msp_data['Aircraft'] == repair[1][0]]
    
    # filtering out MSP codes outside of 30-day window prior to repair
    
    msp_subset = msp_subset[(repair[1][1] - msp_subset['Fault Date'] < np.timedelta64(30, 'D')) & (repair[1][1] - msp_subset['Fault Date'] > np.timedelta64(0, 'D'))]
    
    # adding repair date and corrosion flag
    
    msp_subset['Repair Date'] = repair[1][1]
    
    msp_subset['Corrosion'] = repair[1][2]
    
    # counting freq of each MSP code
    
    msp_counts = msp_subset.groupby(['Aircraft', 'Repair Date', 'MSP', 'Corrosion']).size().reset_index(name='freq')
    
    msp_corr.append(msp_counts)

    
msp_corr = pd.concat(msp_corr)


Save MSP count file to csv

In [68]:
msp_corr.to_csv('../../HTM_data/msp_freq.csv')

Convert Corrosion to T/F

In [82]:
msp_corr['Corrosion'] = msp_corr['Corrosion'] == 1.0

Convert Aircraft to Category

In [86]:
msp_corr['Aircraft'] = msp_corr['Aircraft'].astype('category')

t-test to select MSP codes for predictors in model

In [72]:
from scipy.stats import ttest_ind

In [75]:
msp_codes = msp_corr['MSP'].drop_duplicates()

In [116]:
msp_tests = []

for msp in msp_codes:
    
    # 
    
    corrosion_counts = msp_corr[(msp_corr['MSP'] == msp) & msp_corr['Corrosion']]['freq']
    
    non_counts = msp_corr[(msp_corr['MSP'] == msp) & msp_corr['Corrosion'] == False]['freq']
    
    t_test = ttest_ind(corrosion_counts, non_counts)
    
    
    results = pd.DataFrame(data = {'msp': [msp], 
                                   'test': [t_test[1]]})
    
    msp_tests.append(results)
    

msp_tests = pd.concat(msp_tests)
    
    

  **kwargs)
  ret = ret.dtype.type(ret / rcount)


We will use MSP codes w/ p-values < 0.001

In [127]:
msp_preds = msp_tests['msp'][msp_tests['test'] < 0.001]

In [129]:
msp_corr = msp_corr[msp_corr['MSP'].isin(msp_preds)]

In [276]:
msp_corr.head()

MSP,Aircraft,Repair Date,Corrosion,FV,FVFVJD,FVFVXU,FVHPHP,FVHPJD,FVMQVB,FVQG,...,UXMQNV,UXUXMQ,VBQGQG,XUFV,XUHPHP,XUXU,XUXUJD,XUXUVB,ZJMZBM,ZJMZTL
0,1.0,2013-09-24,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,2013-10-03,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1.0,2013-10-04,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1.0,2013-10-16,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1.0,2013-10-17,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [161]:
msp_corr = msp_corr.drop_duplicates(['Aircraft', 'Repair Date', 'Corrosion'])

In [183]:
msp_corr = msp_corr.pivot_table(index = ['Aircraft', 'Repair Date', 'Corrosion'], columns = 'MSP', values = 'freq', fill_value = 0).reset_index()

In [277]:
msp_corr.head()

MSP,Aircraft,Repair Date,Corrosion,FV,FVFVJD,FVFVXU,FVHPHP,FVHPJD,FVMQVB,FVQG,...,UXMQNV,UXUXMQ,VBQGQG,XUFV,XUHPHP,XUXU,XUXUJD,XUXUVB,ZJMZBM,ZJMZTL
0,1.0,2013-09-24,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,2013-10-03,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1.0,2013-10-04,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1.0,2013-10-16,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1.0,2013-10-17,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Save MSP count file to csv

In [189]:
msp_corr.to_csv('../../HTM_data/htm_predict.csv')

Selecting the first 35 aircrafts to train, last 10 to test

In [197]:
train_data = msp_corr[msp_corr['Aircraft'].isin(range(36))]

x_train = train_data.drop(['Aircraft', 'Repair Date', 'Corrosion'], axis=1)
y_train = train_data['Corrosion']

test_data = msp_corr[msp_corr['Aircraft'].isin(range(36,46))]

x_test = test_data.drop(['Aircraft', 'Repair Date', 'Corrosion'], axis=1)
y_test = test_data['Corrosion']

Ridge Logistic Regression Model

In [290]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import confusion_matrix, accuracy_score

# ignore convergance warnings
from warnings import filterwarnings
filterwarnings('ignore')

Fitting model 

In [291]:
lasso = LogisticRegressionCV(penalty='l2', class_weight='balanced', cv = 5, max_iter=250).fit(x_train, y_train)

In [293]:
accuracy_score(y_test, lasso.predict(x_test))

0.5985009369144285

In [294]:
confusion_matrix(y_test, lasso.predict(x_test))

array([[4647, 3071],
       [ 143,  144]])

Training Penalized SVM

In [295]:
from sklearn.svm import SVC

In [296]:
svm_pen = SVC(kernel='linear', class_weight='balanced').fit(x_train, y_train)

In [297]:
accuracy_score(y_test, svm_pen.predict(x_test))

0.37064334790755776

In [298]:
confusion_matrix(y_test, svm_pen.predict(x_test))

array([[2766, 4952],
       [  86,  201]])

To account for unbalanced data: upsampling postive cases