# AirFrance Hackathon

### Gabriel A. Moreira

In [12]:
# lstm model
import numpy as np
from numpy import mean
from numpy import std
from numpy import dstack
import pandas as pd
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import LSTM
from keras.utils import to_categorical
from matplotlib import pyplot
import random
from tensorflow import keras
from sklearn.decomposition import PCA

df_train = pd.read_csv('/home/gabriel/Desktop/hackathonAirFrance-master/data/train_data.csv', sep=',')
df_test = pd.read_csv('/home/gabriel/Desktop/hackathonAirFrance-master/data/test_data.csv', sep=',')

## Principal Component Analysis

In [14]:
df = df_train
for i in range(0,len(list(df))):
    if i > 1 and i < 26:    
        x = df.iloc[:,i]
        df.iloc[:,i] = (x-min(x))/(max(x)-min(x))

dfpca = pd.DataFrame()

pca = PCA(n_components=4)
pca_result = pca.fit_transform(df[['op_setting_1','op_setting_2','op_setting_3','sensor_1','sensor_2','sensor_3','sensor_4','sensor_5','sensor_6','sensor_7','sensor_8','sensor_9','sensor_10','sensor_11','sensor_12','sensor_13','sensor_14','sensor_15','sensor_16','sensor_17','sensor_18','sensor_19','sensor_20','sensor_21']].values)

print('Explained variation per principal component: {}'.format(pca.explained_variance_ratio_))
dfpca['engine_no'] = df['engine_no']
dfpca['pca-one'] = pca_result[:,0]
dfpca['pca-two'] = pca_result[:,1] 
dfpca['pca-three'] = pca_result[:,2]
dfpca['pca-four'] = pca_result[:,3]

Explained variation per principal component: [0.84986045 0.12830408 0.01633434 0.00319357]


In [16]:
df = dfpca
N_ENGINES = min(max(df_test['engine_no'].tolist())+1, max(df_train['engine_no'].tolist())+1)
SEQUENCE_LENGTH = 20

707


## Data generation functions

In [17]:
def normalize_df(df):
    new_df = df
    for i in range(0,len(list(df))):
        if i > 1 and i < 26:
            x = df.iloc[:,i]
            df.iloc[:,i] = (x-min(x))/(max(x)-min(x))
    new_df.drop(df.columns[26:33], axis=1, inplace=True)
    new_df.drop(df.columns[0:3], axis=1, inplace=True)
    return new_df

def get_engine_data(df, engine, test=False, normalized=True):
    df = df.copy(deep=True)
    df = normalize_df(df)
    
    if not test:
        X_data = df.loc[df['engine_no'] == engine].iloc[:,2:-1]
        Y_data = df.loc[df['engine_no'] == engine].iloc[:,-1]
        return [X_data.reset_index(drop=True), Y_data.reset_index(drop=True)]
    else:
        X_data = df.loc[df['engine_no'] == engine].iloc[:,2:]
        return X_data.reset_index(drop=True)
      
def generate_train_dataset(df):
    x = []
    y = []
    for engine in range(N_ENGINES):
        rule = [True, False]
        slice_to_fail = random.choice(rule)
        total_length = df.loc[df.iloc[:,0] == engine].shape[0]
        if total_length > SEQUENCE_LENGTH:
            if slice_to_fail: 
                for i in range(50):
                    if total_length > 100 + SEQUENCE_LENGTH:
                        seq_df = df.loc[df['engine_no'] == engine]
                        seq_df.reset_index(drop=True)
                        total_length = seq_df.shape[0]
                        seq_df = seq_df[total_length-100-SEQUENCE_LENGTH+i:total_length-100+i]
                        seq_df = seq_df.reset_index(drop=True)
                        seq_df = seq_df.iloc[:,1:]
                        x.append(seq_df)
                        y.append(1)

                if total_length < 100 + SEQUENCE_LENGTH:
                    seq_df = df.loc[df['engine_no'] == engine]
                    seq_df.reset_index(drop=True)
                    total_length = seq_df.shape[0]
                    seq_df = seq_df[total_length-SEQUENCE_LENGTH:total_length]
                    seq_df = seq_df.reset_index(drop=True)
                    seq_df = seq_df.iloc[:,1:] 
                    x.append(seq_df)
                    y.append(1)
            else:
                for i in range(total_length-100-SEQUENCE_LENGTH):
                    if total_length > 100 + SEQUENCE_LENGTH:
                        seq_df = df.loc[df['engine_no'] == engine]
                        seq_df.reset_index(drop=True, inplace=True)
                        seq_df = seq_df[0+i:SEQUENCE_LENGTH+i]
                        seq_df = seq_df.reset_index(drop=True)
                        seq_df = seq_df.iloc[:,1:] 
                        x.append(seq_df)
                        y.append(0)
    return [x, y]

In [18]:
a = generate_train_dataset(df)
x = a[0]
y = a[1]

## Setting up the test and train data

In [23]:
N_POINTS = 18000
mask = np.arange(0,len(x), len(x)/N_POINTS, dtype=int)

x = list(x[i] for i in mask)
y = list(y[i] for i in mask)

train_indices = random.sample(range(len(x)), int(0.7*len(x)))
test_indices = [i for i in range(len(train_indices)) if i not in train_indices]

trainX = np.empty((len(train_indices), SEQUENCE_LENGTH, 4))
testX = np.empty((len(x) - len(train_indices), SEQUENCE_LENGTH, 4))
trainy = np.empty((len(train_indices), 1))
testy = np.empty((len(x) - len(train_indices), 1))

ii = sorted(train_indices)
    
for i in range(len(train_indices)):
    trainX[i,:,:] = x[ii[i]].values
    trainy[i,:] = y[ii[i]]

for i in range(len(test_indices)):
    testX[i,:,:] = x[ii[i]].values
    testy[i,:] = y[ii[i]]
    
from keras.utils import to_categorical
trainy = to_categorical(trainy)
testy = to_categorical(testy)

In [24]:
trainy.shape

(12600, 2)

In [25]:
testy.shape

(5400, 2)

## Recursive Neural Network - LSTM implementation with Keras

In [26]:
from keras.layers.advanced_activations import LeakyReLU
from keras.optimizers import Adam

def fit_model(trainX, trainy, testX, testy):
    verbose, epochs, batch_size = 0, 15, 128
    n_timesteps, n_features, n_outputs = trainX.shape[1], trainX.shape[2], 2
    model = Sequential()
    model.add(LSTM(100, input_shape=(n_timesteps,n_features)))
    model.add(Dropout(0.7))
    model.add(Dense(64))
    model.add(LeakyReLU(alpha=0.1))
    model.add(Dense(n_outputs, activation='softmax'))

    opt = Adam(lr=0.01)
    model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
    model.fit(trainX, trainy, epochs=epochs, batch_size=batch_size, verbose=1)
    
    return model
 
def summarize_results(scores):
    print(scores)
    m, s = mean(scores), std(scores)
    print('Accuracy: %.3f%% (+/-%.3f)' % (m, s))
 
def run_training(trainX, trainy, testX, testy, repeats=2):
    batch_size = 128
    # repeat experiment
    scores = list()
    for r in range(repeats):
        model = fit_model(trainX, trainy, testX, testy)
        _, accuracy = model.evaluate(testX, testy, batch_size=batch_size, verbose=1)
        score = accuracy * 100.0
        print('>#%d: %.3f' % (r+1, score))
        scores.append(score)
    # summarize results
    summarize_results(scores)
    
    return model

### Training the model

In [27]:
model = run_training(trainX, trainy, testX, testy)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
>#1: 63.148
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
>#2: 70.389
[63.14814814594057, 70.38888888005857]
Accuracy: 66.769% (+/-3.620)


## Setting up the test data

In [28]:
df = df_test

for i in range(0,len(list(df))):  
    if i > 1 and i < 26:      
        x = df.iloc[:,i]
        df.iloc[:,i] = (x-min(x))/(max(x)-min(x))

dfpca = pd.DataFrame()
pca = PCA(n_components=4)
pca_result = pca.fit_transform(df[['op_setting_1', 'op_setting_2',
                                   'op_setting_3', 'sensor_1',
                                   'sensor_2', 'sensor_3',
                                   'sensor_4', 'sensor_5',
                                   'sensor_6','sensor_7',
                                   'sensor_8','sensor_9',
                                   'sensor_10','sensor_11',
                                   'sensor_12','sensor_13',
                                   'sensor_14','sensor_15',
                                   'sensor_16','sensor_17',
                                   'sensor_18','sensor_19',
                                   'sensor_20','sensor_21']].values)

print('Explained variation per principal component: {}'.format(pca.explained_variance_ratio_))
dfpca['engine_no'] = df['engine_no']
dfpca['pca-one'] = pca_result[:,0]
dfpca['pca-two'] = pca_result[:,1] 
dfpca['pca-three'] = pca_result[:,2]
dfpca['pca-four'] = pca_result[:,3]

df = dfpca

def _generate_test_dataset(df):
    engine_data = []
    engine_number = []
    for engine in range(N_ENGINES):
        seq_df = df.loc[df.iloc[:,0] == engine]
        seq_df.reset_index(drop=True, inplace=True)
        total_length = seq_df.shape[0]
        if total_length > SEQUENCE_LENGTH:
            seq_df = seq_df[total_length-SEQUENCE_LENGTH:]
            seq_df.reset_index(drop=True, inplace=True)
            seq_df = seq_df.iloc[:,1:]
            engine_number.append(engine)
            engine_data.append(seq_df)
    return [engine_number, engine_data]

a = _generate_test_dataset(df)

eng_numb = a[0]
eng_data = a[1]

print(eng_data[0].values.shape)

Explained variation per principal component: [0.85073548 0.12828674 0.01769858 0.00151942]
(20, 4)


In [29]:
X = np.empty((1, SEQUENCE_LENGTH, 4))
results = []

for i in range(len(eng_data)):
    X[0,:,:] = eng_data[i].values
    prediction = model.predict(X)
    f = 1-np.argmax(prediction[0])
    results.append(f)

[[0.08432797 0.91567206]]
[[0.22428171 0.7757183 ]]
[[0.6735742  0.32642573]]
[[0.7318171  0.26818287]]
[[0.18222284 0.8177771 ]]
[[0.6934476 0.3065524]]
[[0.69619244 0.30380756]]
[[0.24522673 0.75477326]]
[[0.03666515 0.96333486]]
[[0.6720598  0.32794023]]
[[0.6730449  0.32695502]]
[[0.09804195 0.901958  ]]
[[0.68361866 0.3163813 ]]
[[0.6852326  0.31476742]]
[[0.710335 0.289665]]
[[0.6718576  0.32814243]]
[[0.69679284 0.3032072 ]]
[[0.68023187 0.3197681 ]]
[[0.6726025  0.32739747]]
[[0.6787678 0.3212322]]
[[0.6761888  0.32381117]]
[[0.6888351  0.31116495]]
[[0.31964985 0.6803502 ]]
[[0.69636524 0.30363473]]
[[0.697402   0.30259803]]
[[0.6713407 0.3286593]]
[[0.67201406 0.32798597]]
[[0.71564484 0.28435516]]
[[0.69285625 0.30714375]]
[[0.68130857 0.31869146]]
[[0.3000998  0.69990015]]
[[0.6735033  0.32649672]]
[[0.6719555  0.32804453]]
[[0.6936692  0.30633086]]
[[0.66474676 0.33525327]]
[[0.71332675 0.2866732 ]]
[[0.6788 0.3212]]
[[0.672197  0.3278031]]
[[0.72072566 0.2792744 ]]
[[0.69

[[0.59669936 0.40330064]]
[[0.66096425 0.33903578]]
[[0.6726674  0.32733265]]
[[0.69378275 0.3062172 ]]
[[0.6716197  0.32838026]]
[[0.71312493 0.28687504]]
[[0.7043822  0.29561782]]
[[0.7319289 0.2680711]]
[[0.5684682 0.4315318]]
[[0.67851454 0.32148546]]
[[0.67246497 0.32753497]]
[[0.6928317  0.30716833]]
[[0.6724955 0.3275045]]
[[0.6728871 0.3271129]]
[[0.1626742 0.8373258]]
[[0.6906729  0.30932713]]
[[0.67818296 0.32181698]]
[[0.41570598 0.584294  ]]
[[0.6724936  0.32750645]]
[[0.67228866 0.32771134]]
[[0.14318158 0.85681844]]
[[0.6813407  0.31865928]]
[[0.6720577 0.3279423]]
[[0.70408136 0.29591867]]
[[0.6835637  0.31643623]]
[[0.696584   0.30341595]]
[[0.6878845 0.3121155]]
[[0.6796394  0.32036063]]
[[0.6855701  0.31442985]]
[[0.67240655 0.32759348]]
[[0.68199486 0.3180051 ]]
[[0.03253253 0.9674675 ]]
[[0.72085315 0.2791468 ]]
[[0.6349909  0.36500916]]
[[0.71405375 0.2859462 ]]
[[0.6735083  0.32649174]]
[[0.67258406 0.32741594]]
[[0.18019871 0.81980133]]
[[0.2685419 0.7314581]]
[[

## Save results to .csv file

In [41]:
res = pd.DataFrame()
res['engine_no'] = list(range(0,707))

r = np.zeros(707)
for i in range(len(eng_numb)):
    r[eng_numb[i]] = results[i]

rr = np.asarray(r, dtype=int)
res['result'] = rr
res.to_csv('submission.csv', sep=',', encoding='utf-8', index=False)