The data and the description:
https://archive.ics.uci.edu/ml/datasets/APS+Failure+at+Scania+Trucks

Abstract: The dataset's positive class consists of failures of a specific component of the APS (Air Pressure System). The negative class consists of trucks with failures of components not related to the APS.


In [1]:
!pip install keras
!pip install tensorflow

import numpy as np
import pandas as pd

import os
import pickle
import requests as req
from io import BytesIO
import boto3

import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
pd.options.display.max_columns = None

import warnings
warnings.filterwarnings('ignore')

import predictions as pr



In [2]:
# Deployment settings used by the cells below.
# NOTE(review): presumably these target AWS S3 (boto3 is imported above) — confirm.
FOLDER = "dist"  # Should be clean before the execution
BUCKET_NAME = "mma-models-in-production"  # Unique through all accounts
REGION = "us-east-1"

In [3]:
# Read every column as text so the literal 'na' missing-value marker is kept as-is.
df_original = pd.read_csv('data/aps_failure_training_set.csv', dtype='str')

# Keep the first five columns plus the last one, dropping everything in between.
# Dropping first means the replacement below only touches the retained columns.
df_original = df_original.drop(columns=df_original.columns[5:-1])

# Replace only cells that are exactly 'na' with 0. An unanchored regex match on
# 'na' would also overwrite any cell that merely contains those two letters.
df_original = df_original.replace('na', 0)
df_original

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,eg_000
0,neg,76698,0,2130706438,280,0
1,neg,33058,0,0,0,0
2,neg,41040,0,228,100,0
3,neg,12,0,70,66,32
4,neg,60874,0,1368,458,0
...,...,...,...,...,...,...
59995,neg,153002,0,664,186,0
59996,neg,2286,0,2130706538,224,0
59997,neg,112,0,2130706432,18,0
59998,neg,80292,0,2130706432,494,0


In [50]:
from random import randint, uniform

# Cast the feature columns to numeric dtypes: eg_000 becomes int, the rest float.
for float_col in ('aa_000', 'ab_000', 'ac_000', 'ad_000'):
    df_original[float_col] = df_original[float_col].astype(float)
df_original['eg_000'] = df_original['eg_000'].astype(int)

# Draw one random value per feature, uniformly between 0 and that column's maximum.
# NOTE(review): the generator is not seeded, so the sample differs on every run.
feature_order = ['aa_000', 'ab_000', 'ac_000', 'ad_000', 'eg_000']
sample = [uniform(0, df_original[name].max()) for name in feature_order]
aa_000, ab_000, ac_000, ad_000, eg_000 = sample
np.array(sample).dtype

dtype('float64')

In [29]:
# Largest value observed in the aa_000 column.
df_original.loc[:, 'aa_000'].max()

2746564

In [4]:
def fit_normalizer(input_data: pd.DataFrame) -> StandardScaler:
    """
        Fit a scikit-learn StandardScaler on input_data and pickle it.

        Despite the "normalizer" wording in the name and log messages, the
        object fitted here is a StandardScaler, not sklearn's Normalizer.

        Parameters
        ----------
        input_data : pd.DataFrame
            Feature columns to fit the scaler on.

        Returns
        -------
        StandardScaler
            The fitted scaler, also written to FOLDER/normalizer.pkl.
    """
    scaler = StandardScaler()
    print('Fitting a Normalizer with given input')
    scaler.fit(input_data)

    file_name = 'normalizer.pkl'
    # The output folder may not exist yet (e.g. after it was cleaned);
    # create it so that open() does not fail with FileNotFoundError.
    os.makedirs(FOLDER, exist_ok=True)
    with open(os.path.join(FOLDER, file_name), 'wb') as f:
        pickle.dump(scaler, f)

    # TODO Part 2
    # au.upload_to_s3(BUCKET_NAME, FOLDER, file_name)

    print('Normalizer saved')
    return scaler

In [5]:
# Fit the scaler on every column except the label.
normalizer = fit_normalizer(df_original.drop(columns=["class"]))

Fitting a Normalizer with given input
Normalizer saved


In [6]:
# Apply the fitted scaler to the same feature columns to get the model input.
X = normalizer.transform(df_original.drop(columns=["class"]))

In [7]:
def fit_encoder(target_list: list) -> LabelEncoder:
    """
        Fit a scikit-learn LabelEncoder on target_list and pickle it.

        Parameters
        ----------
        target_list : list
            Raw target labels to learn the classes from.

        Returns
        -------
        LabelEncoder
            The fitted encoder, also written to FOLDER/encoder.pkl.
    """
    encoder = LabelEncoder()
    print('Fitting a LabelEncoder with given target')
    encoder.fit(target_list)

    # Log the learned classes and the integer each one maps to.
    print('Found classes', encoder.classes_)
    print('Testing encoder', encoder.transform(encoder.classes_))

    file_name = 'encoder.pkl'
    # The output folder may not exist yet (e.g. after it was cleaned);
    # create it so that open() does not fail with FileNotFoundError.
    os.makedirs(FOLDER, exist_ok=True)
    with open(os.path.join(FOLDER, file_name), 'wb') as f:
        pickle.dump(encoder, f)

    # TODO Part 2
    # au.upload_to_s3(BUCKET_NAME, FOLDER, file_name)

    print('Encoder saved')
    return encoder

In [8]:
# Pull the whole label column out as a plain Python list, then fit the encoder on it.
target_list = df_original['class'].to_list()
encoder = fit_encoder(target_list=target_list)

Fitting a LabelEncoder with given target
Found classes ['neg' 'pos']
Testing encoder [0 1]
Encoder saved


In [9]:
def encode_and_one_hot_target(target_list: list, encoder: LabelEncoder) -> np.ndarray:
    """
        Transform raw labels with an already-fitted encoder.

        Note: despite the function name, no one-hot expansion is performed;
        the return value is exactly what encoder.transform produces.
    """
    print('Encoding target with given encoder')
    encoded = encoder.transform(target_list)

    print('Target final shape', encoded.shape)
    return encoded

In [10]:
# Encode the label list with the fitted encoder.
Y = encode_and_one_hot_target(target_list=target_list, encoder=encoder)

Encoding target with given encoder
Target final shape (60000,)


The set is very unbalanced: label 0 is far more frequent than label 1, so the algorithm needs to adjust for that. 
This is done with the `class_weight` hyperparameter: class 1 is given a weight equal to the ratio of the number of 0s to the number of 1s in the training labels.

In [11]:
# Hold out 20% of the rows for evaluation; random_state is fixed at 0.
X_tr, X_t, Y_tr, Y_t = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [12]:
def fit_lr(X_tr: np.ndarray, Y_tr: np.ndarray) -> LogisticRegression:
    """
        Fit a class-weighted logistic regression, pickle it and return it.

        Class 1 is weighted by the ratio (number of 0 labels) / (number of
        1 labels) in Y_tr to compensate for class imbalance. Only class 1 is
        listed in class_weight; per the scikit-learn documentation, classes
        missing from the dict keep a weight of one.

        Parameters
        ----------
        X_tr : np.ndarray
            Training features.
        Y_tr : np.ndarray
            Training labels encoded as 0 / 1.

        Returns
        -------
        LogisticRegression
            The fitted model, also written to FOLDER/model.pkl.

        Raises
        ------
        ValueError
            If Y_tr contains no samples labelled 1, so no ratio can be computed.
    """
    labels = np.asarray(Y_tr)
    # Vectorised counts instead of Python-level iteration over the mask.
    n_neg = (labels == 0).sum()
    n_pos = (labels == 1).sum()
    if n_pos == 0:
        # Fail early with a clear message instead of producing an inf weight.
        raise ValueError('Y_tr contains no samples of class 1; cannot compute class weight')
    weight = n_neg / n_pos

    # Instantiate the model
    lr_full = LogisticRegression(C = 1, class_weight={1:weight}, random_state = 0)

    # Fitting the model
    print('Fitting model')
    model = lr_full.fit(X_tr, Y_tr)

    file_name = 'model.pkl'
    # The output folder may not exist yet (e.g. after it was cleaned);
    # create it so that open() does not fail with FileNotFoundError.
    os.makedirs(FOLDER, exist_ok=True)
    with open(os.path.join(FOLDER, file_name), 'wb') as f:
        pickle.dump(model, f)

    # TODO Part 2
    # au.upload_to_s3(BUCKET_NAME, FOLDER, file_name)

    print('Model saved')
    return model

In [13]:
# Train the logistic regression on the training split.
model = fit_lr(X_tr=X_tr, Y_tr=Y_tr)

Fitting model
Model saved


In [14]:
# Predict labels for the held-out split.
Y_pr = model.predict(X=X_t)

In [15]:
# Fraction of held-out rows whose predicted label matches the true one.
accuracy_score(y_true=Y_t, y_pred=Y_pr)

0.9545833333333333

In [16]:
#calculate the score using confusion matrix values
def score(cm, fp_cost=10, fn_cost=500, scale=1.33):
    """
    Compute a misclassification cost from a 2x2 confusion matrix.

    Parameters
    ----------
    cm : 2x2 indexable (nested list or array)
        Confusion matrix. cm[0][1] is charged fp_cost and cm[1][0] is charged
        fn_cost. With a matrix from sklearn's confusion_matrix(y_true, y_pred)
        and labels 0/1, these are the false positives and false negatives.
    fp_cost : number, default 10
        Cost charged per cm[0][1] entry.
    fn_cost : number, default 500
        Cost charged per cm[1][0] entry.
    scale : number, default 1.33
        Multiplier applied to the raw cost; 1.33 is because the actual test
        set is 33% larger than this test set.

    Returns
    -------
    int
        The scaled cost, truncated to an integer.
    """
    off_diag_upper = cm[0][1]
    off_diag_lower = cm[1][0]
    raw_cost = off_diag_upper * fp_cost + off_diag_lower * fn_cost
    return int(raw_cost * scale)
# Build the confusion matrix for the held-out split and turn it into a cost score.
cm = confusion_matrix(y_true=Y_t, y_pred=Y_pr)
score(cm)

21585

21585 is our baseline score (lower is better: each false positive costs 10 and each false negative costs 500).