In [None]:
import numpy as np
import pandas as pd
import sklearn
from sklearn import model_selection
from sklearn.model_selection import KFold
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In the workshop for this week, you are to select a data set from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/APS+Failure+at+Scania+Trucks) 




From the website, a description of the data set.

"The dataset consists of data collected from heavy Scania 
trucks in everyday usage. The system in focus is the 
Air Pressure system (APS) which generates pressurised 
air that are utilized in various functions in a truck, 
such as braking and gear changes. The datasets' 
positive class consists of component failures 
for a specific component of the APS system. 
The negative class consists of trucks with failures 
for components not related to the APS. The data consists 
of a subset of all available data, selected by experts. "


"Attribute Information:
The attribute names of the data have been anonymized for 
proprietary reasons."


Data ingestion

In [None]:

# Load the raw training set. skiprows=20 discards the first 20 lines of the
# file before the header row is parsed (presumably the dataset's licence /
# description preamble - confirm against the CSV).
data = pd.read_csv(
    'aps_failure_training_set.csv',
    skiprows=20,
)


In the next block, I am wrangling the data, converting the target column into a 0 and a 1, 0 when the target class is negative and 1 when the target class is positive. 

I then remove the class column, and then make sure all of the columns are numeric. 

In [None]:
# Build the modelling frame from the raw data WITHOUT modifying `data`
# (the previous `df = data` was only an alias, so adding 'target' also changed
# the raw frame, and assigning into the `.loc` slice raised
# SettingWithCopyWarning).
#   * every column except 'class' is converted to numeric; values that cannot
#     be parsed become NaN (errors='coerce');
#   * 'target' is appended as the last column: 0 where class == 'neg', else 1.
df = (
    data.drop(columns='class')
        .apply(pd.to_numeric, errors='coerce')
        .assign(target=np.where(data['class'] == 'neg', 0, 1))
)


In [None]:
# Describe the dataset: summary statistics for every column, then dtypes and
# non-null counts.
print(df.describe())
# DataFrame.info() writes its report itself and returns None, so it is called
# bare; wrapping it in print() emitted a stray "None" line.
df.info()

Here I am splitting the attribute columns from the target column.

Then, if there are any missing values, I use the mean of that column to fill them.

Then, I am scaling the columns, so that columns with larger numbers don't influence the model more than columns with smaller numbers. 

In [None]:

      
# Separate the predictor columns from the label column.
feature_frame = df.drop(columns='target')
y = df['target']

# Fill each missing value with the mean of its own column.
column_means = feature_frame.mean()
features_imputed = feature_frame.fillna(column_means)

# Rescale every predictor column to the [0, 1] range.
# NOTE(review): the column means and the min/max used here are computed on all
# rows, i.e. before the cross-validation split - confirm that is acceptable.
scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
X = scaler.fit_transform(features_imputed)



In the block below, I am creating the Kfold indexes to do the cross validation.

In [None]:


# 10-fold splitter shared by the model cells below.
# shuffle=True is needed for random_state to have any effect; current
# scikit-learn raises a ValueError when a seed is combined with shuffle=False.
cv = KFold(n_splits=10, shuffle=True, random_state=42)


Now the data is ready to fit a model. I create a dictionary to hold the scores for evaluating the models. 

Then I create an instance of the model.

Then I use the cross validation I created above to create a test and train split to build the model. The model is then fit, and then used to predict the target on the test set. The expected target value is then used for the evaluation. 

I do this for a random forest model first, and then for a Support Vector Classification model. 

I have written out the code for a K nearest neighbors model, however it took too long to run on my machine. If I had more time or more computing power, I would run this code to try and see if it performed any better. 

In [None]:
# Cross-validate a random forest: one score per fold is appended under each
# metric name.
random_forest_scores = {'precision':[], 'recall':[], 'accuracy':[], 'f1':[]}

# Fixed seed so the forest, and therefore the reported scores, are
# reproducible from run to run.
best_random = RandomForestClassifier(random_state=42)


for train_index, test_index in cv.split(X):
  print("Train Index:",train_index,"\n")
  print("Test Index:",test_index)
  X_train, X_test = X[train_index], X[test_index]
  # cv.split yields positional row numbers, so the pandas target is indexed
  # with .iloc rather than by label.
  y_train, y_test = y.iloc[train_index], y.iloc[test_index]
  best_random.fit(X_train, y_train)
  predicted = best_random.predict(X_test)
  expected  = y_test
  # average="weighted" combines the per-class scores weighted by class support.
  random_forest_scores['precision'].append(metrics.precision_score(expected, predicted, average="weighted"))
  random_forest_scores['recall'].append(metrics.recall_score(expected, predicted, average="weighted"))
  random_forest_scores['accuracy'].append(metrics.accuracy_score(expected, predicted))
  random_forest_scores['f1'].append(metrics.f1_score(expected, predicted, average="weighted"))


In [None]:
# Report each metric averaged over the cross-validation folds.
print("Validation scores  for the random forrest model are as follows:\n")
rf_mean_by_metric = pd.DataFrame(random_forest_scores).mean()
print(rf_mean_by_metric)



In [None]:
# Cross-validate a support vector classifier: one score per fold is appended
# under each metric name.
svc_scores = {'precision':[], 'recall':[], 'accuracy':[], 'f1':[]}

best_svc = SVC()


for train_index, test_index in cv.split(X):
  print("Train Index:",train_index,"\n")
  print("Test Index:",test_index)
  X_train, X_test = X[train_index], X[test_index]
  # cv.split yields positional row numbers, so the pandas target is indexed
  # with .iloc rather than by label.
  y_train, y_test = y.iloc[train_index], y.iloc[test_index]
  best_svc.fit(X_train, y_train)
  predicted = best_svc.predict(X_test)
  expected  = y_test
  # average="weighted" combines the per-class scores weighted by class support.
  svc_scores['precision'].append(metrics.precision_score(expected, predicted, average="weighted"))
  svc_scores['recall'].append(metrics.recall_score(expected, predicted, average="weighted"))
  svc_scores['accuracy'].append(metrics.accuracy_score(expected, predicted))
  svc_scores['f1'].append(metrics.f1_score(expected, predicted, average="weighted"))


In [None]:

# Report each metric averaged over the cross-validation folds.
print("Validation scores for the svc model are as follows:\n")
svc_mean_by_metric = pd.DataFrame(svc_scores).mean()
print(svc_mean_by_metric)



# Cross-validate a k-nearest-neighbours classifier (k=12): one score per fold
# is appended under each metric name, then the fold averages are printed.
knn_scores = {'precision':[], 'recall':[], 'accuracy':[], 'f1':[]}

best_knn = KNeighborsClassifier(n_neighbors=12)


for train_index, test_index in cv.split(X):
  print("Train Index:",train_index,"\n")
  print("Test Index:",test_index)
  X_train, X_test = X[train_index], X[test_index]
  # cv.split yields positional row numbers, so the pandas target is indexed
  # with .iloc rather than by label.
  y_train, y_test = y.iloc[train_index], y.iloc[test_index]
  best_knn.fit(X_train, y_train)
  predicted = best_knn.predict(X_test)
  expected  = y_test
  # average="weighted" combines the per-class scores weighted by class support.
  knn_scores['precision'].append(metrics.precision_score(expected, predicted, average="weighted"))
  knn_scores['recall'].append(metrics.recall_score(expected, predicted, average="weighted"))
  knn_scores['accuracy'].append(metrics.accuracy_score(expected, predicted))
  knn_scores['f1'].append(metrics.f1_score(expected, predicted, average="weighted"))



print("Validation scores for the knn are as follows:\n")

print(pd.DataFrame(knn_scores).mean())


