# Data Preprocessing
Purpose: Convert the raw text files into labelled, normalized CSV files

- Regression models: How many more cycles an engine will function before it fails?

## Import

In [26]:
import numpy as np
import statistics
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import os

## Get Working Directory

In [27]:
# Show the kernel's working directory: the relative 'Dataset/...' paths used
# by the cells below are resolved against it.
os.getcwd()

'C:\\Users\\User\\SmartMaintenanceML'

## Data Extraction

In [28]:
# Labels for the 26 fields of every train/test row: the unit id, the cycle
# counter, three `setting` columns and the columns `s1`..`s21`.
names = ['id', 'cycle', 'setting1', 'setting2', 'setting3', 's1', 's2', 's3','s4', 's5', 's6', 's7', 's8',
         's9', 's10', 's11', 's12', 's13', 's14', 's15', 's16', 's17', 's18', 's19', 's20', 's21']


def read_whitespace_table(path, columns=None):
    """Load a headerless, whitespace-delimited text file as a DataFrame.

    Parameters
    ----------
    path : str
        Location of the text file.
    columns : list of str, optional
        Labels to apply to the columns. When omitted, the default integer
        labels assigned by pandas are kept.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file.

    Raises
    ------
    ValueError
        If `columns` is given and its length differs from the number of
        fields found in the file.

    Notes
    -----
    Fields are split on runs of whitespace rather than on a single space, so
    blanks at the end of a line do not become empty trailing columns that
    would have to be dropped by position afterwards.
    """
    frame = pd.read_csv(path, sep=r'\s+', header=None)
    if columns is not None:
        # Assigning the labels after parsing makes an unexpected field count
        # fail loudly instead of being absorbed silently.
        frame.columns = columns
    return frame


# read training data, ordered by unit id and then by cycle
train_data = read_whitespace_table('Dataset/train_FD001.txt', names).sort_values(['id', 'cycle'])

# read test data
test_data = read_whitespace_table('Dataset/test_FD001.txt', names)

# read ground truth data (a single column, left with its default label 0)
truth_df = read_whitespace_table('Dataset/RUL_FD001.txt')

In [29]:
# Report the (rows, columns) shape of each freshly loaded frame.
print("This is the size of the train dataset: {} entries and {} features".format(*train_data.shape))
print("This is the size of the test dataset: {} entries and {} features".format(*test_data.shape))
print("This is the size of the truth dataset: {} entries and {} features".format(*truth_df.shape))

This is the size of the train dataset: 20631 entries and 26 features
This is the size of the test dataset: 13096 entries and 26 features
This is the size of the truth dataset: 100 entries and 1 features


In [30]:
# Count the distinct unit ids in the training set. nunique() is used instead of
# the largest id so the figure stays a true count even when ids are not
# numbered 1..N without gaps. Only train_data is inspected here.
n_turb = train_data["id"].nunique()
# Row and column counts of the training frame.
n_train, n_features = train_data.shape
print("There is {} turbines in each dataset".format(n_turb))

There is 100 turbines in each dataset


In [31]:
# Data Labeling - add the RUL column to the training set.
# `rul` holds, for every id, the highest cycle recorded for it.
rul = (
    train_data.groupby('id', as_index=False)['cycle'].max()
    .rename(columns={'cycle': 'max'})
)

# Attach that per-id maximum to each row, then turn it into the number of
# cycles left: RUL = max cycle of the id - current cycle. The helper column
# 'max' is removed again as it is consumed.
train_data = train_data.merge(rul, on='id', how='left')
train_data['RUL'] = train_data.pop('max') - train_data['cycle']

# Binary label: label1 is 1 for rows whose RUL is at most w1 cycles, else 0.
w1 = 30
train_data['label1'] = np.where(train_data['RUL'] <= w1, 1, 0)

In [32]:
# MinMax normalization (from 0 to 1)
# Keep the raw cycle counter and add a copy of it that is rescaled together
# with the other feature columns.
train_data['cycle_norm'] = train_data['cycle']

# The id, raw cycle and label columns are left unscaled; everything else is scaled.
unscaled_cols = ['id', 'cycle', 'RUL', 'label1']
cols_normalize = train_data.columns.difference(unscaled_cols)

# Fit the scaler on the training features; the fitted object stays available
# as `min_max_scaler`.
min_max_scaler = MinMaxScaler()
scaled_values = min_max_scaler.fit_transform(train_data[cols_normalize])
norm_train_data = pd.DataFrame(scaled_values, columns=cols_normalize, index=train_data.index)

# Put unscaled and scaled columns back together and restore the original
# column order (Index.difference returns the labels sorted).
column_order = train_data.columns
join_data = train_data[column_order.difference(cols_normalize)].join(norm_train_data)
train_data = join_data.reindex(columns=column_order)

print("The size of the train data set is now: {} entries and {} features.".format(*train_data.shape))

# Persist the processed training set without the index column.
train_data.to_csv('Dataset/train.csv', encoding='utf-8', index=None)
print("Train Data saved as Dataset/train.csv")

The size of the train data set is now: 20631 entries and 29 features.


  return self.partial_fit(X, y)


Train Data saved as Dataset/train.csv


In [33]:
# Label, normalize and save the test set.
# NOTE: this cell rewrites truth_df and test_data in place, so it is meant to be
# run once, after the extraction and training-normalization cells.

# generate RUL
# Last cycle observed for every id in the test set.
rul = pd.DataFrame(test_data.groupby('id')['cycle'].max()).reset_index()
rul.columns = ['id', 'max']

# Row i of the truth frame is treated as belonging to id i + 1, and the sum
# further down pairs `rul` and `truth_df` by row position. Verify that pairing
# before touching anything: a mismatch would otherwise only show up as NaN RUL
# values, which np.where below would silently label 0.
truth_ids = truth_df.index + 1
assert np.array_equal(rul['id'].to_numpy(), truth_ids.to_numpy()), (
    "Ground-truth rows do not line up with the unit ids found in test_data"
)

truth_df.columns = ['more']
truth_df['id'] = truth_ids
# Cycle at which each unit fails = last observed cycle + cycles still to go.
truth_df['max'] = rul['max'] + truth_df['more']
truth_df.drop('more', axis=1, inplace=True)
test_data = test_data.merge(truth_df, on=['id'], how='left')
test_data['RUL'] = test_data['max'] - test_data['cycle']
test_data.drop('max', axis=1, inplace=True)


# generate label column label1 for test data, using the same threshold w1 as
# for the training data
test_data['label1'] = np.where(test_data['RUL'] <= w1, 1, 0 )

# MinMax normalization (from 0 to 1)
# The scaler fitted on the training data is only applied here (transform, not
# fit), so both sets share the same scaling.
test_data['cycle_norm'] = test_data['cycle']
norm_test_data = pd.DataFrame(min_max_scaler.transform(test_data[cols_normalize]),
                              columns=cols_normalize, index=test_data.index)
# Recombine unscaled and scaled columns, then restore the original column order.
test_join_data = test_data[test_data.columns.difference(cols_normalize)].join(norm_test_data)
test_data = test_join_data.reindex(columns = test_data.columns)
test_data = test_data.reset_index(drop=True)


print("The size of the test data set is now: {} entries and {} features.".format(test_data.shape[0],
                                                                                 test_data.shape[1]))

# Persist the processed test set without the index column.
test_data.to_csv('Dataset/test.csv', encoding='utf-8',index = None)
print("Test Data saved as Dataset/test.csv")

The size of the test data set is now: 13096 entries and 29 features.
Test Data saved as Dataset/test.csv


In [None]:
# A neural network is a supervised classification model that aims for accuracy.