In [7]:
import datetime
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.preprocessing import MinMaxScaler

# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; prefer the standalone joblib package and fall back for old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [3]:
# Load the raw dataset from the CSV file (path is relative to the working directory)
dataset = pd.read_csv(filepath_or_buffer='Final_Dataset.csv')

In [4]:
# Drop MPG and PM columns
dataset = dataset.drop(columns=['PM', 'MPG'])
print('Dataset shape: {}'.format(dataset.shape))

# Drop every row in which one of these columns is exactly zero.
# Each entry pairs the DataFrame column name with the label printed in the
# log output (the 'Nox' column is reported as 'NOX').
# Note: the `!= 0` comparison keeps rows whose value is NaN.
zero_filter_columns = [('HC', 'HC'), ('CO', 'CO'), ('CO2', 'CO2'), ('Nox', 'NOX')]

for column, label in zero_filter_columns:
    print('-----------------------')
    # Row count before filtering, used to report how many rows were removed
    rows = dataset.shape[0]

    dataset = dataset[dataset[column] != 0]
    print('Dropping zeros from {}'.format(label))
    print('Dropped {} rows'.format(rows - dataset.shape[0]))
    print('Dataset shape: {}'.format(dataset.shape))

Dataset shape: (95568, 22)
-----------------------
Dropping zeros from HC
Dropped 1124 rows
Dataset shape: (94444, 22)
-----------------------
Dropping zeros from CO
Dropped 2791 rows
Dataset shape: (91653, 22)
-----------------------
Dropping zeros from CO2
Dropped 11 rows
Dataset shape: (91642, 22)
-----------------------
Dropping zeros from NOX
Dropped 13503 rows
Dataset shape: (78139, 22)


In [5]:
# Scale data from 0 to 1

# One independent MinMaxScaler per column, stored in column order so that
# scalers[k] always corresponds to dataset.columns[k]
scalers = [MinMaxScaler(feature_range=(0,1)) for _ in dataset.columns]

# Work on a copy (the original frame stays untouched) with NaN replaced by 0.
# NOTE(review): the zero fill happens before fitting, so filled zeros take part
# in each column's min/max -- confirm this is intended.
data_scaled = dataset.copy().fillna(value=0)

# Fit each scaler on its own column and overwrite that column with the
# scaled values
for scaler, col_name in zip(scalers, dataset.columns):

    # MinMaxScaler expects a 2-D float array of shape (n_samples, 1)
    column_2d = data_scaled[col_name].values.astype('float64').reshape(-1, 1)

    data_scaled[col_name] = scaler.fit_transform(column_2d)

    print('Success with feature: {}'.format(col_name))

Success with feature: Year
Success with feature: Vehicle_Code
Success with feature: Manufacturer_Code
Success with feature: Displacement
Success with feature: Fuel_System
Success with feature: Gears
Success with feature: Transmission_Code
Success with feature: ETW
Success with feature: HP
Success with feature: Drive_System_Code
Success with feature: Fuel_Code
Success with feature: V_avg
Success with feature: V_max
Success with feature: V_std
Success with feature: a_pos
Success with feature: a_neg
Success with feature: Peak_pos
Success with feature: Peak_neg
Success with feature: HC
Success with feature: CO
Success with feature: CO2
Success with feature: Nox


In [6]:
# Export scalers for later use

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there)
os.makedirs("Scalers", exist_ok=True)

# One file per scaler; the index in the file name is the scaler's position
# in `scalers`, i.e. the position of its column in dataset.columns.
# NOTE: joblib files are pickle-based -- only load them from trusted sources.
for i, scaler in enumerate(scalers):

    scaler_filename = "Scalers/scaler{}.save".format(i)
    joblib.dump(scaler, scaler_filename)