# <center>Feature Selection</center>

**i) Split the dataset into train and test sets**<br>
**ii) Select only those features which are helpful for model training.**

In [11]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Show every column when a DataFrame is displayed (None disables the column limit).
# Uses the public pd.set_option entry point rather than the indirect pd.pandas alias.
pd.set_option('display.max_columns', None)

In [2]:
# Load the dataset CSV into `df` and preview its first five rows
dataset_path = 'Steel_Feature_Engineering.csv'
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,Usage_kWh,Lagging_Current_Reactive.Power_kVarh,Leading_Current_Reactive_Power_kVarh,CO2(tCO2),Lagging_Current_Power_Factor,Leading_Current_Power_Factor,NSM,WeekStatus,Day_of_week,Load_Type,day,month,year,hour,minute
0,3.17,2.95,0.0,0.0,73.21,100.0,900,1,1,0,1,1,2018,0,15
1,4.0,4.46,0.0,0.0,66.77,100.0,1800,1,1,0,1,1,2018,0,30
2,3.24,3.28,0.0,0.0,70.28,100.0,2700,1,1,0,1,1,2018,0,45
3,3.31,3.56,0.0,0.0,68.09,100.0,3600,1,1,0,1,1,2018,1,0
4,3.82,4.5,0.0,0.0,64.72,100.0,4500,1,1,0,1,1,2018,1,15


In [3]:
## NOTE: to prevent data leakage, the data should be split first and feature engineering
## applied afterwards. For this experiment the split is done after feature engineering,
## so care must still be taken that no test-set information leaks into training here.
from sklearn.model_selection import train_test_split
# Target is 'Usage_kWh'; every other column is a predictor.
y = df['Usage_kWh']
x = df.drop(columns='Usage_kWh')
# Hold out 15% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.15,
    random_state=42,
)

In [4]:
## Assemble the hold-out frame: test features plus the target as the last column.
## `assign` returns a new DataFrame, so `x_test` itself is left untouched.
test_data = x_test.assign(Usage_kWh=y_test)
test_data.head()

Unnamed: 0,Lagging_Current_Reactive.Power_kVarh,Leading_Current_Reactive_Power_kVarh,CO2(tCO2),Lagging_Current_Power_Factor,Leading_Current_Power_Factor,NSM,WeekStatus,Day_of_week,Load_Type,day,month,year,hour,minute,Usage_kWh
11356,3.82,0.0,0.0,60.2,100.0,26100,0,0,0,29,4,2018,7,15,2.88
26543,48.02,0.0,0.03,78.46,100.0,43200,1,4,2,10,4,2018,12,0,60.77
2445,59.65,0.0,0.06,89.61,100.0,41400,1,5,2,26,1,2018,11,30,120.42
14649,0.0,16.6,0.0,100.0,18.53,52200,0,6,0,6,2,2018,14,30,3.13
32699,20.99,0.0,0.03,94.19,100.0,54000,1,5,1,12,7,2018,15,0,58.86


In [5]:
# Write the hold-out frame to disk; the row index is not stored in the file
test_data.to_csv(path_or_buf='test.csv', index=False)

In [6]:
## Remove the columns that are not used for training.
## `drop` returns a new DataFrame, so `x_train` keeps all of its columns.
columns_to_drop = ['Day_of_week', 'day', 'month', 'year', 'minute']
data = x_train.drop(columns=columns_to_drop)
data.head()

Unnamed: 0,Lagging_Current_Reactive.Power_kVarh,Leading_Current_Reactive_Power_kVarh,CO2(tCO2),Lagging_Current_Power_Factor,Leading_Current_Power_Factor,NSM,WeekStatus,Load_Type,hour
31511,3.1,0.0,0.0,75.14,100.0,21600,0,0,6
24969,5.4,0.0,0.0,47.94,100.0,9000,1,0,2
15561,5.51,0.0,0.0,49.03,100.0,9000,1,0,2
23246,5.4,0.0,0.0,45.25,100.0,13500,1,0,3
6870,46.26,0.0,0.05,91.75,100.0,49500,1,2,13


In [7]:
## Partition the feature columns of `data` into continuous and discrete groups.
## The target is excluded *before* classifying: previously that check sat in an
## `elif` after the "not discrete" test and could never be reached, so the target
## would have been reported as a continuous feature if it were present.
target_name = 'Usage_kWh'
discrete_names = ['WeekStatus', 'Load_Type', 'hour']

# Keep the original column order in both resulting lists.
feature_names = [name for name in data.columns if name != target_name]
continuous = [name for name in feature_names if name not in discrete_names]
discrete = [name for name in feature_names if name in discrete_names]

print("Continuous Features: ",continuous)
print("Discrete Features: ", discrete)

Continuous Features:  ['Lagging_Current_Reactive.Power_kVarh', 'Leading_Current_Reactive_Power_kVarh', 'CO2(tCO2)', 'Lagging_Current_Power_Factor', 'Leading_Current_Power_Factor', 'NSM']
Discrete Features:  ['WeekStatus', 'Load_Type', 'hour']


In [8]:
## Feature scaling
## Standardize only the continuous features; the discrete ones are left as-is.
from sklearn.preprocessing import StandardScaler
cols = data.columns  # retained from the original cell; not used by the scaling below
scaler = StandardScaler()
# Select the columns by name (the `continuous` list built in the previous cell) instead
# of by position, so the result does not depend on the discrete columns being last.
# Assigning through data[...] replaces each column outright, so a column that is not
# already float (NSM looks integer-typed in the preview) takes the scaled float values
# without relying on in-place upcasting.
data[continuous] = scaler.fit_transform(data[continuous])
data.head()

Unnamed: 0,Lagging_Current_Reactive.Power_kVarh,Leading_Current_Reactive_Power_kVarh,CO2(tCO2),Lagging_Current_Power_Factor,Leading_Current_Power_Factor,NSM,WeekStatus,Load_Type,hour
31511,-0.607869,-0.522712,-0.712161,-0.287349,0.514084,-0.845782,0,0,6
24969,-0.466496,-0.522712,-0.712161,-1.723965,0.514084,-1.351516,1,0,2
15561,-0.459735,-0.522712,-0.712161,-1.666394,0.514084,-1.351516,1,0,2
23246,-0.466496,-0.522712,-0.712161,-1.866042,0.514084,-1.170897,1,0,3
6870,2.045033,-0.522712,2.389134,0.589937,0.514084,0.274057,1,2,13


In [9]:
# Append the training target as the last column (values are matched on the row index)
data = data.assign(Usage_kWh=y_train)

In [10]:
# Write the training frame to disk; the row index is not stored in the file
data.to_csv(path_or_buf='train.csv', index=False)

In [12]:
# Save standardization model
import pickle
filename = 'scaler.pkl'
# Open the file in a `with` block so the handle is flushed and closed even if
# pickling fails; the bare open() used before was never closed.
with open(filename, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)