## Importing necessary libraries

In [1]:
import pandas as pd 
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

from keras.models import Sequential
from keras.layers import Dense

## Importing dataset

In [2]:
# fix random seed for reproducibility
seed = 7
np.random.seed(seed)

data = pd.read_csv('gas_turbines.csv',sep=',')
data.head()

Unnamed: 0,AT,AP,AH,AFDP,GTEP,TIT,TAT,TEY,CDP,CO,NOX
0,6.8594,1007.9,96.799,3.5,19.663,1059.2,550.0,114.7,10.605,3.1547,82.722
1,6.785,1008.4,97.118,3.4998,19.728,1059.3,550.0,114.72,10.598,3.2363,82.776
2,6.8977,1008.8,95.939,3.4824,19.779,1059.4,549.87,114.71,10.601,3.2012,82.468
3,7.0569,1009.2,95.249,3.4805,19.792,1059.6,549.99,114.72,10.606,3.1923,82.67
4,7.3978,1009.7,95.15,3.4976,19.765,1059.7,549.98,114.72,10.612,3.2484,82.311


## Data Exploration and Data Understanding

### Understanding about data

Gas turbines are used to produce electricity by  using natural gas and compressed air. Here, natural gas is injected which mixes with the compressed air and ignited. The combustion produces a high-pressure, hot gas stream that flows through the turbine causing it to spin (at tremendous speeds). Consequently, this spins a generator which is connected to the turbine to produce electricity.

The Output of the power plant is dependent on few parameters which are atmospheric pressure, exhaust steam pressure, ambient temperature, and relative humidity.

CO and NOX are basically the emissions from the gas Turbine. 


Attribute Information:

The explanations of sensor measurements and their brief statistics are given below.

* Variable (Abbr.) Unit Min Max Mean
* Ambient temperature (AT) C â€“6.23 37.10 17.71
* Ambient pressure (AP) mbar 985.85 1036.56 1013.07
* Ambient humidity (AH) (%) 24.08 100.20 77.87
* Air filter difference pressure (AFDP) mbar 2.09 7.61 3.93
* Gas turbine exhaust pressure (GTEP) mbar 17.70 40.72 25.56
* Turbine inlet temperature (TIT) C 1000.85 1100.89 1081.43
* Turbine after temperature (TAT) C 511.04 550.61 546.16
* Compressor discharge pressure (CDP) mbar 9.85 15.16 12.06
* Turbine energy yield (TEY) MWH 100.02 179.50 133.51
* Carbon monoxide (CO) mg/m3 0.00 44.10 2.37
* Nitrogen oxides (NOx) mg/m3 25.90 119.91 65.29

#### Input Variables 


* Since we are suppose to take only ambient features for prediction, my input variables will be 
1. Ambient temperature
2. Ambient pressure
3. Ambient humidity

As, all the other features seems to be the gas turbine parameters, I am exculding them as of now. 


#### Output Variable

* Turbine energy yield

In [3]:
data.shape

(15039, 11)

In [4]:
data.isnull().sum()

AT      0
AP      0
AH      0
AFDP    0
GTEP    0
TIT     0
TAT     0
TEY     0
CDP     0
CO      0
NOX     0
dtype: int64

In [5]:
data.dtypes

AT      float64
AP      float64
AH      float64
AFDP    float64
GTEP    float64
TIT     float64
TAT     float64
TEY     float64
CDP     float64
CO      float64
NOX     float64
dtype: object

## Data Preparation

In [6]:
X = data[['AT','AP','AH']]
Y = data[['TEY']]

In [7]:
scalar = StandardScaler()
x_scaled = scalar.fit_transform(X)
x_scaled

array([[-1.4397781 , -0.82664395,  1.28143632],
       [-1.44960109, -0.74864748,  1.30456402],
       [-1.43472138, -0.68625031,  1.21908576],
       ...,
       [-1.38626659, -1.07623263,  1.47697056],
       [-1.42042259, -0.99823616,  1.44159024],
       [-1.43073409, -0.93583899,  1.33465179]])

In [8]:
x_scaled.mean()

-5.997177865936258e-15

In [9]:
x_scaled.std()

1.0

## Model 1 - with three input variables and 1 output variable 

## Model Building 

In [10]:
x_train,x_test,y_train,y_test = train_test_split(x_scaled,Y,test_size=0.20,random_state=12)
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((12031, 3), (3008, 3), (12031, 1), (3008, 1))

In [11]:
model = Sequential()
model.add(Dense(3, input_dim=3, activation='relu'))
model.add(Dense(3, activation='relu'))
model.add(Dense(1,activation='linear'))

In [12]:
# Compiling the model 
model.compile(loss='mean_squared_error',optimizer='adam', metrics=['mean_squared_error'])

## Model Training 

In [13]:
# Fit the model
model.fit(x_train, y_train, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1ee41f6e5b0>

## Model Testing

In [14]:
#  prediction using training dataset
y_pedict_train = model.predict(x_train)

In [15]:
# prediction using testing dataset
y_pedict_test = model.predict(x_test)

## Model Evaluation

In [16]:
print(np.sqrt(mean_squared_error(y_train,y_pedict_train)))

14.880785265514385


In [17]:
print(np.sqrt(mean_squared_error(y_test,y_pedict_test))) 

15.003314442879946


#### OBSERVATION 

Our model is achieving a stable performance with not much variance in the train and test set RMSE.

## Model 2 - with 10 input variables and 1 output variable 

## Now Changing my input variables 

### Input Varibles

* Ambient temperature 
* Ambient pressure 
* Ambient humidity 
* Air filter difference pressure 
* Gas turbine exhaust pressure 
* Turbine inlet temperature 
* Turbine after temperature 
* Compressor discharge pressure 
* Carbon monoxide 
* Nitrogen oxides 

### Output variable

* Turbine energy yield

In [18]:
x = data.drop('TEY',axis= 1)
y = data[['TEY']]

In [20]:
scalar1 = StandardScaler()
x_scaled1 = scalar1.fit_transform(x)
x_scaled1

array([[-1.4397781 , -0.82664395,  1.28143632, ..., -1.35733078,
         0.53201195,  1.3878449 ],
       [-1.44960109, -0.74864748,  1.30456402, ..., -1.36367619,
         0.56873344,  1.39300237],
       [-1.43472138, -0.68625031,  1.21908576, ..., -1.36095673,
         0.5529378 ,  1.36358566],
       ...,
       [-1.38626659, -1.07623263,  1.47697056, ..., -1.46792219,
         2.69592467,  2.17006209],
       [-1.42042259, -0.99823616,  1.44159024, ..., -1.42259784,
         1.9246834 ,  2.391165  ],
       [-1.43073409, -0.93583899,  1.33465179, ..., -1.37727349,
         1.35415028,  2.32153907]])

In [21]:
x_scaled1.mean()

-1.8868431874004496e-15

In [22]:
x_scaled1.std()

1.0

## Model Building

In [23]:
X_train,X_test,Y_train,Y_test = train_test_split(x_scaled1,y,test_size=0.20,random_state=12)
X_train.shape,X_test.shape,Y_train.shape,Y_test.shape

((12031, 10), (3008, 10), (12031, 1), (3008, 1))

In [25]:
model1 = Sequential()
model1.add(Dense(10, input_dim=10, activation='relu'))
model1.add(Dense(10, activation='relu'))
model1.add(Dense(1,activation='linear'))

In [26]:
# Compiling the model 
model1.compile(loss='mean_squared_error',optimizer='adam', metrics=['mean_squared_error'])

## Model Training

In [27]:
# Fit the model
model1.fit(X_train, Y_train, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1ee433ad790>

## Model Testing

In [28]:
#  prediction using training dataset
y_pedict_train1 = model1.predict(X_train)

In [30]:
# prediction using testing dataset
y_pedict_test1 = model1.predict(X_test)

## Model Evaluation

In [32]:
print(np.sqrt(mean_squared_error(Y_train,y_pedict_train1)))

2.0933301823981685


In [33]:
print(np.sqrt(mean_squared_error(Y_test,y_pedict_test1))) 

2.217993303371691


### OBSERVATION 

Our model is achieving a stable performance with not much variance in the train and test set RMSE.

##   CONCLUSION 

* Second model is better as the error is very less compared to first one