In [1]:
#Import Libraries
import pandas as pd
import numpy as np
import tensorflow as tf

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import mean_absolute_error

In [2]:
data = pd.read_csv('ps6_trainvalid.csv')

# Data Preprocessing

In [3]:
#Drop first row of null values
data = data.drop(index=0)

In [4]:
data.describe()

Unnamed: 0,temperature,humidity,pressure,wind_direction,wind_speed
count,45010.0,44861.0,44761.0,45012.0,45012.0
mean,290.841547,62.823767,1015.915127,139.733804,1.220652
std,6.465152,21.786687,11.316175,105.545576,1.435687
min,266.503667,5.0,886.0,0.0,0.0
25%,286.37,48.0,1013.0,29.0,0.0
50%,290.53,66.0,1016.0,140.0,1.0
75%,295.08,81.0,1021.0,230.0,2.0
max,315.47,100.0,1044.0,360.0,17.0


In [5]:
#filling null values for temp, pressure, humidity
data['temperature'] = data.temperature.fillna(method='bfill')
data['humidity'] = data.humidity.fillna(method='bfill')
data['pressure'] = data.pressure.fillna(method='bfill')

In [6]:
#Joining wind direction and wind speed to wind vector
wd_rad = data.wind_direction*np.pi/180
data['Wx'] = data.wind_speed*np.cos(wd_rad)
data['Wy'] = data.wind_speed*np.sin(wd_rad)

data = data.drop(['wind_direction','wind_speed'],axis=1)

In [7]:
#Convert date time into hour timestamp
import datetime
data['datetime'] = pd.to_datetime(data['datetime'])
timestamp_h = data['datetime'].map(datetime.datetime.timestamp)/3600

In [8]:
#Adding sin and cos terms for periodicity
day = 24
year = (365.2425)*day

data['Day sin'] = np.sin(timestamp_h * (2 * np.pi / day))
data['Day cos'] = np.cos(timestamp_h * (2 * np.pi / day))
data['Year sin'] = np.sin(timestamp_h * (2 * np.pi / year))
data['Year cos'] = np.cos(timestamp_h * (2 * np.pi / year))

In [9]:
#dropping datetime and weather
data = data.drop(['weather','datetime'],axis=1)

In [10]:
data

Unnamed: 0,temperature,humidity,pressure,Wx,Wy,Day sin,Day cos,Year sin,Year cos
1,291.870000,88.0,1013.0,0.000000,0.000000,-8.660254e-01,5.000000e-01,-0.999924,0.012325
2,291.868186,88.0,1013.0,0.000000,0.000000,-7.071068e-01,7.071068e-01,-0.999915,0.013041
3,291.862844,88.0,1013.0,0.000000,0.000000,-5.000000e-01,8.660254e-01,-0.999905,0.013758
4,291.857503,88.0,1013.0,0.000000,0.000000,-2.588190e-01,9.659258e-01,-0.999895,0.014475
5,291.852162,88.0,1013.0,0.000000,0.000000,-6.304366e-12,1.000000e+00,-0.999885,0.015192
...,...,...,...,...,...,...,...,...,...
45008,295.440000,17.0,1017.0,0.965926,-0.258819,8.660254e-01,5.000000e-01,-0.654451,0.756104
45009,296.020000,16.0,1016.0,0.965926,-0.258819,9.659258e-01,2.588190e-01,-0.653909,0.756573
45010,296.510000,17.0,1015.0,0.965926,-0.258819,1.000000e+00,1.086883e-11,-0.653367,0.757042
45011,297.090000,17.0,1014.0,0.000000,-0.000000,9.659258e-01,-2.588190e-01,-0.652824,0.757510


In [11]:
#Splitting into train, validation and test
column_indices = {name: i for i, name in enumerate(data.columns)}

n = len(data)
train_df = data[0:int(n*0.7)]
val_df = data[int(n*0.7):int(n*0.9)]
test_df = data[int(n*0.9):]

num_features = data.shape[1]

In [12]:
#Normalizing all the data
train_mean = train_df.mean()
train_std = train_df.std()

train_df = (train_df - train_mean) / train_std
val_df = (val_df - train_mean) / train_std
test_df = (test_df - train_mean) / train_std

In [13]:
train_df.describe()

Unnamed: 0,temperature,humidity,pressure,Wx,Wy,Day sin,Day cos,Year sin,Year cos
count,31508.0,31508.0,31508.0,31508.0,31508.0,31508.0,31508.0,31508.0,31508.0
mean,-6.534767e-14,-3.831587e-16,1.239591e-14,3.984368e-15,1.11419e-14,1.8499020000000003e-17,-2.873514e-17,4.327079e-16,-1.23941e-15
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-3.657681,-2.560109,-15.1886,-11.26771,-12.03217,-1.414442,-1.414068,-1.437382,-1.554323
25%,-0.6866147,-0.6908878,-0.5952103,-0.5896149,-0.2156248,-1.0002,-0.999895,-1.003597,-0.9630392
50%,-0.05083055,0.1297458,-0.1355759,0.09163101,0.2357689,-0.0001332905,6.089053e-06,0.0259358,0.1121578
75%,0.673128,0.8136072,0.6687842,0.09163101,0.2357689,0.9999339,0.9999072,1.017105,0.9786113
max,3.372504,1.771013,2.966956,13.86943,8.471816,1.414175,1.41408,1.364679,1.322268


# Single Step Prediction

## Dataset creation

In [14]:
# Function to create windows and labels
def create_dataset_single(train_df,window_size):
    train_windows = []
    train_labels = []
    
#     train_df = train_df.to_numpy()

    for i in range(len(train_df)-window_size):
        train_windows.append(train_df.iloc[i:i+window_size])
        train_labels.append(train_df['temperature'].iloc[i+window_size])
    
    return np.array(train_windows),np.array(train_labels)

In [15]:
#train,val and test for windows 24,72 and 120
train_windows24, train_labels24 = create_dataset_single(train_df,24)
train_windows72, train_labels72 = create_dataset_single(train_df,72)
train_windows120, train_labels120 = create_dataset_single(train_df,120)


val_windows24,val_labels24 = create_dataset_single(val_df,24)
val_windows72,val_labels72 = create_dataset_single(val_df,72)
val_windows120,val_labels120 = create_dataset_single(val_df,120)


test_windows24,test_labels24 = create_dataset_single(test_df,24)
test_windows72,test_labels72 = create_dataset_single(test_df,72)
test_windows120,test_labels120 = create_dataset_single(test_df,120)

## Baseline (Last Time Step)

In [16]:
def baseline(data):
    return data[:,-1,0]


baseline_test24 = mean_absolute_error(baseline(test_windows24),test_labels24)
baseline_test72 = mean_absolute_error(baseline(test_windows72),test_labels72)
baseline_test120 = mean_absolute_error(baseline(test_windows120),test_labels120)

In [17]:
baseline_test24,baseline_test72,baseline_test120

(0.13169526830479683, 0.13196778638959025, 0.13142798156378666)

## Linear Model

In [18]:
linear = tf.keras.Sequential([
    tf.keras.layers.Dense(units=1)
])

In [19]:
#Compile and Fit function for all three windows
def compile_and_fit(model):
    model.compile(loss=tf.losses.MeanSquaredError(),metrics=[tf.metrics.MeanAbsoluteError()])
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',patience=2, mode='min')
    
    print('training model 1')
    history24 = model.fit(train_windows24,train_labels24,validation_data=(val_windows24,val_labels24),
                          callbacks=[early_stopping],epochs=20)
    print()
    print('training model 2')
    
    history72 = model.fit(train_windows72,train_labels72,validation_data=(val_windows72,val_labels72),
                          callbacks=[early_stopping],epochs=20)
    print()
    print('training model 3')
    
    history120 = model.fit(train_windows120,train_labels120,validation_data=(val_windows120,val_labels120),
                          callbacks=[early_stopping],epochs=20)
    
    return history24,history72,history120

In [20]:
linear_history24,linear_history72,linear_history120 = compile_and_fit(linear)

training model 1
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20

training model 2
Epoch 1/20
Epoch 2/20
Epoch 3/20

training model 3
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20


In [21]:
linear_loss24,linear_test24 = linear.evaluate(test_windows24,test_labels24)
linear_loss72,linear_test72 = linear.evaluate(test_windows72,test_labels72)
linear_loss120,linear_test120 = linear.evaluate(test_windows120,test_labels120)



## Dense Model

In [22]:
dense_model = tf.keras.Sequential([
    tf.keras.layers.Dense(units=32, activation='relu'),
    tf.keras.layers.Dense(units=32, activation='relu'),
    tf.keras.layers.Dense(units=1),
])

In [23]:
dense_history24,dense_history72,dense_history120 = compile_and_fit(dense_model)

training model 1
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20

training model 2
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20

training model 3
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20


In [24]:
dense_loss24,dense_test24 = dense_model.evaluate(test_windows24,test_labels24)
dense_loss72,dense_test72 = dense_model.evaluate(test_windows72,test_labels72)
dense_loss120,dense_test120 = dense_model.evaluate(test_windows120,test_labels120)



## LSTM Model

In [25]:
lstm_model = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(32, return_sequences=False),
    tf.keras.layers.Dense(units=1)
])

In [26]:
compile_and_fit(lstm_model)

training model 1
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20

training model 2
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20

training model 3
Epoch 1/20
Epoch 2/20
Epoch 3/20


(<tensorflow.python.keras.callbacks.History at 0x1b48b742c88>,
 <tensorflow.python.keras.callbacks.History at 0x1b4c2bff348>,
 <tensorflow.python.keras.callbacks.History at 0x1b6d2d9e988>)

In [27]:
lstm_loss24,lstm_test24 = lstm_model.evaluate(test_windows24,test_labels24)
lstm_loss72,lstm_test72 = lstm_model.evaluate(test_windows72,test_labels72)
lstm_loss120,lstm_test120 = lstm_model.evaluate(test_windows120,test_labels120)



# Multi-Step Prediction

## Data Set Creation

In [28]:
# Function to create windows and labels
def create_dataset_multiple(train_df,window_size,label_size):
    train_windows = []
    train_labels = []
    
#     train_df = train_df.to_numpy()

    for i in range(len(train_df)-window_size-label_size):
        train_windows.append(train_df.iloc[i:i+window_size])
        train_labels.append(train_df['temperature'].iloc[i+window_size:i+label_size+window_size])
    
    return np.array(train_windows),np.array(train_labels)

In [29]:
#Train, val, test of size (24,24), (72,72), (120,120)
train_windows_multiple24, train_labels_multiple24 = create_dataset_multiple(train_df,24,24)
train_windows_multiple72, train_labels_multiple72 = create_dataset_multiple(train_df,72,72)
train_windows_multiple120, train_labels_multiple120 = create_dataset_multiple(train_df,120,120)


val_windows_multiple24, val_labels_multiple24 = create_dataset_multiple(val_df,24,24)
val_windows_multiple72, val_labels_multiple72 = create_dataset_multiple(val_df,72,72)
val_windows_multiple120, val_labels_multiple120 = create_dataset_multiple(val_df,120,120)

test_windows_multiple24, test_labels_multiple24 = create_dataset_multiple(test_df,24,24)
test_windows_multiple72, test_labels_multiple72 = create_dataset_multiple(test_df,72,72)
test_windows_multiple120, test_labels_multiple120 = create_dataset_multiple(test_df,120,120)

## Multi Baseline Model (Repeated)

In [30]:
def baseline_multi(data):
    return data[:,:,0]
    
baseline_multi24 = mean_absolute_error(baseline_multi(test_windows_multiple24),test_labels_multiple24)
baseline_multi72 = mean_absolute_error(baseline_multi(test_windows_multiple72),test_labels_multiple72)
baseline_multi120 = mean_absolute_error(baseline_multi(test_windows_multiple120),test_labels_multiple120)

baseline_multi24,baseline_multi72,baseline_multi120

(0.21222301797066634, 0.39038818488771215, 0.4387629334710285)

## Multi Linear Model

In [31]:
multi_linear_model24 = tf.keras.Sequential([
    tf.keras.layers.Lambda(lambda x: x[:, -1:, :]),
    tf.keras.layers.Dense(24),
])

multi_linear_model72 = tf.keras.Sequential([
    tf.keras.layers.Lambda(lambda x: x[:, -1:, :]),
    tf.keras.layers.Dense(72),
])

multi_linear_model120 = tf.keras.Sequential([
    tf.keras.layers.Lambda(lambda x: x[:, -1:, :]),
    tf.keras.layers.Dense(120),
])

In [32]:
#Compile and Fit function for multi step models
def compile_and_fit_multi(model,tw,tl,vw,vl):
    model.compile(loss=tf.losses.MeanSquaredError(),metrics=[tf.metrics.MeanAbsoluteError()])
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',patience=2, mode='min')
    model.fit(tw,tl,validation_data=(vw,vl), callbacks=[early_stopping],epochs=20)

In [33]:
compile_and_fit_multi(multi_linear_model24,
                      train_windows_multiple24,train_labels_multiple24,
                      val_windows_multiple24,val_labels_multiple24)
print()
compile_and_fit_multi(multi_linear_model72,
                      train_windows_multiple72,train_labels_multiple72,
                      val_windows_multiple72,val_labels_multiple72)
print()
compile_and_fit_multi(multi_linear_model120,
                      train_windows_multiple120,train_labels_multiple120,
                      val_windows_multiple120,val_labels_multiple120)

Epoch 1/20
Epoch 2/20
Epoch 3/20

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20


In [34]:
multi_linear_loss24,multi_linear_test24 = multi_linear_model24.evaluate(test_windows_multiple24,test_labels_multiple24)
multi_linear_loss72,multi_linear_test72 = multi_linear_model72.evaluate(test_windows_multiple72,test_labels_multiple72)
multi_linear_loss120,multi_linear_test120 = multi_linear_model120.evaluate(test_windows_multiple120,test_labels_multiple120)



## Multi LSTM Model

In [35]:
multi_lstm_model24 = tf.keras.Sequential([
    tf.keras.layers.LSTM(32, return_sequences=False),
    tf.keras.layers.Dense(units = 24)
])

multi_lstm_model72 = tf.keras.Sequential([
    tf.keras.layers.LSTM(32, return_sequences=False),
    tf.keras.layers.Dense(units = 72)
])

multi_lstm_model120 = tf.keras.Sequential([
    tf.keras.layers.LSTM(32, return_sequences=False),
    tf.keras.layers.Dense(units = 120)
])


In [36]:
compile_and_fit_multi(multi_lstm_model24,
                      train_windows_multiple24,train_labels_multiple24,
                      val_windows_multiple24,val_labels_multiple24)
print()
compile_and_fit_multi(multi_lstm_model72,
                      train_windows_multiple72,train_labels_multiple72,
                      val_windows_multiple72,val_labels_multiple72)
print()
compile_and_fit_multi(multi_lstm_model120,
                      train_windows_multiple120,train_labels_multiple120,
                      val_windows_multiple120,val_labels_multiple120)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20


In [37]:
multi_lstm_loss24,multi_lstm_test24 = multi_lstm_model24.evaluate(test_windows_multiple24,test_labels_multiple24)
multi_lstm_loss72,multi_lstm_test72 = multi_lstm_model72.evaluate(test_windows_multiple72,test_labels_multiple72)
multi_lstm_loss120,multi_lstm_test120 = multi_lstm_model120.evaluate(test_windows_multiple120,test_labels_multiple120)



# Results

In [38]:
single_index = ['Baseline(Last Step)','Linear','Dense','LSTM']
columns = ['Window Size 24','Window Size 72','Window Size 120']
scores24 = [baseline_test24,linear_test24,dense_test24,lstm_test24]
scores72 = [baseline_test72,linear_test72,dense_test72,lstm_test72]
scores120 = [baseline_test120,linear_test120,dense_test120,lstm_test120]

singlestep = pd.DataFrame([scores24,scores72,scores120],index=columns,columns=single_index).transpose()

heading_properties = [('font-size', '18px')]

cell_properties = [('font-size', '16px')]

dfstyle = [dict(selector="th", props=heading_properties),\
 dict(selector="td", props=cell_properties)]
singlestep.style.set_table_styles(dfstyle)

Unnamed: 0,Window Size 24,Window Size 72,Window Size 120
Baseline(Last Step),0.131695,0.131968,0.131428
Linear,0.518141,0.532831,0.54192
Dense,0.529515,0.541989,0.548427
LSTM,0.078646,0.079106,0.078998


In [39]:
multi_index = ['Baseline(Repeat)','Linear','LSTM']
columns_m = ['Window and Label Size 24','Window and Label Size 72','Window and Label Size 120']
scores_m24 = [baseline_multi24,multi_linear_test24,multi_lstm_test24]
scores_m72 = [baseline_multi72,multi_linear_test72,multi_lstm_test72]
scores_m120 = [baseline_multi120,multi_linear_test120,multi_lstm_test120]

multistep = pd.DataFrame([scores_m24,scores_m72,scores_m120],index=columns_m,columns=multi_index).transpose()

multistep.style.set_table_styles(dfstyle)

Unnamed: 0,Window and Label Size 24,Window and Label Size 72,Window and Label Size 120
Baseline(Repeat),0.212223,0.390388,0.438763
Linear,0.80318,0.808032,0.814345
LSTM,0.221473,0.292976,0.352899


In [89]:
kaggleset = pd.read_csv('ps6_test.csv')
#Joining wind direction and wind speed to wind vector
wd_rad = kaggleset.wind_direction*np.pi/180
kaggleset['Wx'] = kaggleset.wind_speed*np.cos(wd_rad)
kaggleset['Wy'] = kaggleset.wind_speed*np.sin(wd_rad)

kaggleset = kaggleset.drop(['wind_direction','wind_speed'],axis=1)

kaggleset['datetime'] = pd.to_datetime(kaggleset['datetime'])
timestamp_h1 = kaggleset['datetime'].map(datetime.datetime.timestamp)/3600

day = 24
year = (365.2425)*day

kaggleset['Day sin'] = np.sin(timestamp_h1 * (2 * np.pi / day))
kaggleset['Day cos'] = np.cos(timestamp_h1 * (2 * np.pi / day))
kaggleset['Year sin'] = np.sin(timestamp_h1 * (2 * np.pi / year))
kaggleset['Year cos'] = np.cos(timestamp_h1 * (2 * np.pi / year))

kaggleset = kaggleset.drop(['weather','datetime'],axis=1)

for i in range(120):
    kaggleset['temperature'].iloc[i] = (float(kaggleset['temperature'].iloc[i]) - train_mean['temperature']) / train_std['temperature']

for col in ['humidity','pressure','Wx','Wy','Day sin','Day cos','Year sin','Year cos']:
    kaggleset[col] = (kaggleset[col] - train_mean[col])/train_std[col]

kaggleset['temperature'] = kaggleset['temperature'].replace('?',-9999)
kaggleset

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,temperature,humidity,pressure,Wx,Wy,Day sin,Day cos,Year sin,Year cos
0,0.861852,-1.602703,-0.480302,0.091631,-1.300145,0.999934,-0.999895,-0.949455,0.974841
1,0.436449,-1.420340,-0.480302,-0.276444,-1.276811,0.707021,-1.224618,-0.948693,0.975512
2,0.086845,-1.648294,-0.250485,0.091631,-0.532188,0.365917,-1.365884,-0.947931,0.976183
3,-0.009064,0.494472,-0.250485,0.091631,0.235769,-0.000133,-1.414068,-0.947168,0.976854
4,-0.173037,-0.782069,-0.250485,1.035947,0.584414,-0.366183,-1.365884,-0.946405,0.977524
...,...,...,...,...,...,...,...,...,...
235,-9999.000000,-1.693885,-0.250485,1.093720,0.485791,1.224694,0.707043,-0.758388,1.116554
236,-9999.000000,-1.830657,-0.250485,1.093720,0.485791,1.365984,0.365995,-0.757527,1.117085
237,-9999.000000,-1.739475,-0.365393,1.093720,0.485791,1.414175,0.000006,-0.756666,1.117615
238,-9999.000000,-2.149792,-0.250485,-1.900199,-0.289545,1.365984,-0.365983,-0.755804,1.118145


In [90]:
def pred(window):
    return lstm_model.predict(window.to_numpy().reshape(-1,120,9).astype('float64'))[0][0]

In [91]:
for i in range(120):
    value = pred(kaggleset[i:i+120])
#     print(kaggleset[i:i+120])
    kaggleset.at[i+120, 'temperature'] = value

In [94]:
ans = kaggleset['temperature'][120:]

In [104]:
k = pd.DataFrame(ans*train_std['temperature'] + train_mean['temperature']).reset_index()
k = k.drop('index',axis=1).reset_index()
k.columns=['Id','Predicted']

k

Unnamed: 0,Id,Predicted
0,0,299.188582
1,1,297.590463
2,2,296.193212
3,3,294.973288
4,4,293.989738
...,...,...
115,115,295.339291
116,116,296.421828
117,117,296.952904
118,118,296.937486


In [105]:
k.to_csv('lstm_single.csv',index=False)