**ASHRAE - Great Energy Predictor III | Kaggle Competition**


In [0]:
# Colab library to upload files to notebook
from google.colab import files
# Install Kaggle library
!pip install -q kaggle
# Upload the kaggle.json file
uploaded = files.upload()
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!ls ~/.kaggle
!ls -l ~/.kaggle
!cat ~/.kaggle/kaggle.json
# Uploading the data 
!kaggle competitions download -c ashrae-energy-prediction


Saving kaggle.json to kaggle.json
kaggle.json
total 4
-rw------- 1 root root 67 Jan  2 21:21 kaggle.json
Downloading test.csv.zip to /content
 99% 165M/167M [00:00<00:00, 169MB/s]
100% 167M/167M [00:00<00:00, 191MB/s]
Downloading weather_train.csv.zip to /content
  0% 0.00/1.27M [00:00<?, ?B/s]
100% 1.27M/1.27M [00:00<00:00, 180MB/s]
Downloading train.csv.zip to /content
 86% 103M/120M [00:00<00:00, 131MB/s] 
100% 120M/120M [00:00<00:00, 204MB/s]
Downloading sample_submission.csv.zip to /content
 81% 72.0M/88.4M [00:00<00:00, 179MB/s]
100% 88.4M/88.4M [00:00<00:00, 175MB/s]
Downloading weather_test.csv.zip to /content
  0% 0.00/2.53M [00:00<?, ?B/s]
100% 2.53M/2.53M [00:00<00:00, 83.4MB/s]
Downloading building_metadata.csv to /content
  0% 0.00/44.5k [00:00<?, ?B/s]
100% 44.5k/44.5k [00:00<00:00, 39.7MB/s]


**Import Libraries**

In [0]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

**Import Data**

In [0]:
# The building metadata is a plain CSV; the weather and meter-reading tables
# are read straight from their zip archives with the same parser options.
zip_csv_options = dict(compression='zip', header=0, sep=',', quotechar='"')

building_df = pd.read_csv("building_metadata.csv")
weather_train = pd.read_csv('weather_train.csv.zip', **zip_csv_options)
train = pd.read_csv('train.csv.zip', **zip_csv_options)

**Data Preparation**

In [0]:
# Attach the building metadata to every meter reading (left join keeps all
# readings), then the weather observations for the building's site and timestamp.
# NOTE(review): the weather merge relies on pandas' default inner join, so
# readings without a matching (site_id, timestamp) weather row are dropped.
train = train.merge(building_df, on="building_id", how="left")
train = train.merge(weather_train, on=["site_id", "timestamp"])
# The weather table is no longer needed once merged
del weather_train

In [0]:
#reduce in memory size of data set

def reduce_mem_usage(df, verbose=True, use_float16=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that fits them.

    Each column whose dtype is int16/32/64 or float16/32/64 is inspected: its
    minimum and maximum decide the smallest integer (int8..int64) or float
    (float16/float32, else float64) type it is cast to. Other columns are
    left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place (columns are reassigned).
    verbose : bool, default True
        When true, print the resulting memory usage and percentage saved.
    use_float16 : bool, default True
        When true (the original behavior) float columns whose range fits in
        float16 are cast to float16. float16 keeps only about 3 significant
        decimal digits, so pass ``False`` to use float32 as the narrowest
        float type when precision matters.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_types = (np.int8, np.int16, np.int32, np.int64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_types:
                limits = np.iinfo(candidate)
                # Bounds are inclusive: a column spanning exactly -128..127
                # fits in int8 and must not be pushed to the next wider type.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_types:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing narrower fits (this includes min/max being NaN,
                # for which every comparison above is false).
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a frame reporting zero bytes does not raise.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [0]:
# Downcast the numeric columns of the merged frame to reduce its memory footprint
train = reduce_mem_usage(train)

Mem. usage decreased to 1036.44 Mb (60.3% reduction)


In [0]:
# List the columns available after the merges
train.columns

Index(['building_id', 'meter', 'timestamp', 'meter_reading', 'site_id',
       'primary_use', 'square_feet', 'year_built', 'floor_count',
       'air_temperature', 'cloud_coverage', 'dew_temperature',
       'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction',
       'wind_speed'],
      dtype='object')

In [0]:
# Derive calendar features from the timestamp and drop the raw column.
# The year is intentionally not extracted (judged not useful as a feature).
timestamps = pd.to_datetime(train.pop("timestamp"))
for part in ("hour", "weekday", "month"):
    train[part] = getattr(timestamps.dt, part)

In [0]:
# Integer-encode the identifier/category columns and the calendar columns.
# This is label encoding (each distinct value becomes an integer code),
# not one-hot encoding.
nominal_cols = ["building_id", "meter", "site_id", "primary_use"]
ordinal_cols = ["hour", "weekday", "month"]
encoded_cols = ordinal_cols + nominal_cols
# The encoder is fitted independently on every column
train[encoded_cols] = train[encoded_cols].apply(lambda column: LabelEncoder().fit_transform(column))

In [0]:
# Replace every missing value with 0, then count what is left (expected: 0)
train = train.fillna(value=0)
train.isna().sum().sum()

0

In [0]:
# Separate the target from the features, then standardize the features.
# Meter readings take very large values; the target is left unscaled.
y_train = train.pop("meter_reading")

feature_scaler = StandardScaler()
train = feature_scaler.fit_transform(train)

In [0]:
# Inspect the standardized feature matrix (fit_transform returned a NumPy array)
train

array([[-1.86975219, -0.71071682, -1.56469446, ..., -1.66259743,
         0.49707282, -1.62524441],
       [-1.86740682, -0.71071682, -1.56469446, ..., -1.66259743,
         0.49707282, -1.62524441],
       [-1.86506146, -0.71071682, -1.56469446, ..., -1.66259743,
         0.49707282, -1.62524441],
       ...,
       [ 1.52164554, -0.71071682,  1.38097395, ...,  1.65927177,
         0.99787434,  1.57642821],
       [ 1.5239909 , -0.71071682,  1.38097395, ...,  1.65927177,
         0.99787434,  1.57642821],
       [ 1.52633627, -0.71071682,  1.38097395, ...,  1.65927177,
         0.99787434,  1.57642821]])

**Model**

In [0]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
# NOTE(review): this cell rebinds y_train and deletes train, so it cannot be re-run on its own.

X_train, X_test, y_train, y_test = train_test_split(train, y_train, test_size=0.2, random_state=420)

# Release the full feature matrix now that it has been split
del train

In [0]:
# Training hyperparameters: samples per gradient step, and passes over the training data
batch_size, epochs = 512, 10

In [0]:
# Fully connected regression network: three ReLU hidden layers narrowing from
# 64 to 16 units, followed by a single output unit with no activation.
n_features = X_train.shape[1]  # input width = number of feature columns in the training matrix
model = Sequential([
    Dense(64, input_shape=(n_features,), activation="relu"),
    Dense(32, activation="relu"),
    Dense(16, activation="relu"),
    Dense(1),
])

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [0]:
# Train with Adam on mean squared logarithmic error and report the same quantity as the metric
msle = "mean_squared_logarithmic_error"
model.compile(optimizer="adam", loss=msle, metrics=[msle])

In [0]:
# Fit on the training split; the target Series is passed as its underlying array
model.fit(X_train, y_train.values, batch_size=batch_size, epochs=epochs)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 16100484 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7faef8861dd8>

In [0]:
# Score the held-out split; the result is [loss, metric], both MSLE as set in compile()
model.evaluate(X_test, y_test.values)



[1.6840997157663447, 1.6840922]