# First Tensorflow Model w/o Preprocessing

*Notes:*

In [15]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

### Load data from csv

In [16]:
building_df = pd.read_csv("../data/raw/building_metadata.csv")
weather_train = pd.read_csv("../data/raw/weather_train.csv")
train = pd.read_csv("../data/raw/train.csv")

### Join the single dataframes

In [17]:
train = train.merge(building_df, left_on = "building_id", right_on = "building_id", how = "left")
train = train.merge(weather_train, left_on = ["site_id", "timestamp"], right_on = ["site_id", "timestamp"])
del weather_train

### Introduce function for reducing in-memory size of data set

In [18]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [19]:
train = reduce_mem_usage(train)

Mem. usage decreased to 1036.44 Mb (60.3% reduction)


#### Which columns are in the set?

In [20]:
train.columns

Index(['building_id', 'meter', 'timestamp', 'meter_reading', 'site_id',
       'primary_use', 'square_feet', 'year_built', 'floor_count',
       'air_temperature', 'cloud_coverage', 'dew_temperature',
       'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction',
       'wind_speed'],
      dtype='object')

#### Isolate timestamp elements

*Note: I didn't add year as single feature as it is not recurrent and not useful in future implementations*

In [21]:
train["timestamp"] = pd.to_datetime(train["timestamp"])
train["hour"] = train["timestamp"].dt.hour
train["weekday"] = train["timestamp"].dt.weekday
train["month"] = train["timestamp"].dt.month

del train["timestamp"]

#### Encode categorical data

*Note: nominal features should probably be one-hot-encoded. I got out of memory errors on the first try so maybe we are looking for better code/approach*

In [22]:
nominal_cols = ["building_id", "meter", "site_id", "primary_use"]
ordinal_cols = ["hour", "weekday", "month"]

#one_hot = OneHotEncoder(categorical_features=nominal_cols)
#train = one_hot.fit_transform(train)
train[ordinal_cols + nominal_cols] = train[ordinal_cols + nominal_cols].apply(LabelEncoder().fit_transform)

#### Substitute NaNs with 0

In [23]:
train = train.fillna(0)
train.isnull().sum().sum()

0

#### Normalize data

In [24]:
y_train = train["meter_reading"]
del train["meter_reading"]
train = StandardScaler().fit_transform(train)

In [25]:
train

array([[-1.86975219, -0.71071682, -1.56469446, ..., -1.66259743,
         0.49707282, -1.62524441],
       [-1.86740682, -0.71071682, -1.56469446, ..., -1.66259743,
         0.49707282, -1.62524441],
       [-1.86506146, -0.71071682, -1.56469446, ..., -1.66259743,
         0.49707282, -1.62524441],
       ...,
       [ 1.52164554, -0.71071682,  1.38097395, ...,  1.65927177,
         0.99787434,  1.57642821],
       [ 1.5239909 , -0.71071682,  1.38097395, ...,  1.65927177,
         0.99787434,  1.57642821],
       [ 1.52633627, -0.71071682,  1.38097395, ...,  1.65927177,
         0.99787434,  1.57642821]])

### Build Model

In [26]:
X_train, X_test, y_train, y_test = train_test_split(train, y_train, test_size=0.2, random_state=420)
del train

In [27]:
batch_size = 512
epochs = 10

In [35]:
model = Sequential()
model.add(Dense(64, input_shape=(X_train.shape[1],), activation="relu"))
model.add(Dense(32, activation="relu"))
model.add(Dense(16, activation="relu"))
model.add(Dense(1))

In [36]:
model.compile(optimizer="adam",
              loss="mean_squared_logarithmic_error",
              metrics=["mean_squared_logarithmic_error"])

In [37]:
model.fit(X_train, y_train.values, batch_size=batch_size, epochs=epochs)

Train on 16100484 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f44b4bc3f90>

In [39]:
model.evaluate(X_test, y_test.values)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1.7194584851329462, 1.7194499]