# Import software libraries and load the datasets

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
!pip install visualizenn

[31mERROR: Could not find a version that satisfies the requirement visualizenn (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for visualizenn[0m[31m
[0m

In [None]:
import sys                             # Read system parameters.
import os                              # Interact with the operating system.
import numpy as np                     # Work with multi-dimensional arrays and matrices.
import pandas as pd                    # Manipulate and analyze data frames.
import sklearn                         # Perform feature engineering and machine learning.
from sklearn.utils import shuffle
import matplotlib                      # Create charts.
import matplotlib.pyplot as plt
import visualizenn as VisNN            # Create neural network visualizations. # Fixed import statement to match installed module name
from time import time                  # Calculate training time.

# Summarize software libraries used.
print('Libraries used in this project:')
print('- NumPy {}'.format(np.__version__))
print('- pandas {}'.format(pd.__version__))
print('- scikit-learn {}'.format(sklearn.__version__))
print('- Matplotlib {}'.format(matplotlib.__version__))
print('- Python {}\n'.format(sys.version))

# Load the datasets.
DATA_PATH = os.path.join('.', 'occupancy_data')
print('Data files in this project:', os.listdir(DATA_PATH))

data_file_train_raw = os.path.join(DATA_PATH, 'occupancy_train.csv')
data_file_test_raw = os.path.join(DATA_PATH, 'occupancy_test.csv')
df_train = pd.read_csv(data_file_train_raw)
df_test = pd.read_csv(data_file_test_raw)
print('Loaded {} records from {}.'.format(len(df_train), data_file_train_raw))
print('Loaded {} records from {}.'.format(len(df_test), data_file_test_raw))

ModuleNotFoundError: No module named 'visualizenn'

# Get acquainted with the dataset #

In [None]:
# Shuffle the dataset.
df_train = shuffle(df_train.copy(), random_state = 765)
df_train.reset_index(inplace = True, drop = True)

df_test = shuffle(df_test.copy(), random_state = 765)
df_test.reset_index(inplace = True, drop = True)

print(df_train.info())
df_train.head(10)

# Examine the distributions of the features

In [None]:
df_train.hist(figsize = (12, 10), grid = False);

# Examine descriptive statistics

In [None]:
with pd.option_context('float_format', '{:.3f}'.format):
    display(df_train.describe())

# Split the label from the datasets

In [None]:
# Separate training and test sets already exist.

# Occupancy is the dependent variable (value to be predicted), so it will be
# removed from the training and testing data and put into a separate data frame for labels.

label_cols = ['Occupancy']

training_cols = ['Date', 'Temperature', 'RelativeHumidity', 'Light', 'CO2', 'HumidityRatio']

# Split the training and test datasets and their labels.
X_train, y_train = df_train[training_cols].copy(), df_train[label_cols].copy()
X_test, y_test = df_test[training_cols].copy(), df_test[label_cols].copy()

# Compare number of rows and columns in original data to training and testing sets.
print(f'Original set:      {df_train.append(df_test).shape}')
print('------------------------------')
print(f'Training features: {X_train.shape}')
print(f'Testing features:  {X_test.shape}')
print(f'Training labels:   {y_train.shape}')
print(f'Testing labels:    {y_test.shape}')

# Convert the `Date` column to datetime format for processing

In [None]:
X_train['Date'] = pd.to_datetime(X_train['Date'])
X_test['Date'] = pd.to_datetime(X_test['Date'])

X_train.head()

# Determine which datetime components have unique values

In [None]:
# Extract specific datetime components and retrieve unique values.
print('Unique years:   {}'.format(X_train['Date'].dt.year.unique()))
print('Unique months:  {}'.format(X_train['Date'].dt.month.unique()))
print('Unique days:    {}'.format(X_train['Date'].dt.day.unique()))
print('Unique hours:   {}'.format(X_train['Date'].dt.hour.unique()))
print('Unique minutes: {}'.format(X_train['Date'].dt.minute.unique()))
print('Unique seconds: {}'.format(X_train['Date'].dt.second.unique()))

# Split the relevant datetime features

In [None]:
def split_dt_features(dataset):

    # Retrieve days, hours, and minutes from timestamp.
    day = dataset['Date'].dt.day
    dataset['Day'] = day.astype('int64')

    hour = dataset['Date'].dt.hour
    dataset['Hour'] = hour.astype('int64')

    minute = dataset['Date'].dt.minute
    dataset['Minute'] = minute.astype('int64')

    return dataset

X_train = split_dt_features(X_train.copy())
X_test = split_dt_features(X_test.copy())

X_train.head()

# Drop the original `Date` column

In [None]:
# Date column been split into multiple columns.
print('Columns before drop:\n\n{}\n'.format(list(X_train.columns)))
X_train.drop('Date', axis = 1, inplace = True)
print('Columns after drop:\n\n{}\n'.format(list(X_train.columns)))

X_test.drop('Date', axis = 1, inplace = True)

# Standardize the features

In [None]:
from sklearn import preprocessing

def standardize(dataset):
    df_stand = dataset.copy()
    scaler = preprocessing.StandardScaler()

    df_stand[dataset.columns] = scaler.fit_transform(df_stand[dataset.columns])

    return df_stand

X_train = standardize(X_train)
X_test = standardize(X_test)

print('The features have been standardized.')

In [None]:
with pd.option_context('float_format', '{:.2f}'.format):
    display(X_train.describe())

# Train an MLP model

In [None]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes = (2),
                    activation = 'relu',
                    solver = 'adam',
                    alpha = 0.0001,
                    learning_rate_init = 0.001,
                    max_iter = 500,
                    tol = 1e-4,
                    n_iter_no_change = 10,
                    verbose = True,
                    random_state = 87)

start = time()
mlp.fit(X_train, np.ravel(y_train))
end = time()
train_time = (end - start)

# Score using the test data.
score = mlp.score(X_test, y_test)

print('\nMLP model took {:.2f} seconds to fit.'.format(train_time))
print('Accuracy: {:.0f}%'.format(score * 100))

# Visualize the loss minimization through gradient descent

In [None]:
def plot_loss(model):
    plt.plot(model.loss_curve_)
    plt.title('GD Loss Minimization')
    plt.xlabel('Steps')
    plt.ylabel('Loss')

plot_loss(mlp)

# Visualize the neural network architecture

In [None]:
def nn_diagram(X, y, model, show_weights):
    '''Generates structure of network from dataset shapes and hidden layer sizes.'''

    nn_struct = np.hstack(([X.shape[1]],
                           np.asarray(model.hidden_layer_sizes),
                           [y.shape[1]]))

    # Only plot weights if specified.
    if show_weights == True:
        network = VisNN.DrawNN(nn_struct, model.coefs_)
    else:
        network = VisNN.DrawNN(nn_struct)

    network.draw()

nn_diagram(X_train, y_train, mlp, False)

# Retrieve the neuron weights and bias terms and redraw the network architecture

In [None]:
print('Weights between input layer and hidden layer:')
print(mlp.coefs_[0], '\n')
print('Weights between hidden layer and output layer:')
print(mlp.coefs_[1], '\n')
print('Bias terms between input layer and hidden layer:')
print(mlp.intercepts_[0], '\n')
print('Bias terms between hidden layer and output layer:')
print(mlp.intercepts_[1])

In [None]:
nn_diagram(X_train, y_train, mlp, True)

# Fit an MLP model using grid search with cross-validation

In [None]:
from sklearn.model_selection import GridSearchCV

mlp = MLPClassifier(alpha = 0.0001,
                    learning_rate_init = 0.001,
                    max_iter = 500,
                    tol = 1e-4,
                    n_iter_no_change = 10,
                    random_state = 87)

grid = {'hidden_layer_sizes': [(5), (6)],
        'activation': ['logistic', 'tanh', 'relu'],
        'solver': ['sgd', 'adam']}

search = GridSearchCV(mlp, param_grid = grid, scoring = 'accuracy', cv = 5)

start = time()
search.fit(X_train, np.ravel(y_train))
end = time()
train_time = (end - start)

print('Grid search took {:.2f} seconds to find an optimal fit.'.format(train_time))
print(search.best_params_)

In [None]:
score = search.score(X_test, y_test)

print('Accuracy: {:.0f}%'.format(score * 100))

# Visualize the loss minimization of the optimized model

In [None]:
plot_loss(search.best_estimator_)

# Visualize the network structure of the optimized model

In [None]:
nn_diagram(X_train, y_train, search.best_estimator_, True)

# Examine the model's predictions on the test set

In [None]:
# Show example predictions with the test data.
results = df_test.copy()
results['PredictedOccupancy'] = search.predict(X_test)

# Clarify ground truth column.
results.rename(columns = {'Occupancy': 'ActualOccupancy'}, inplace = True)

results.head(10)