# Reality Mining. Jupyter Notebook implementation


The `realitymining.mat` file is read with the SciPy library and stored in the `data` variable.
Since the MATLAB file contains a lot of data, the reading process takes a lot of computer resources!

In [None]:
import math

import numpy as np
import scipy.io
import tensorflow as tf
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.neural_network import BernoulliRBM
from sklearn.neural_network import MLPClassifier

# Parse the MATLAB file and keep only its top-level entry 's'.
# The whole file is read at once, which is slow and memory-hungry.
mat_contents = scipy.io.loadmat("realitymining.mat")
data = mat_contents['s']

In [None]:
# Pull out the two fields used by the rest of the notebook:
#   data_mat   - per-subject location-code matrices (indexed [hour][day] below)
#   affilation - per-subject affiliation entries (compared against 'sloan' below)
data_mat, affilation = data['data_mat'], data['my_affil']

# Restricted Boltzmann machine

### Data preprocessing.

Extract the __data_mat__ elements for each subject, categorize the subjects into __sloan__ and __no sloan__, and save the indices of the subjects who belong to the 'sloan' category.
<br>
Also, the __frequency features and labels lists__ are created for the Multi-layer Perceptron.

In [None]:
# Number of days of activity every RBM sample must cover
observation_days = 7

# Inputs of the frequency-based MLP: one matrix and one 0/1 label per usable subject
frequency_list = []
frequency_labels = []

# Inputs of the RBM: only subjects with at least `observation_days` recorded days.
# `sloan_list` stores positions *within features_list* of the sloan subjects,
# and `count` tracks the next position in features_list.
features_list = []
sloan_list = []
count = 0

for subject_affil, subject_mat in zip(affilation[0], data_mat[0]):
    # Subjects without any location data or without an affiliation are skipped
    if len(subject_mat) == 0 or len(subject_affil) == 0:
        continue

    tag = subject_affil[0][0][0]
    is_sloan = tag == 'sloan' or tag == 'sloan_2'

    frequency_list.append(subject_mat)
    frequency_labels.append(1 if is_sloan else 0)

    # The first row's length is the number of recorded days for this subject
    if len(subject_mat[0]) >= observation_days:
        if is_sloan:
            sloan_list.append(count)
        features_list.append(subject_mat)
        count += 1

### Exclude NaN values.
Change all NaN values in the `features_list` data to the value 4.

In [None]:
# Overwrite every missing observation with code 4 ("phone is off").
# The matrices are edited in place, so the same arrays referenced from
# frequency_list are updated as well.
# NOTE(review): assumes each entry is a numeric NumPy array - confirm.
for activity in features_list:
    activity[np.isnan(activity)] = 4

### Creating Features vector for training and testing.
<br>
__observation_days__ - the number of days of subject activity that is used. By default we use one week - 7 days.
<br>
__all_places__ - it is a list of all possible places in data_set :

 Value | Explanation
  -------------  | -------------
  0 | No signal
  1 | Home
  2 | Work
  3 | Elsewhere
  4 | Phone is off
<br>
__count__ - Current index of element position in features vector.
<br>
__features__ - 2-dimensional NumPy ndarray for storing the features for RBM training:
<br>
* 1-st dimension - Subject
* 2-nd dimension - Subject activity each hour of the week.



In [None]:
#  0 – no signal, 1 – home, 2 – work, 3 – elsewhere, 4 – phone is off

all_places = [0, 1, 2, 3, 4]
# Number of one-hot slots reserved for a single place: one per hour of the observed days
slots_per_place = observation_days * 24

features = np.zeros((len(features_list), slots_per_place * len(all_places)))

for subject, activity in enumerate(features_list):
    # activity is indexed [hour][day]; transposing and flattening lays the
    # observed window out day by day, hour by hour.
    window = activity[:24, :observation_days].T
    for number, place in enumerate(all_places):
        start = number * slots_per_place
        # 1 where the subject was at `place` during that hour, 0 otherwise
        features[subject, start:start + slots_per_place] = (window == place).ravel()

# The running position counter is left at zero, as before
count = 0

### RBM Neural Network implementation

In [None]:
# Train an RBM with 100 hidden units on the one-hot activity vectors of all
# subjects and keep the hidden-unit activations in rbm_list.
# random_state pins the RBM's random number generator so the learned
# representation is the same on every Restart & Run All.
sk_rbm = BernoulliRBM(n_components=100, verbose=True, learning_rate=0.1,
                      n_iter=1000, random_state=0)
rbm_list = sk_rbm.fit_transform(features)

### Labels creation for Multi-layer perceptron
Each subject is classified as sloan or no_sloan.
<br>
If a subject belongs to the __sloan__ category, the label value is __1__.
<br>
If a subject belongs to the __no sloan__ category, the label value is __0__.

In [None]:
# One label per row of `features`: 1 when the row position was recorded in
# sloan_list during preprocessing, 0 otherwise.
sloan_positions = set(sloan_list)
labels = [1 if row in sloan_positions else 0 for row in range(len(features))]

rbm_labels = np.array(labels)

### Frequency features creation for Multi-layer perceptron
The frequencies of each subject are calculated for every hour of the day by counting how often each location category occurs and dividing each count by the number of recorded days.

In [None]:
# 1 – home, 2 – work, 3 – elsewhere, 0 – no signal, 4 – phone is off
def _hourly_shares(day_codes):
    """Return the [home, work, elsewhere, phone_off] shares for one hour of the day.

    day_codes holds the location code observed at that hour on each recorded
    day. Every value other than 0, 1, 2 or 3 (including NaN) is counted as
    "phone off". The "no signal" share is counted but deliberately left out of
    the result. A count of zero yields 0 without dividing, so an empty row does
    not raise ZeroDivisionError.
    """
    n_days = len(day_codes)
    home = sum(1 for code in day_codes if code == 1)
    work = sum(1 for code in day_codes if code == 2)
    elsewhere = sum(1 for code in day_codes if code == 3)
    no_signal = sum(1 for code in day_codes if code == 0)
    phone_off = n_days - home - work - elsewhere - no_signal
    return [tally / n_days if tally != 0 else 0
            for tally in (home, work, elsewhere, phone_off)]


# One row per subject: 24 hours x 4 shares, ordered hour by hour
frequency = [
    [share for hour in range(24) for share in _hourly_shares(subject_mat[hour])]
    for subject_mat in frequency_list
]
frequency_features = np.array(frequency)

### RBM Neural Network implementation (second method)
Split the RBM features and labels into training data and test data.

In [None]:
# Hold out 30% of the subjects for testing; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(features, rbm_labels, test_size=0.3, random_state=0)

# Fit a fresh RBM on the training subjects only and replace X_train by its
# hidden-unit activations. X_test stays raw here; it is passed through
# sk_rbm.transform at evaluation time.
# random_state pins the RBM's random number generator for reproducible runs.
# rbm_list from the first method is intentionally left untouched.
sk_rbm = BernoulliRBM(n_components=100, verbose=True, learning_rate=0.1,
                      n_iter=1000, random_state=0)
X_train = sk_rbm.fit_transform(X_train)

Define the parameters for the MLP:
* n_inputs - number of features in each training sample.
* n_hidden1, n_hidden2 - number of neurons in the first and second hidden layers.
* n_outputs - number of classes. This number is equal to 2, because there are two labels: sloan and no_sloan.
* n_epochs - number of epochs, i.e. the number of times all of the training vectors are used once to update the weights.
* batch_size - the number of samples that are propagated through the network at once.

In [None]:
# Layer sizes of the MLP: the input width follows the RBM-transformed training data
n_inputs = X_train.shape[1]
n_hidden1 = 300
n_hidden2 = 100
n_outputs = 2

# Graph inputs: a batch of feature vectors and the matching integer class labels.
# The leading dimension of X is None so that any batch size can be fed.
X = tf.placeholder(dtype=tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(dtype=tf.int64, shape=None, name="y")

Define a neuron layer that applies an (optional) activation function to the weighted inputs.

In [None]:
def neuron_layer(X, n_neurons, name, activation=None):
    """Build one fully connected layer inside its own TensorFlow name scope.

    Parameters
    ----------
    X : tensor whose second dimension is the number of inputs to the layer.
    n_neurons : number of neurons, i.e. the width of the layer's output.
    name : name scope under which the layer's variables and ops are created.
    activation : optional callable applied to the weighted sum; when None the
        raw weighted sum is returned.

    Returns
    -------
    The output tensor of the layer: activation(X @ W + b), or X @ W + b when
    no activation is given.
    """
    with tf.name_scope(name):
        fan_in = int(X.get_shape()[1])

        # Weights form a (fan_in, n_neurons) matrix drawn from a truncated
        # normal distribution with mean 0 and a spread scaled by the input width.
        init_std = 2 / np.sqrt(fan_in)
        initial_weights = tf.truncated_normal((fan_in, n_neurons), stddev=init_std)
        W = tf.Variable(initial_weights, name="kernel")

        # One bias per neuron, starting at zero
        b = tf.Variable(tf.zeros([n_neurons]), name="bias")

        # Weighted sum of the inputs for every neuron
        Z = tf.matmul(X, W) + b

        # The activation, when present, post-processes the weighted sum
        if activation is None:
            return Z
        return activation(Z)

Define the first and second layers using the __ReLU activation function__. An activation function is used to produce a non-linear decision boundary via non-linear combinations of the weighted inputs. The __ReLU__ function is defined as $f(x) = \max(x, 0)$: if the input is greater than 0, the output is equal to the input; otherwise the output is 0.

In [None]:
# All layers of this MLP live under the "dnn" name scope
with tf.name_scope("dnn"):

    # First hidden layer: n_hidden1=300 ReLU neurons fed by the input placeholder
    hidden1 = neuron_layer(X, n_neurons=n_hidden1, name="hidden1", activation=tf.nn.relu)

    # Second hidden layer: n_hidden2=100 ReLU neurons fed by the first hidden layer
    hidden2 = neuron_layer(hidden1, n_neurons=n_hidden2, name="hidden2", activation=tf.nn.relu)

    # Output layer: no activation, so it yields n_outputs=2 raw scores (logits),
    # one per class (no_sloan / sloan)
    logits = neuron_layer(hidden2, n_neurons=n_outputs, name="outputs", activation=None)

Define __loss function__. Loss function is a performance metric on how well the Neural Network manages to reach its goal of generating outputs as close as possible to the desired values.

In [None]:
# The loss ops are grouped under the "loss" name scope
with tf.name_scope("loss"):

    # Per-sample cross entropy between the integer labels and the logits
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y)

    # The scalar training loss is the mean over all samples that were fed
    loss = tf.reduce_mean(xentropy, name="loss")

Implement the __Gradient Descent Optimizer__, which updates the weights step by step so that the loss function becomes smaller and smaller.

In [None]:
# Step size of every gradient-descent update
learning_rate = 0.01

with tf.name_scope("train"):
    # Plain gradient descent; running training_op performs one update step
    # that lowers `loss`.
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    training_op = optimizer.minimize(loss)

In [None]:
with tf.name_scope("eval"):
    # With k=1 a sample counts as correct only when the highest-scoring
    # class is the true class.
    correct = tf.nn.in_top_k(predictions=logits, targets=y, k=1)

    # Accuracy is the fraction of correct predictions
    accuracy = tf.reduce_mean(tf.cast(correct, dtype=tf.float32))

Learn the weights using the current batch and compute the accuracies on the training and validation sets using TensorFlow.

In [None]:
# Op that initializes every variable of the computation graph
init = tf.global_variables_initializer()

# TensorFlow saver for storing the model after learning (not used in this cell)
saver = tf.train.Saver()

# Number of passes over the training set
n_epochs = 100

# Number of samples used for each gradient update
batch_size = 50

# The RBM is already fitted, so the hidden-unit activations of the held-out
# set are the same in every epoch: compute them once instead of once per epoch.
X_test_rbm = sk_rbm.transform(X_test)

# Labels as an array so that a batch can be selected with an index array
y_train_arr = np.asarray(y_train)
n_train = len(X_train)

# Seeded generator so the order of the mini-batches is repeatable
batch_rng = np.random.RandomState(0)

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        # Visit every training sample once per epoch, in a new random order,
        # in chunks of at most batch_size samples. The number of steps follows
        # the actual size of the training set instead of a hard-coded count.
        order = batch_rng.permutation(n_train)
        for start in range(0, n_train, batch_size):
            batch_idx = order[start:start + batch_size]

            # Weights are learned using the current batch
            sess.run(training_op, feed_dict={X: X_train[batch_idx],
                                             y: y_train_arr[batch_idx]})

        # Accuracies on the full training set and on the held-out set
        # (reported as "Val accuracy")
        acc_train = accuracy.eval(feed_dict={X: X_train, y: y_train})
        acc_val = accuracy.eval(feed_dict={X: X_test_rbm, y: y_test})
        print(epoch, "Train accuracy:", acc_train, "Val accuracy:", acc_val)

###  First method: training and accuracy of the frequency-based MLP neural network using sklearn.
Computes the accuracy of the frequency-based MLP for comparison.

In [None]:
# Classification from the per-hour location-frequency features, used as a
# baseline to compare with the RBM-based pipeline.
# The split gets its own names so that the RBM split (X_train, X_test,
# y_train, y_test) is not overwritten and the TensorFlow cells stay re-runnable.
freq_X_train, freq_X_test, freq_y_train, freq_y_test = train_test_split(
    frequency_features, frequency_labels, test_size=0.3, random_state=0)

# Small MLP with two hidden layers of 5 and 2 neurons, trained with L-BFGS
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(5, 2), random_state=1)

model = clf.fit(freq_X_train, freq_y_train)
predicted_labels = model.predict(freq_X_test)
print("Accuracy %f" % metrics.accuracy_score(freq_y_test, predicted_labels))