In [1]:
import csv
import numpy as np
import pandas as pd

## Notifications through IFTTT

In [2]:
import requests
def send_notification(message):
    """Send `message` to the IFTTT "PyCheckpoint" webhook as `value1`.

    Notifications are best-effort: a request that times out or fails is
    reported on stdout instead of raising, so a network problem cannot abort
    the training run that is only trying to report its progress.

    Parameters
    ----------
    message : str
        Text forwarded to the webhook in the JSON field `value1`.
    """
    import os

    # SECURITY: the webhook key is committed in this notebook. Set the
    # IFTTT_WEBHOOKS_URL environment variable to override it; the literal is
    # kept only as a fallback so existing runs keep working and should be
    # rotated and removed.
    IFTTT_WEBHOOKS_URL = os.environ.get(
        'IFTTT_WEBHOOKS_URL',
        'https://maker.ifttt.com/trigger/PyCheckpoint/with/key/c3GzUla4l68wv5XZaYpdMK')
    data = {'value1': message}
    try:
        # Without a timeout a stalled connection would block the caller forever.
        requests.post(IFTTT_WEBHOOKS_URL, json=data, timeout=10)
    except requests.RequestException as e:
        print("Notification not sent: "+str(e))

### Importing the data file

In [3]:
# Load the joined parcel table and report its size
parcelMain = pd.read_csv('ParcelData_Enriched_HistoryJoined.csv', low_memory=False)
print("Number of columns = {}".format(parcelMain.shape[1]))
print("Number of rows = {}".format(parcelMain.shape[0]))

# Positional (0-based) column indexes, grouped by what the columns hold.
# Contiguous runs are written as ranges; the resulting lists are unchanged.
irrelevantFields = ([0, 1, 7, 23, 24, 25, 38]
                    + list(range(43, 54))
                    + list(range(57, 79))
                    + [104, 107])
classCodes = list(range(81, 89))
assessments = list(range(89, 97))
vacantFlags = list(range(97, 102))
roomCount = [109]
assessmentDouble = [114]

demographics = list(range(26, 38))
yToPredict = [115]
XYCoords = [116, 117]

Number of columns = 118
Number of rows = 65540


### Dropping parcels that were vacant (or had no class code) in every recorded year

In [4]:
# Class codes the city uses for vacant lots/buildings, plus 0.
# Order is kept exactly as originally listed.
vs = [
    300,
    311, 312, 314, 315,
    320, 321, 322, 323,
    330, 331,
    340, 341,
    350, 351, 352,
    380, 399,
    313, 316,
    0,
]

In [5]:
from math import isnan

# Keep only parcels that were non-vacant in at least one recorded year.
# A row is dropped when EVERY class column is either missing (NaN) or, once
# truncated to an integer, one of the vacant codes in `vs`.
classes = ['Class1990','Class1996','Class2000','Class2004','Class2008','Class2011','Class2014','Class2018']

# Work on a copy: a plain assignment would alias `parcelMain`, and the in-place
# drop below would then silently modify the frame that was loaded from the CSV.
nvParcels = parcelMain.copy()

classValues = nvParcels[classes]
# np.trunc mirrors the int() truncation of the former row-by-row loop;
# NaN stays NaN and therefore never matches a code in `vs`.
vacantOrMissing = classValues.isnull() | np.trunc(classValues).isin(vs)

# Index labels of the rows being removed (same content the loop used to collect)
indexes = nvParcels.index[vacantOrMissing.all(axis=1)].tolist()
nvParcels.drop(indexes, inplace=True)
print("Dropped "+str(len(indexes))+" rows")

Dropped 2950 rows


## Drop irrelevant columns

Note: The next cell drops columns by position, so column indices will have changed once it has run. Reload the CSV before running it again.

In [6]:
# DO NOT RUN AGAIN - Unless you have reloaded the csv file.
# The columns are selected by position, so a second run on the already
# reduced frame would pick different columns.
irx = (irrelevantFields + classCodes + assessments
       + vacantFlags + roomCount + assessmentDouble)
idx = np.asarray(irx)

# Remove the selected columns, then every row that still holds a missing value
nvParcels.drop(nvParcels.columns[idx], axis='columns', inplace=True)
nvParcels.dropna(axis=0, how='any', inplace=True)

print("Number of remaining columns = {}".format(nvParcels.shape[1]))
print("Number of remaining rows = {}".format(nvParcels.shape[0]))
print(nvParcels.dtypes)

Number of remaining columns = 53
Number of remaining rows = 61578
propCrimeCount                  int64
violCrime                       int64
allVacantSqFt                 float64
nonIndVacantSqFt              float64
nonIndComVacantSqFt           float64
PropCrime2012Count              int64
PropCrime2013Count              int64
PropCrime2014Count              int64
PropCrime2015Count              int64
PropCrime2016Count              int64
PropCrime2017Count              int64
ViolentCrime2012Count           int64
ViolentCrime2013Count           int64
ViolentCrime2014Count           int64
ViolentCrime2015Count           int64
ViolentCrime2016Count           int64
ViolentCrime2017Count           int64
parcelAreaSqFt                float64
propertyCount                   int64
TotalCrime                      int64
TOTPOP00                        int64
TOTPOP10                        int64
TOTPOP_CY                       int64
DIVINDX_CY                    float64
TOTHH00               

## Model 1: Simple NN

In [7]:
from time import time

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasRegressor

import matplotlib.pyplot as plt
import itertools
from sklearn import datasets
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, r2_score
from keras.utils import to_categorical
from keras.callbacks import TensorBoard

Using TensorFlow backend.


In [8]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [9]:
# Prelim Configs
# Fixed seed for NumPy's global random number generator.
# NOTE(review): only np.random is seeded in this cell; the Keras/TensorFlow
# backend is not seeded here — confirm whether training is meant to be reproducible.
seed = 7
np.random.seed(seed)

In [10]:
# Pull the model inputs out of the frame as NumPy arrays:
#   xOrig - every column except the last three
#   yOrig - the 'AssessmentChange1814' column
xOrig = nvParcels.iloc[:, :-3].values
yOrig = nvParcels['AssessmentChange1814'].values

In [11]:
# For future reference
# Read the width from the array shape rather than from row 1, which does not
# exist when the frame has fewer than two rows.
print("Number of features: "+str(xOrig.shape[1]))

Number of features: 50


In [12]:
# Scale all data between -1,1
# Features and target each get their own scaler: a MinMaxScaler learns one
# min/max per fitted column, so the scaler fitted on the single target column
# must not be reused to transform the feature matrix.
sc1 = MinMaxScaler(copy=True, feature_range=(-1,1))
x = sc1.fit_transform(xOrig)

# sc2 remains the target scaler, so sc2.inverse_transform(...) can map
# predictions back to the original units.
sc2 = MinMaxScaler(copy=True, feature_range=(-1,1))
y = sc2.fit_transform(yOrig.reshape(-1,1))

# NOTE(review): both scalers are fitted on all rows before any train/test
# split, so validation folds influence the scaling — confirm this is acceptable.

#### Methods for plotting errors/history

In [26]:
def plot_history(history):
    """Plot the training/validation curves stored in a fit history.

    Parameters
    ----------
    history : object
        Anything exposing a `.history` dict that maps a metric name to a
        list with one value per epoch (the object returned by `model.fit`).

    Behaviour
    ---------
    * Keys containing 'loss' are drawn on figure 1, keys containing 'acc' on
      figure 2; keys that also contain 'val' are drawn in green as validation
      curves, the others in blue as training curves.
    * Each legend entry shows the value of the final epoch with 5 decimals.
    * If no training-loss key exists a message is printed and nothing is drawn.
    * The accuracy figure is only created when an accuracy metric was recorded,
      so regressors no longer produce an empty "Accuracy" plot.
    """
    metrics = history.history
    loss_list = [s for s in metrics if 'loss' in s and 'val' not in s]
    val_loss_list = [s for s in metrics if 'loss' in s and 'val' in s]
    acc_list = [s for s in metrics if 'acc' in s and 'val' not in s]
    val_acc_list = [s for s in metrics if 'acc' in s and 'val' in s]

    if len(loss_list) == 0:
        print('Loss is missing in history')
        return

    ## As loss always exists
    epochs = range(1, len(metrics[loss_list[0]]) + 1)

    def _plot_curves(names, colour, caption):
        # One line per metric, labelled with the value reached in the last epoch
        for name in names:
            values = metrics[name]
            plt.plot(epochs, values, colour,
                     label=caption + ' (' + format(values[-1], '.5f') + ')')

    ## Loss
    plt.figure(1)
    _plot_curves(loss_list, 'b', 'Training loss')
    _plot_curves(val_loss_list, 'g', 'Validation loss')
    plt.title('Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    ## Accuracy (skipped when the model tracked no accuracy metric)
    if acc_list or val_acc_list:
        plt.figure(2)
        _plot_curves(acc_list, 'b', 'Training accuracy')
        _plot_curves(val_acc_list, 'g', 'Validation accuracy')
        plt.title('Accuracy')
        plt.xlabel('Epochs')
        plt.ylabel('Accuracy')
        plt.legend()

    plt.show()

In [27]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map and show it.

    cm        -- 2-D array of counts, indexed [true label, predicted label]
    classes   -- tick labels, used for both axes
    normalize -- when True every row is divided by its row sum and the cells
                 are annotated with two decimals instead of integer counts
    cmap      -- matplotlib colour map used for the image
    """
    title = 'Confusion matrix'
    cell_format = 'd'
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
        title = 'Normalized confusion matrix'
        cell_format = '.2f'

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    # Cells above half the maximum get white text so it stays readable
    cutoff = cm.max() / 2.
    for row, col in np.ndindex(cm.shape[0], cm.shape[1]):
        value = cm[row, col]
        text_colour = "white" if value > cutoff else "black"
        plt.text(col, row, format(value, cell_format),
                 horizontalalignment="center",
                 color=text_colour)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()
    
## multiclass or binary report
## If binary (sigmoid output), set binary parameter to True
def full_multiclass_report(model,
                           x,
                           y_true,
                           classes,
                           batch_size=32,
                           binary=False):
    """Print accuracy, classification report and confusion matrix for `model`.

    model      -- classifier exposing `predict_classes(x, batch_size=...)`
    x          -- inputs handed to the model
    y_true     -- one-hot encoded targets, or class labels when `binary` is True
    classes    -- class names used to label the confusion-matrix plot
    batch_size -- batch size forwarded to `predict_classes`
    binary     -- skip the one-hot -> class-index conversion of `y_true`
    """
    # One-hot rows are reduced to their class index; binary labels are used as given
    labels = y_true if binary else np.argmax(y_true, axis=1)

    predicted = model.predict_classes(x, batch_size=batch_size)

    print("Accuracy : "+ str(accuracy_score(labels, predicted)))
    print("")

    print("Classification Report")
    print(classification_report(labels, predicted, digits=5))

    matrix = confusion_matrix(labels, predicted)
    print(matrix)
    plot_confusion_matrix(matrix, classes=classes)

#### Building the regressor

In [13]:
# Hyperparameter grid: each entry lists the values to try for that setting
parameters = dict(
    batch_size=[100],
    epochs=[500],
    optimizer=['adam'],
    activation=['tanh'],
)

In [14]:
# K-Fold
# 5-fold splitter, plus the run log (`history`) and the running fit counter
# (`trainNumber`) that the training cell fills in.
# NOTE(review): KFold is created without shuffling, so folds presumably follow
# the row order of the data — confirm that is intended.
kf = KFold(n_splits=5)
history = {}
trainNumber = 0

In [None]:
# THE TRAINING

def build_regressor(extra_hidden_layers, nodes, activation, optimizer, input_dim):
    """Return a freshly initialised, compiled dense regressor.

    The network has one input-connected Dense layer of `nodes` units,
    `extra_hidden_layers` further Dense layers of the same width, and a single
    output unit. All layers use `activation`; the loss is mean squared error.
    """
    network = Sequential()
    network.add(Dense(nodes, kernel_initializer='uniform', activation=activation, input_dim=input_dim))
    for _ in range(extra_hidden_layers):
        network.add(Dense(nodes, kernel_initializer='uniform', activation=activation))
    network.add(Dense(1, kernel_initializer='uniform', activation=activation))
    network.compile(loss='mean_squared_error', optimizer=optimizer)
    return network


#Logging using tensorboard
for batch_size in parameters['batch_size']:
    for epochs in parameters['epochs']:
        for optimizer in parameters['optimizer']:
            for activation in parameters['activation']:
                # i = number of additional hidden layers, k = units per hidden layer
                for i in range(0,5):
                    for k in range(5,51,5):
                        for train_index, test_index in kf.split(x):
                            xTrain, xTest = x[train_index], x[test_index]
                            yTrain, yTest = y[train_index], y[test_index]
                            print("Running iteration "+str(trainNumber+1))

                            # A new model per architecture AND per fold:
                            #  - building it once per `i` kept stacking the layers of every
                            #    width `k` (and their output layers) onto the same network;
                            #  - reusing one model across folds lets a fold validate on rows
                            #    the weights were already trained on in an earlier fold.
                            regressor = build_regressor(i, k, activation, optimizer, len(xOrig[0]))

                            tensorboard = TensorBoard(log_dir="logs/{}".format(time()))
                            history[trainNumber] = {}
                            # 'model' holds the object returned by fit(), i.e. the per-epoch curves
                            history[trainNumber]['model'] = regressor.fit(xTrain, yTrain, epochs = epochs, batch_size = batch_size, verbose=0, validation_data=(xTest,yTest), callbacks=[tensorboard])
                            history[trainNumber]['params'] = {
                                'batch_size': batch_size,
                                'epochs': epochs,
                                'optimizer': optimizer,
                                'activation': activation,
                                'hidden-layers': i,
                                'nodes': k
                            }
                            trainNumber+=1


In [None]:
# Dump the raw run log (one entry per fit, keyed by run number).
# NOTE(review): each entry stores the object returned by fit() under 'model',
# so this presumably prints object reprs rather than the metric values.
print(history)

### Growing model

In [None]:
##
## BEWARE: TAKES A LOT OF TIME TO RUN (~ Weeks)
##
# Grid search over optimizer x activation while growing the network one unit
# at a time (growth_index 0..149 -> up to three hidden layers of at most 50
# units). Every architecture is evaluated with 10-fold cross-validation and
# the accumulated metrics are rewritten to data.json after every fit.

from math import ceil
import json

from keras import backend as K

# Hyperparameters
parameters = {
    'batch_size': [100],
    'epochs': [500],
    'optimizer': ['adam', 'sgd'],
    'activation': ['linear', 'tanh', 'relu']
}

# History log
# history: batch_size -> epochs -> optimizer -> activation
#          -> "<layers>_<nodes>" -> run number -> {metric: [per-epoch values], 'params': {...}}
history = {}
trainNumber = 0
modelHistory = {}
model = ""


def createmodel(num_nodes, activation, optimizer, input_dim):
    """Build and compile the network for one step of the growth schedule.

    `num_nodes` is the 0-based growth step: every full block of 50 becomes a
    hidden layer of 50 units, followed by one layer of `num_nodes % 50 + 1`
    units, then a single output unit. The model is stored in the global
    `model` (as before) and also returned.
    """
    global model
    full_layers, remainder = divmod(num_nodes, 50)
    hidden_sizes = [50] * full_layers + [remainder + 1]

    model = Sequential()
    for position, size in enumerate(hidden_sizes):
        if position == 0:
            # Only the first layer declares the input width
            model.add(Dense(size, kernel_initializer='uniform', activation=activation, input_dim=input_dim))
        else:
            model.add(Dense(size, kernel_initializer='uniform', activation=activation))
    model.add(Dense(1, kernel_initializer='uniform', activation=activation))
    model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['mse', 'mae', 'mape', 'cosine'])
    return model


try:
    # THE TRAINING
    send_notification("Starting Training...")
    kf = KFold(n_splits=10)

    for batch_size in parameters['batch_size']:
        batch_log = history.setdefault(batch_size, {})

        for epochs in parameters['epochs']:
            epoch_log = batch_log.setdefault(epochs, {})

            for optimizer in parameters['optimizer']:
                optimizer_log = epoch_log.setdefault(optimizer, {})

                for activation in parameters['activation']:
                    activation_log = optimizer_log.setdefault(activation, {})

                    for growth_index in range(0,150):
                        layers = int(ceil((growth_index+1) / 50.0))
                        nodes = (growth_index%50)+1
                        layer_encoded = str(layers)+"_"+str(nodes)

                        # Looked up at the activation level, where these keys actually live
                        # (the former check tested the optimizer level and was always true).
                        layer_log = activation_log.setdefault(layer_encoded, {})

                        for train_index, test_index in kf.split(x):
                            xTrain, xTest = x[train_index], x[test_index]
                            yTrain, yTest = y[train_index], y[test_index]
                            print("Running iteration "+str(trainNumber+1))

                            # Fresh weights for every fold: reusing one model across folds
                            # lets a fold validate on rows it was already trained on.
                            # clear_session() drops the graphs of earlier fits so thousands
                            # of rebuilt models do not pile up in the backend; it also
                            # invalidates models built earlier in this kernel.
                            K.clear_session()
                            createmodel(growth_index, activation, optimizer, len(xOrig[0]))

                            # Logging - one TensorBoard directory per configuration and run.
                            # The activation segment is now labelled "activation_" (it used
                            # to be labelled "epoch_", followed by the batch size again).
                            LOG_DIR = "growthLogs/batch_"+str(batch_size)+"/epoch_"+str(epochs)+"/optimizer_"+str(optimizer)+"/activation_"+str(activation)+"/Layers_"+str(layers)+"_nodes_"+str(nodes)+"/{}"
                            tensorboard = TensorBoard(log_dir=LOG_DIR.format(time()))

                            modelHistory[trainNumber] = model.fit(xTrain, yTrain, epochs = epochs, batch_size = batch_size, verbose=0, validation_data=(xTest,yTest), callbacks=[tensorboard])

                            run_log = layer_log[trainNumber] = {}
                            for key, values in modelHistory[trainNumber].history.items():
                                # Plain floats keep the log serialisable by json.dump
                                run_log[key] = [float(value) for value in values]
                            run_log['params'] = {
                                'batch_size': batch_size,
                                'epochs': epochs,
                                'optimizer': optimizer,
                                'activation': activation,
                                'layers': layers,
                                'nodes': nodes
                            }

                            # Checkpoint the whole log after every fit so a crash loses nothing
                            try:
                                with open('data.json', 'w') as outfile:
                                    json.dump(history, outfile)
                            except Exception as e:
                                print(e)
                                print("File not saved")
                                raise
                            trainNumber+=1
                    send_notification("Training step "+str(trainNumber+1)+"...")
    send_notification("Training has completed...")
except Exception as e:
    send_notification("Training has crashed...")
    print(e)
    raise

#### Less Intensive Run

In [None]:
# Reduced version of the growth experiment: a single hidden layer of 1 to 5
# units, one optimizer/activation pair, 5-fold cross-validation. The
# accumulated metrics are rewritten to data.json after every fit.

from math import ceil
import json

# Hyperparameters
parameters = {
    'batch_size': [100],
    'epochs': [500],
    'optimizer': ['adam'],
    'activation': ['tanh']
}

# History log
# history: batch_size -> epochs -> optimizer -> activation
#          -> "1_<nodes>" -> run number -> {metric: [per-epoch values], 'params': {...}}
history = {}
trainNumber = 0
modelHistory = {}
model = ""


def createmodel(num_nodes, activation, optimizer, input_dim):
    """Build and compile a one-hidden-layer regressor.

    For `num_nodes` below 5 the hidden layer has `num_nodes + 1` units; for
    larger values no hidden layer is added (unchanged behaviour). The model is
    stored in the global `model` (as before) and also returned.
    """
    global model
    model = Sequential()
    # 1 layer with 1 to 5 nodes
    if num_nodes < 5:
        model.add(Dense(num_nodes+1, kernel_initializer='uniform', activation=activation, input_dim=input_dim))
    model.add(Dense(1, kernel_initializer='uniform', activation=activation))
    model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['mse', 'mae', 'mape', 'cosine'])
    return model


try:
    # THE TRAINING
    send_notification("Starting Training...")
    kf = KFold(n_splits=5)

    for batch_size in parameters['batch_size']:
        batch_log = history.setdefault(batch_size, {})

        for epochs in parameters['epochs']:
            epoch_log = batch_log.setdefault(epochs, {})

            for optimizer in parameters['optimizer']:
                optimizer_log = epoch_log.setdefault(optimizer, {})

                for activation in parameters['activation']:
                    activation_log = optimizer_log.setdefault(activation, {})

                    for growth_index in range(0,5):
                        nodes = growth_index+1
                        layer_encoded = "1_"+str(nodes)

                        # Looked up at the activation level, where these keys actually live
                        # (the former check tested the optimizer level and was always true).
                        layer_log = activation_log.setdefault(layer_encoded, {})

                        for train_index, test_index in kf.split(x):
                            xTrain, xTest = x[train_index], x[test_index]
                            yTrain, yTest = y[train_index], y[test_index]
                            print("Running iteration "+str(trainNumber+1))

                            # Fresh weights for every fold: reusing one model across folds
                            # lets a fold validate on rows it was already trained on.
                            createmodel(growth_index, activation, optimizer, len(xOrig[0]))

                            # Logging - one TensorBoard directory per configuration and run.
                            # The activation segment is now labelled "activation_" (it used
                            # to be labelled "epoch_", followed by the batch size again).
                            LOG_DIR = "growthLogs_low/batch_"+str(batch_size)+"/epoch_"+str(epochs)+"/optimizer_"+str(optimizer)+"/activation_"+str(activation)+"/Layers_1_nodes_"+str(nodes)+"/{}"
                            tensorboard = TensorBoard(log_dir=LOG_DIR.format(time()))

                            modelHistory[trainNumber] = model.fit(xTrain, yTrain, epochs = epochs, batch_size = batch_size, verbose=0, validation_data=(xTest,yTest), callbacks=[tensorboard])

                            run_log = layer_log[trainNumber] = {}
                            for key, values in modelHistory[trainNumber].history.items():
                                # Plain floats keep the log serialisable by json.dump
                                run_log[key] = [float(value) for value in values]
                            run_log['params'] = {
                                'batch_size': batch_size,
                                'epochs': epochs,
                                'optimizer': optimizer,
                                'activation': activation,
                                'layers': 1,
                                'nodes': nodes
                            }

                            # Checkpoint the whole log after every fit so a crash loses nothing
                            try:
                                with open('data.json', 'w') as outfile:
                                    json.dump(history, outfile)
                            except Exception as e:
                                print(e)
                                print("File not saved")
                                raise
                            trainNumber+=1
                    send_notification("Training step "+str(trainNumber+1)+"...")
    send_notification("Training has completed...")
except Exception as e:
    send_notification("Training has crashed...")
    print(e)
    raise

Running iteration 1
Running iteration 2
Running iteration 3
Running iteration 4


In [None]:
# Dump the complete nested run log; the same structure is written to
# data.json by the training cell after every fit.
print(history)

## Model 2: NN with radial basis neurons

#### Radial Basis Neuron

In [350]:
from keras import backend as K
from keras.layers import Layer
import tensorflow as tf

class Radial1DTransformationLayer(Layer):
    """Element-wise radial-basis activation with a single trainable weight.

    With w the layer's (1, 1) kernel and t = x**2 * w**2, every input element
    x is mapped through the function selected by `radial_funct`:

        'fn1': exp(-t)
        'fn2': sqrt(1 + t)
        'fn3': 1 / (1 + t)
        'fn4': 1 / sqrt(1 + t)

    Parameters
    ----------
    output_dim
        Stored on the layer for backward compatibility. The transformation is
        element-wise, so the output always has the shape of the input.
    radial_funct : str
        One of 'fn1', 'fn2', 'fn3', 'fn4'.
    **kwargs
        Forwarded to `keras.layers.Layer`.

    Raises
    ------
    ValueError
        If `radial_funct` is not one of the supported names.
    """

    def __init__(self, output_dim, radial_funct, **kwargs):
        self.output_dim = output_dim
        self.radial_funct = radial_funct
        # x is the layer input, y the (1, 1) kernel; the scalar 1.0 broadcasts
        # exactly like the former tf.ones([1, 1]) tensors.
        self.basis_functions = {
            'fn1': lambda x,y : tf.exp(-tf.multiply(tf.square(x),tf.square(y))),
            'fn2': lambda x,y : tf.sqrt(1.0 + tf.multiply(tf.square(x),tf.square(y))),
            'fn3': lambda x,y : tf.divide(1.0, 1.0 + tf.multiply(tf.square(x),tf.square(y))),
            'fn4': lambda x,y : tf.divide(1.0, tf.sqrt(1.0 + tf.multiply(tf.square(x),tf.square(y))))
        }
        # Fail at construction time instead of with a KeyError inside call()
        if radial_funct not in self.basis_functions:
            raise ValueError("radial_funct must be one of "+str(sorted(self.basis_functions))+", got "+repr(radial_funct))
        super(Radial1DTransformationLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        # A single scalar weight shared by every input element
        self.kernel = self.add_weight(name='kernel', shape=(1,1), initializer='uniform', trainable=True)
        super(Radial1DTransformationLayer, self).build(input_shape)

    def call(self, x):
        return self.basis_functions[self.radial_funct](x, self.kernel)

    def compute_output_shape(self, input_shape):
        # Element-wise op: reporting (batch, output_dim) was wrong whenever
        # output_dim differed from the input width (e.g. a tuple).
        return input_shape

    def get_config(self):
        # Needed so a model containing this layer can be saved and re-created
        config = super(Radial1DTransformationLayer, self).get_config()
        config.update({'output_dim': self.output_dim, 'radial_funct': self.radial_funct})
        return config

##### Neuron Unit Test

In [333]:
from keras.layers import Input
from keras.models import Model

# One sample with five features, fed as a NumPy batch of shape (1, 5)
myinp = np.array([[1., 2., 3., 4., 5.]], dtype='float32')
inp = Input(shape=(myinp.shape[1],))
rbf_layer = Radial1DTransformationLayer(output_dim=myinp.shape[1], radial_funct='fn1')
out = rbf_layer(inp)
# Keras 2 spells these keyword arguments `inputs` / `outputs`
model = Model(inputs=inp, outputs=out)

output = model.predict(myinp)
print(output)

# Actually check the result: 'fn1' must equal exp(-(x^2 * w^2)) for the
# layer's current (1, 1) weight w.
kernel_value = rbf_layer.get_weights()[0]
expected = np.exp(-np.square(myinp) * np.square(kernel_value))
np.testing.assert_allclose(output, expected, rtol=1e-4)
assert output.shape == myinp.shape