In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import r2_score, explained_variance_score

from itertools import product

#import os
#import seaborn as sns
#import matplotlib.pyplot as plt
#%matplotlib inline

# Read Dataset

In [2]:
cmp = pd.read_csv("commViolUnnormData.txt", na_values='?')

In [3]:
# drop first non predictive features (communityname, state, countyCode, communityCode, "fold")
pred_features = cmp[cmp.columns[5:-18]]
regr_values = cmp[cmp.columns[-18:]]

# Drop features with a lot of missing values

In [4]:
print("Before dropping: {} features".format(str(pred_features.shape[1])))

#drop features that contain at least some threshold (from the total) of NaN values
cut_tresh = 0.75
to_drop = pred_features.columns[pred_features.count() < pred_features.shape[0]*cut_tresh]

pred_features = pred_features.drop(columns=to_drop)

print("After dropping: {} features".format(str(pred_features.shape[1])))

Before dropping: 124 features
After dropping: 102 features


# Imputing on features matrix

In [5]:
from collections import Counter

def value_withStrategy(v, strat):
    if strat == "mean":
        return np.mean(v)
    if strat == "median":
        return np.median(v)
    if strat == "most_frequent":
        return Counter(v).most_common(1)[0][0]
    print("Invalid imputing strategy!")
        
def imputing(df, strategy):
    nanRows, nanCols = np.where(df.isna())
    for j in nanCols:
        available = df.iloc[~nanRows, j]
        value = value_withStrategy(available, strategy)
        df.iloc[nanRows,j] = value

In [6]:
imputing(pred_features, "mean");

# Choose the Dependent Variable and drop possible missing values on it

In [7]:
def drop_sample(df, vals):
    idxRow = np.where(vals.isna())[0]
    return df.drop(index=idxRow).values, vals.drop(index=idxRow).values.reshape(-1,1)

In [8]:
data,values = drop_sample(pred_features, regr_values["robbPerPop"])

# Normalisation

In [9]:
def normalise(matrix, strat):
    for j in range(matrix.shape[1]):
        mi = np.min(matrix[:,j])
        ma = np.max(matrix[:,j])
        di = ma-mi
        if (di > 1e-6):
            if strat=="0_mean,1_std":
                matrix[:,j] = (matrix[:,j]-np.mean(matrix[:,j]))/np.std(matrix[:,j])
            elif strat=="[0,1]":
                matrix[:,j] = (matrix[:,j]-mi)/di
            elif strat=="[-1,1]":
                matrix[:,j] = 2*((matrix[:,j]-mi)/di)-1
            else:
                print("Invalid normalisation strategy!")
        else:
            matrix[:,j] = 0

In [10]:
strategy = "[-1,1]"
normalise(data,strategy)
normalise(values,strategy)

# Train-Test Split

In [11]:
n = data.shape[0]

trVl_Amount = int(n*0.7)
indexes = np.random.permutation(n)
idxTrVl = np.sort(indexes[0:trVl_Amount])
idxTs = np.sort(indexes[trVl_Amount:])

trainVal_data = data[idxTrVl]
test_data = data[idxTs]
trainVal_values = values[idxTrVl]
test_values = values[idxTs]

# Variable Selection

## 0. K-fold Cross Validation

In [12]:
from itertools import product

def kFold_crossValidation_selectionGrid(k, parameters_dict, train_data, train_values, predictor):
    nVal = train_data.shape[0]
    
    # Validation indexes adjustment
    elemPerFold, remainder = np.divmod(nVal,k)
    valIdxList = []
    start = 0

    for i in range(k):
        end = start+elemPerFold+int(remainder>0)
        valIdxList.append(np.arange(start,end)) 
        remainder -= 1
        start = end
    
    # Cross validation
    params_names = parameters_dict.keys()
    params_product = list(product(*parameters_dict.values()))
    val_results = np.empty((len(valIdxList),len(params_product)))
    
    for row, valIdx in enumerate(valIdxList):
        print("#{} fold:".format(row+1))
        for col, params in enumerate(params_product):
            update = col*100/len(params_product)
            print("\t["+"#"*(int(update/5))+" "*(int((100-update)/5))+"] {}%".format(update))
                     
            arg_dict = {k:v for k,v in zip(params_names,params)}
            
            predictor.fit(train_data[~valIdx], train_values[~valIdx], **arg_dict)
            pred = predictor.predict(train_data[valIdx])
            
            #val_results[row,col] = r2_score(trainVal_values[valIdx],pred)
            #val_results[row,col] = explained_variance_score(trainVal_values[valIdx],pred)
            val_results[row,col] = np.mean(np.square(train_values[valIdx]-pred))
            
    selected = np.argmin(val_results.mean(axis=0))
    return params_product[selected]

## 1. Matching Pursuit

### Project class definition

In [13]:
class matchingPursuit:
    def __init__(self, iterations, weights = None, indexes = None):
        self.iterations = iterations
        self.weights = weights
        self.indexes = indexes
        
    def fit(self, data_matrix, output_vect):
        residual = output_vect.copy()
        self.weights = np.zeros((data_matrix.shape[1], 1))
        self.indexes = []

        #data_2norm = np.sqrt(np.sum(np.square(data_matrix), axis=0))
        data_2norm = np.linalg.norm(data_matrix, ord=2, axis=0).reshape(1,-1)

        for i in range(self.iterations):
            projection = np.matmul(residual.T, data_matrix)
            k = np.argmax(np.divide(np.square(projection), data_2norm))
            self.indexes.append(k)

            distance = projection[0,k]/np.linalg.norm(data_matrix[:,k], ord=2)
            self.weights[k,0] += distance
            residual -= np.matmul(data_matrix, self.weights)

        return self
    
    def predict(self, X):
        if self.weights is None:
            raise Exception("weights not initialised! need to first fit the model")
        return np.matmul(X, self.weights)

In [14]:
mp = matchingPursuit(iterations=10)
mp.fit(trainVal_data, trainVal_values)
np.where(mp.weights)[0]

array([92])

In [15]:
pred = mp.predict(test_data)
print("Mean Square Error: {}".format(np.mean(np.square(test_values-pred))))
print("R^2 score: {}".format(r2_score(test_values, pred)))
print("Explained Variance Score: {}".format(explained_variance_score(test_values, pred)))

Mean Square Error: 2.162310179537521e+31
R^2 score: -5.9320785418722686e+32
Explained Variance Score: -4.0727187383408635e+30


### SkLearn Class

In [16]:
from sklearn.linear_model import orthogonal_mp
omp_coef = orthogonal_mp(trainVal_data, trainVal_values)
np.where(omp_coef)[0]

array([ 3, 11, 34, 38, 50, 67, 76, 92, 93, 94])

In [17]:
pred = np.matmul(test_data, omp_coef)
print("Mean Square Error: {}".format(np.mean(np.square(test_values-pred))))
print("R^2 score: {}".format(r2_score(test_values, pred)))
print("Explained Variance Score: {}".format(explained_variance_score(test_values, pred)))

Mean Square Error: 0.07198140921010004
R^2 score: 0.6735424365120475
Explained Variance Score: 0.6774951217473056


## 2. L1 Penalty (Lasso)

### Project class definition

In [18]:
class lasso_regression:
    def __init__(self, iterations, weights=None):
        self.iterations = iterations
        self.weights = weights
        
    def fit(self, data_matrix, output_vect, _lambda):
        self.weights = np.zeros((data_matrix.shape[1],1))
        n = float(data_matrix.shape[0])
        step = n/(2*np.linalg.norm(np.matmul(data_matrix.T, data_matrix), ord=2))
        softTresh = step*_lambda

        for i in range(self.iterations):
            dist = np.matmul(data_matrix, self.weights) - output_vect
            coord_descent = (step/n)*np.matmul(data_matrix.T, dist)
            self.weights -= coord_descent

            upper = self.weights > softTresh
            lower = self.weights < -softTresh

            self.weights[upper] -= softTresh
            self.weights[lower] += softTresh
            self.weights[~upper & ~lower] = 0

        return self
    
    def predict(self, X):
        if self.weights is None:
            raise Exception("weights not initialised! need to first fit the model")
        return np.matmul(X, self.weights)

In [19]:
lr = lasso_regression(iterations=10)
lr.fit(trainVal_data, trainVal_values, 0.8)
np.where(lr.weights)[0]

array([  0,  10,  27,  49,  51,  71,  91,  92,  98, 101])

In [20]:
pred = lr.predict(test_data)
print("Mean Square Error: {}".format(np.mean(np.square(test_values-pred))))
print("R^2 score: {}".format(r2_score(test_values, pred)))
print("Explained Variance Score: {}".format(explained_variance_score(test_values, pred)))

Mean Square Error: 0.7113568063332496
R^2 score: -18.515352082219774
Explained Variance Score: 0.012067232341596346


### SkLearn Class

In [21]:
from sklearn.linear_model import Lasso

lasso = Lasso(alpha=0.005)
lasso.fit(trainVal_data, trainVal_values)
np.where(lasso.coef_)[0]

array([  3,  11,  38,  44,  50,  76,  93,  94, 100])

In [22]:
pred = lasso.predict(test_data)
print("Mean Square Error: {}".format(np.mean(np.square(test_values-pred))))
print("R^2 score: {}".format(r2_score(test_values, pred)))
print("Explained Variance Score: {}".format(explained_variance_score(test_values, pred)))

Mean Square Error: 0.061549729003930895
R^2 score: 0.6735594634906952
Explained Variance Score: 0.6753372729897191


## 3. Random Forest

### Decision Tree project class definition

In [23]:
class NumericalDecisionTree_regressor:
    class Node:
        def __init__(self, value, isLeaf=False, feature=None, left=None, right=None):
            self.value = value
            self.isLeaf = isLeaf
            self.feature = feature
            self.left = left
            self.right = right

        def print_tree(self):
            if self.left: self.left.print_tree()
            print("Feature: {}, cut: {}\n".format(self.feature, self.value))
            if self.right: self.right.print_tree()

        def print_tree_indented(self, level=0):
            if self.right: self.right.print_tree_indented(level+1)
            print("|    "*level+"{} => {}".format(self.feature, self.value))
            if self.left: self.left.print_tree_indented(level+1)
            
    def __init__(self):
        self.root = None
        
    def fit(self, X, y, depth, minElem_perLeaf):
        self.root = self.learn(X, y, depth, minElem_perLeaf)
        return self
        
    def learn(self, X, y, depth, minElem_perLeaf):
        n, d = X.shape

        if depth==0 or n<=minElem_perLeaf: #or other condition
            return self.Node(value=np.mean(y), isLeaf=True)
            
        best_costDescent = 0 #split that maximise the error descent

        for i1 in range(d):
            sorted_idx = np.argsort(X[:,i1])
            sorted_x, sorted_y = X[sorted_idx, i1], y[sorted_idx]

            s_right, s_left = np.sum(sorted_y), 0
            n_right, n_left = n, 0

            for i2 in range(n-1):
                s_left += sorted_y[i2]
                s_right -= sorted_y[i2]
                n_left += 1
                n_right -= 1

                if sorted_x[i2]<sorted_x[i2+1]:
                    new_costDescent = (s_left**2)/n_left + (s_right**2)/n_right
                    if new_costDescent > best_costDescent:
                        best_costDescent = new_costDescent
                        best_feature = i1
                        best_cut = (sorted_x[i2]+sorted_x[i2+1])/2

        left_idxs = X[:,best_feature] < best_cut

        return self.Node(value=best_cut, feature=best_feature,
                        left=self.learn(X[left_idxs],y[left_idxs],depth-1,minElem_perLeaf),
                        right=self.learn(X[~left_idxs],y[~left_idxs],depth-1,minElem_perLeaf))
    
    def predict(self, X):
        if self.root is None:
            raise Exception("Tree not initialised! need to first fit the model")

        n = X.shape[0]
        y = np.empty(n)
        
        for i in range(n):
            current = self.root
            while not current.isLeaf:
                if X[i,current.feature] < current.value:
                    current = current.left
                else:
                    current = current.right
                
            y[i] = current.value
        
        return y
                
    def pprint(self):
        self.root.print_tree_indented()

In [24]:
ndt = NumericalDecisionTree_regressor()
ndt.fit(trainVal_data, trainVal_values, depth=5, minElem_perLeaf=10)
ndt.pprint()

|    |    |    None => 0.5100181968349873
|    |    100 => -0.33296521258972944
|    |    |    |    |    None => -0.19629075087463083
|    |    |    |    4 => -0.979453247431656
|    |    |    |    |    None => -0.6485360822920946
|    |    |    21 => -0.7930354381646628
|    |    |    |    None => 0.18936986833794878
|    50 => -0.04936014625228524
|    |    |    |    |    None => -0.28642691796750974
|    |    |    |    1 => -0.35326086956521746
|    |    |    |    |    None => 0.055184993794525926
|    |    |    89 => 0.12834224598930477
|    |    |    |    |    None => -0.3865909741146588
|    |    |    |    38 => 0.08746518105849588
|    |    |    |    |    None => -0.6006946008695025
|    |    50 => -0.47020109689213896
|    |    |    |    |    None => 0.011381855282161402
|    |    |    |    51 => -0.7704501056454165
|    |    |    |    |    None => -0.6379556784124296
|    |    |    50 => -0.6840950639853748
|    |    |    |    |    None => -0.6800242624466498
|    |    |    | 

In [25]:
pred = ndt.predict(test_data)
print("Mean Square Error: {}".format(np.mean(np.square(test_values-pred))))
print("R^2 score: {}".format(r2_score(test_values, pred)))
print("Explained Variance Score: {}".format(explained_variance_score(test_values, pred)))

Mean Square Error: 0.06835981935260961
R^2 score: 0.5773903356378863
Explained Variance Score: 0.5782774284717946


### Decision Tree SkLearn Class

In [26]:
from sklearn.tree import DecisionTreeRegressor

dtr = DecisionTreeRegressor()
dtr.fit(trainVal_data, trainVal_values)
np.flip(np.argsort(dtr.feature_importances_))

array([ 49,  50, 100,  69,  89,  93,  41,   3,  20,  99,   2,  38,  75,
        48,  40,   8,  11,  67,   1,  24,  18,  52,  10,  44,  63,  19,
        36,  29,  60,  96,  56,  30,  73,  79,  77,  65,  37,  31,   6,
         9,  14,  12,  94,  35,  51,  55,  26,  95,  43,  34,  64,  88,
        47,  13,  16,  98,  72,  78,  54,   0,  62,  39,  80,  25,  74,
         7,  57,   5,  91,  17,  97,  87, 101,  23,  21,  68,  61,  15,
        86,  46,  45,  82,  81,   4,  92,  85,  22,  90,  27,  53,  58,
        76,  71,  33,  59,  42,  84,  83,  28,  32,  66,  70])

In [27]:
pred = dtr.predict(test_data)
print("Mean Square Error: {}".format(np.mean(np.square(test_values-pred))))
print("R^2 score: {}".format(r2_score(test_values, pred)))
print("Explained Variance Score: {}".format(explained_variance_score(test_values, pred)))

Mean Square Error: 0.07996859444280134
R^2 score: 0.4160187815952242
Explained Variance Score: 0.4172435215571928


### Random Forest project class definition

In [51]:
#https://stackoverflow.com/questions/18541923/what-is-out-of-bag-error-in-random-forests

class NumericalRandomForest_regressor:
    def __init__(self, n_trees):
        self.n_trees = n_trees
        self.trees = []
        self.boot_samplesIdxs = []
        self.oob_error = None
        
    def fit(self, X, y, depths, minElems_perLeaf):
        n, d = X.shape
        n_learn = int(n/3)
        
        val_folds = 3
        params_dict = {"depth":depths, "minElem_perLeaf":minElems_perLeaf}
        
        for i in range(self.n_trees):
            print("Fitting #{} tree".format(i+1))
            
            bootstrap_idxs = np.sort(np.random.permutation(n)[:n_learn])
            self.boot_samplesIdxs.append(bootstrap_idxs)
                        
            dt = NumericalDecisionTree_regressor()            
            win_params = kFold_crossValidation_selectionGrid(val_folds, params_dict, 
                                                             X[~bootstrap_idxs], y[~bootstrap_idxs], dt)
            self.trees.append(dt.fit(X[~bootstrap_idxs], y[~bootstrap_idxs],
                                     depth=win_params[0], minElem_perLeaf=win_params[1]))
        
        
        oob_errors = []
        for sampleIdx in range(n):
            missingBoot_TreesIdx = [idx for idx,bootstrap_idxs in enumerate(self.boot_samplesIdxs) 
                                    if sampleIdx not in bootstrap_idxs]

            if len(missingBoot_TreesIdx)==0: continue
            
            regr_results = np.empty(len(missingBoot_TreesIdx))
            for i, missing_tree in enumerate(missingBoot_TreesIdx):
                regr_results[i] = self.trees[missing_tree].predict(X[sampleIdx].reshape(1,-1))
                
            oob_errors.append(np.mean(np.square(np.mean(regr_results)-y[sampleIdx])))
            #oob_errors.append(r2_score(np.mean(regr_results),y[sampleIdx]))
            #oob_errors.append(explained_variance_score(np.mean(regr_results),y[sampleIdx]))
            
        self.oob_error = np.mean(oob_errors)
        return self
            
        
    def predict(self,X):
        if len(self.trees)==0:
            raise Exception("trees not initialised! need to first fit the model")

        n = X.shape[0]
        results = np.empty((self.n_trees,n))
        for row, tree in enumerate(self.trees):
            results[row] = tree.predict(X)
            
        return np.mean(results,axis=0)
        

In [52]:
nrf = NumericalRandomForest_regressor(3)
# no train and test, cause it's a forest
nrf.fit(data,values,depths=[10],minElems_perLeaf=[20]);

Fitting #1 tree
#1 fold:
	[                    ] 0.0%
#2 fold:
	[                    ] 0.0%
#3 fold:
	[                    ] 0.0%
Fitting #2 tree
#1 fold:
	[                    ] 0.0%
#2 fold:
	[                    ] 0.0%
#3 fold:
	[                    ] 0.0%
Fitting #3 tree
#1 fold:
	[                    ] 0.0%
#2 fold:
	[                    ] 0.0%
#3 fold:
	[                    ] 0.0%


In [42]:
nrf.oob_error

0.01237872915832776

In [43]:
pred = nrf.predict(test_data)
print("Mean Square Error: {}".format(np.mean(np.square(test_values-pred))))
print("R^2 score: {}".format(r2_score(test_values, pred)))
print("Explained Variance Score: {}".format(explained_variance_score(test_values, pred)))

Mean Square Error: 0.07093161098240276
R^2 score: 0.7753652339689243
Explained Variance Score: 0.7777124433711212


### Random Forest SkLearn class

In [44]:
from sklearn.ensemble import RandomForestRegressor

rfr = RandomForestRegressor()
rfr.fit(trainVal_data, trainVal_values.ravel())
np.flip(np.argsort(rfr.feature_importances_))



array([ 50,  49, 100,   3,  51,  99,  91,  93,   2,  92,  78,  41,  68,
        69,  40,  38,  10,  44,  29,  17,  87,  57,  66,  94,  46,   4,
        15,  71,  14,  80,  98,  75,   1,  81,   6,  89,  43,  24,  67,
        65,  11,  20,  34, 101,  74,  72,  55,  58,  76,  16,   0,  95,
         8,  88,  62,  54,  23,  12,  47,  61,  39,  35,  82,  73,  21,
        96,  59,  18,  86,  36,  90,   7,  30,  45,  64,  31,  22,  60,
        28,  84,  26,  33,  37,   5,  32,  27,  25,  97,  13,  48,  63,
         9,  79,  77,  56,  19,  52,  53,  42,  85,  83,  70])

In [None]:
pred = rfr.predict(test_data)
print("Mean Square Error: {}".format(np.mean(np.square(test_values-pred))))
print("R^2 score: {}".format(r2_score(test_values, pred)))
print("Explained Variance Score: {}".format(explained_variance_score(test_values, pred)))

# Predictors

## 1. Regularised Least Squares
   

In [None]:
class tikhonov_leastSquares:
    def __init__(self, weights = None):
        self.weights = weights
        
    def fit(self, X, y, _lambda):
        inv = np.linalg.inv(np.matmul(X.T, X) + _lambda*np.eye(X.shape[1]))
        self.weights = np.matmul(inv, np.matmul(X.T, y))
        return self
    
    def predict(self, X):
        if self.weights is None:
            raise Exception("weights not initialised! need to first fit the model")
        return np.matmul(X, self.weights)

In [None]:
k = 5
params_dict = {"_lambda":[2,2.05,2.1,2.2,3]}

tls = tikhonov_leastSquares()

win_regulariser = kFold_crossValidation_selectionGrid(k, params_dict, trainVal_data, trainVal_values, tls)
tls.fit(trainVal_data, trainVal_values, win_regulariser)
pred = tls.predict(test_data)

print("Mean Square Error: {}".format(np.mean(np.square(test_values-pred))))
print("R^2 score: {}".format(r2_score(test_values, pred)))
print("Explained Variance Score: {}".format(explained_variance_score(test_values, pred)))

## 2. Random Forest

In [None]:
k = 3 
params_dict = {"depth":[10,15,20,30],"minElem_perLeaf":[5,10,20,30]}

dt = NumericalDecisionTree_regressor()
win_params = kFold_crossValidation_selectionGrid(k, params_dict, trainVal_data, trainVal_values, dt)
dt.fit(trainVal_data, trainVal_values, depth=win_params[0], minElem_perLeaf=win_params[1])
pred = dt.predict(test_data)
print("Mean Square Error: {}".format(np.mean(np.square(test_values-pred))))
print("R^2 score: {}".format(r2_score(test_values, pred)))
print("Explained Variance Score: {}".format(explained_variance_score(test_values, pred)))