In [1]:
import numpy as np
import pandas as pd

# cross validation purposes: create the cartesian product between the chosen values sets
from itertools import product 

#import os
#import seaborn as sns
#import matplotlib.pyplot as plt
#%matplotlib inline

# Read Dataset

In [2]:
cmp = pd.read_csv("commViolUnnormData.txt", na_values='?')

In [3]:
# drop first non predictive features (communityname, state, countyCode, communityCode, "fold")
pred_features = cmp[cmp.columns[5:-18]]
regr_values = cmp[cmp.columns[-18:]]

# Drop features with a lot of missing values

In [4]:
print("Before dropping: {} features".format(str(pred_features.shape[1])))

#drop features that contain at least some threshold (from the total) of NaN values
cut_tresh = 0.75
to_drop = pred_features.columns[pred_features.count() < pred_features.shape[0]*cut_tresh]

pred_features = pred_features.drop(columns=to_drop)

print("After dropping: {} features".format(str(pred_features.shape[1])))

Before dropping: 124 features
After dropping: 102 features


# Imputing on features matrix

In [5]:
from collections import Counter

def value_withStrategy(v, strat):
    if strat == "mean":
        return np.mean(v)
    if strat == "median":
        return np.median(v)
    if strat == "most_frequent":
        return Counter(v).most_common(1)[0][0]
    print("Invalid imputing strategy!")
        
def imputing(df, strategy):
    # for each column that contain at least 1 NaN value...
    for nanCol in np.unique(np.where(pred_features.isna())[1]):
        nanRows = np.where(pred_features.iloc[:,nanCol].isna())[0] #find NaN rows for the current column
        available = df.iloc[~nanRows, nanCol]
        value = value_withStrategy(available, strategy) #compute the filling value
        df.iloc[nanRows, nanCol] = value

In [6]:
imputing(pred_features, "mean");

# Outliers Detection

-- TBD <br>
A thourough study from scratch of outliers detection is needed here, but for now it feels like it exceeds the course final project.

# Choose the Dependent Variable and drop possible missing values rows on it

In [7]:
def drop_naSample(df, vals):
    idxRow = np.where(vals.isna())[0]
    return df.drop(index=idxRow).values, vals.drop(index=idxRow).values.reshape(-1,1)

In [8]:
dep_var = "robbPerPop"
data,values = drop_naSample(pred_features, regr_values[dep_var])

# Normalisation

In [9]:
def normalise(matrix, strat):
    for j in range(matrix.shape[1]):
        mi = np.min(matrix[:,j])
        ma = np.max(matrix[:,j])
        di = ma-mi
        if (di > 1e-6):
            if strat=="0_mean,1_std":
                matrix[:,j] = (matrix[:,j]-np.mean(matrix[:,j]))/np.std(matrix[:,j])
            elif strat=="[0,1]":
                matrix[:,j] = (matrix[:,j]-mi)/di
            elif strat=="[-1,1]":
                matrix[:,j] = 2*((matrix[:,j]-mi)/di)-1
            else:
                print("Invalid normalisation strategy!")
        else:
            matrix[:,j] = 0

In [10]:
strategy = "[-1,1]"
normalise(data,strategy)
normalise(values,strategy)

# Train-Test Split

In [11]:
def trainTest_split(in_matrix, out_vect, train_amount=0.7):
    n,_ = in_matrix.shape

    trVl_Amount = int(n*train_amount) #training-validation amount
    indexes = np.random.permutation(n)
    idxTrVl = np.sort(indexes[0:trVl_Amount])
    idxTs = np.sort(indexes[trVl_Amount:])

    return in_matrix[idxTrVl], in_matrix[idxTs], out_vect[idxTrVl], out_vect[idxTs]

In [12]:
trainVal_data, test_data, trainVal_values, test_values = trainTest_split(data, values, train_amount=0.7)

# Evaluation Metrics

In [13]:
class Regression_evaluationMetric:
    def __init__(self, true, predicted):
        self.true = true.flatten()
        self.predicted = predicted.flatten()
        self.residuals = self.true-self.predicted
    
    def meanSquareError(self):
        return np.mean(np.square(self.residuals))
    
    def rootMeanSquareError(self):
        return np.sqrt(np.mean(np.square(self.residuals)))
    
    def meanAbsoluteError(self):
        return np.mean(np.abs(self.residuals))
    
    def rSquared(self):
        ss_residual = np.sum(np.square(self.residuals))
        ss_total = np.sum(np.square(self.true-np.mean(self.true)))        
        return 1 - ss_residual/ss_total
    
    def adjusted_rSquared(self, p):
        n = self.true.shape[0]
        return 1-(1-self.rSquared)*((n-1)/(n-p-1))


# Variable Selection

## 0. K-fold Cross Validation

In [14]:
from itertools import product

def kFold_crossValidation_selectionGrid(k, parameters_dict, train_data, train_values, predictor, verbose=False):
    nVal,_ = train_data.shape
    
    # Validation indexes adjustment -------------------------------
    elemPerFold, remainder = np.divmod(nVal,k) #the remainder will be distributed across the firsts folds
    valIdxList = []
    start = 0

    # in each fold put as many samples as the division quotient +1 if the remainder is still positive
    # then decrease the division remainder by 1
    for i in range(k): 
        end = start+elemPerFold+int(remainder>0)
        valIdxList.append(np.arange(start,end)) 
        remainder -= 1
        start = end
    
    # Cross validation --------------------------------------------
    params_names = parameters_dict.keys()
    params_product = list(product(*parameters_dict.values())) # build all the hyp-par combination
    val_results = np.empty((len(valIdxList),len(params_product)))
    
    for row, valIdx in enumerate(valIdxList): # for each fold
        if verbose: print("#{} fold:".format(row+1))
        for col, params in enumerate(params_product):
            
            if verbose:
                update = col*100/len(params_product) # just print completion rate
                print("\t["+"#"*(int(update/5))+" "*(int((100-update)/5))+"] {}%".format(update))
                     
            arg_dict = {k:v for k,v in zip(params_names,params)} # {argument_name:argument_value, ... }
            
            
            predictor.fit(train_data[~valIdx], train_values[~valIdx], **arg_dict)
            pred = predictor.predict(train_data[valIdx])
            
            rem = Regression_evaluationMetric(trainVal_values[valIdx], pred)
            #val_results[row,col] = rem.rSquared()
            val_results[row,col] = rem.rootMeanSquareError()
            
    selected = np.argmin(val_results.mean(axis=0))
    return params_product[selected]

## 1. Matching Pursuit

### Project class definition

In [15]:
class matchingPursuit:
    def __init__(self, iterations, weights = None, indexes = None):
        self.iterations = iterations
        self.weights = weights
        self.indexes = indexes
        
    def fit(self, data_matrix, output_vect):
        residual = output_vect.copy()
        self.weights = np.zeros((data_matrix.shape[1], 1))
        self.indexes = []

        #data_2norm = np.sqrt(np.sum(np.square(data_matrix), axis=0))
        data_2norm = np.linalg.norm(data_matrix, ord=2, axis=0).reshape(1,-1)

        for i in range(self.iterations):
            
            # project each column on the current residuals
            projection = np.matmul(residual.T, data_matrix)
            # find the most correlated variable
            k = np.argmax(np.divide(np.square(projection), data_2norm))
            self.indexes.append(k)
            
            distance = projection[0,k]/np.linalg.norm(data_matrix[:,k], ord=2)
            self.weights[k,0] += distance # update the solution vector: canonical basis over the found column
            residual -= np.matmul(data_matrix, self.weights) # update the residual

        return self
    
    def predict(self, X):
        if self.weights is None:
            raise Exception("weights not initialised! need to first fit the model")
        return np.matmul(X, self.weights)

In [16]:
mp = matchingPursuit(iterations=10)
mp.fit(trainVal_data, trainVal_values)
np.where(mp.weights)[0]

array([49, 92])

In [17]:
pred = mp.predict(test_data)
rem = Regression_evaluationMetric(test_values, pred)

print("Residual variance: {}".format(np.var(test_values-pred)))

print("Root Mean Square Error: {}".format(rem.rootMeanSquareError()))
print("R^2 score: {}".format(rem.rSquared()))

Residual variance: 1.3161066161343946e+29
Root Mean Square Error: 4576011566576380.0
R^2 score: -7.51760865856762e+32


### SkLearn Class

In [18]:
from sklearn.linear_model import orthogonal_mp
omp_coef = orthogonal_mp(trainVal_data, trainVal_values)
np.where(omp_coef)[0]

array([ 3, 11, 34, 38, 50, 76, 77, 92, 94, 96])

In [19]:
pred = np.matmul(test_data, omp_coef)
rem = Regression_evaluationMetric(test_values, pred)

print("Residual variance: {}".format(np.var(test_values-pred)))

print("Root Mean Square Error: {}".format(rem.rootMeanSquareError()))
print("R^2 score: {}".format(rem.rSquared()))

Residual variance: 0.057784752532006364
Root Mean Square Error: 0.10298488563047162
R^2 score: 0.6192389909724942


## 2. L1 Penalty (Lasso)

### Project class definition

In [20]:
class lasso_regression: # Iterative Soft Thresholding Algorithm (Proximal Gradient)
    def __init__(self, iterations, weights=None):
        self.iterations = iterations
        self.weights = weights
        
    def fit(self, data_matrix, output_vect, _lambda):
        self.weights = np.zeros((data_matrix.shape[1],1))
        n = data_matrix.shape[0]
        # convergence step-size: n/(2*||X^t*X||_2)
        step = n/(2*np.linalg.norm(np.matmul(data_matrix.T, data_matrix), ord=2))
        softTresh = step*_lambda

        for i in range(self.iterations):
            # gradient step of the lasso formulation
            dist = np.matmul(data_matrix, self.weights) - output_vect
            coord_descent = (step/n)*np.matmul(data_matrix.T, dist)
            self.weights -= coord_descent

            # soft thresholding operator
            upper = self.weights > softTresh  # elem to be reduced
            lower = self.weights < -softTresh # elem to be increased
            self.weights[upper] -= softTresh
            self.weights[lower] += softTresh
            self.weights[~upper & ~lower] = 0

        return self
    
    def predict(self, X):
        if self.weights is None:
            raise Exception("weights not initialised! need to first fit the model")
        return np.matmul(X, self.weights)

In [21]:
lr = lasso_regression(iterations=10)
lr.fit(trainVal_data, trainVal_values, 0.8)
np.where(lr.weights)[0]

array([  0,  10,  27,  49,  51,  71,  91,  92,  98, 101])

In [22]:
pred = lr.predict(test_data)
rem = Regression_evaluationMetric(test_values, pred)

print("Residual variance: {}".format(np.var(test_values-pred)))

print("Root Mean Square Error: {}".format(rem.rootMeanSquareError()))
print("R^2 score: {}".format(rem.rSquared()))

Residual variance: 0.02762789280250509
Root Mean Square Error: 0.8615803238857713
R^2 score: -25.649988847827537


### SkLearn Class

In [23]:
from sklearn.linear_model import Lasso

lasso = Lasso(alpha=0.005)
lasso.fit(trainVal_data, trainVal_values)
np.where(lasso.coef_)[0]

array([  2,   3,  11,  38,  44,  50,  74,  76,  93,  94, 100])

In [24]:
pred = lasso.predict(test_data)
rem = Regression_evaluationMetric(test_values, pred)

print("Residual variance: {}".format(np.var(test_values-pred)))

print("Root Mean Square Error: {}".format(rem.rootMeanSquareError()))
print("R^2 score: {}".format(rem.rSquared()))

Residual variance: 0.049602139168549705
Root Mean Square Error: 0.10064093434112195
R^2 score: 0.6363741017053486


## 3. Random Forest

### Decision Tree project class definition

In [25]:
class NumericalDecisionTree_regressor: # Least Square Regression Tree with fixed parameter (no pruning)
    class Node:
        def __init__(self, isLeaf=False, feature=None, cut=None, average=None, left=None, right=None):
            self.isLeaf = isLeaf
            self.feature = feature
            self.cut = cut
            self.avg = average
            self.left = left
            self.right = right

        def print_tree(self):
            if self.left: self.left.print_tree()
            if self.cut:
                print("Feature: {}, cut: {}\n".format(self.feature, self.cut))
            else:
                print("Leaf => {}\n".format(self.avg))
            if self.right: self.right.print_tree()

        def print_tree_indented(self, level=0):
            if self.right: self.right.print_tree_indented(level+1)
            if self.cut:
                print("|    "*level+"{} => {}".format(self.feature, self.cut))
            else:
                print("|    "*level+"Leaf: {}".format(self.avg))                
            if self.left: self.left.print_tree_indented(level+1)
            
    def __init__(self):
        self.root = None
        
    def fit(self, X, y, depth, minElem_perLeaf, pruning=False):
        if not pruning:
            self.root = self.learn(X, y, depth, minElem_perLeaf)
        else:
            # train dataset, pruning dataset
            X_trn, X_val, y_trn, y_val = trainTest_split(X, y, train_amount=0.7)
            self.root = self.learn(X_trn, y_trn, depth, minElem_perLeaf)
            self.prune(X_val, y_val)
        
        return self
        
    def learn(self, X, y, depth, minElem_perLeaf):
        n, d = X.shape

        if depth==0 or n<=minElem_perLeaf: # leaf # or fraction error of the root node??? 
            return self.Node(isLeaf=True, average=np.mean(y))
            
        best_costDescent = 0 # split that maximise the error descent

        for i1 in range(d):
            sorted_idx = np.argsort(X[:,i1])
            sorted_x, sorted_y = X[sorted_idx, i1], y[sorted_idx]

            s_right, s_left = np.sum(sorted_y), 0
            n_right, n_left = n, 0

            for i2 in range(n-1):
                s_left += sorted_y[i2]
                s_right -= sorted_y[i2]
                n_left += 1
                n_right -= 1
                
                if sorted_x[i2]<sorted_x[i2+1]: # for a different value
                    # try to maximise this value: it is directly correlated 
                    # to the possible split information gain
                    new_costDescent = (s_left**2)/n_left + (s_right**2)/n_right
                    if new_costDescent > best_costDescent:
                        best_costDescent = new_costDescent
                        best_feature = i1
                        best_cut = (sorted_x[i2]+sorted_x[i2+1])/2

        left_idxs = X[:,best_feature] < best_cut

        return self.Node(feature=best_feature, cut=best_cut, average=np.mean(y),
                         left = self.learn(X[left_idxs], y[left_idxs], depth-1, minElem_perLeaf),
                         right = self.learn(X[~left_idxs], y[~left_idxs], depth-1, minElem_perLeaf))
    
    def prune(self, X, y):
        # for statistics purposes check errors on different dataset portions and average them
        # in order to decide whether to prune or not (same code of k-fold cross-validation)
        n,_ = X.shape
        folds = 5
        elemPerFold, remainder = np.divmod(n, folds)
        foldsIdxsList = []
        start = 0
        for i in range(folds): 
            end = start+elemPerFold+int(remainder>0)
            foldsIdxsList.append(np.arange(start,end)) 
            remainder -= 1
            start = end
        
        # recursive: start checking if the root receive a possible positive pruning from its sons
        self.test_pruning(self.root, X, y, foldsIdxsList)
        return self
    
    def test_pruning(self, node, X, y, foldIdxs):
        if node.isLeaf: # leaf: start point of new pruning
            return True
        
        # check sons response: if they both are positive to be pruned it means that the current node
        # might need to become a leaf and not split deeper anymore
        if self.test_pruning(node.left, X, y, foldIdxs) and self.test_pruning(node.right, X, y, foldIdxs):
            
            folds = len(foldIdxs)
            results = np.empty(folds)
            
            # not pruned errors
            for i, idxs in enumerate(foldIdxs):
                pred = self.predict(X[idxs])
                results[i] = Regression_evaluationMetric(true=y[idxs], predicted=pred).rootMeanSquareError()
            
            not_prunErr = np.mean(results)
            
            # pruned errors
            node.isLeaf = True
            for i, idxs in enumerate(foldIdxs):
                pred = self.predict(X[idxs])
                results[i] = Regression_evaluationMetric(true=y[idxs], predicted=pred).rootMeanSquareError()
            
            # if pruning improves the prediction RMSE then keep current node as leaf
            node.isLeaf = np.mean(results) <= not_prunErr
            return node.isLeaf
        
        # else one of the sons performs a good predictive split: it must not be pruned
        return False
    
    def predict(self, X):
        if self.root is None:
            raise Exception("Tree not initialised! need to first fit the model")

        n = X.shape[0]
        y = np.empty(n)
        
        for i in range(n):
            current = self.root
            while not current.isLeaf:
                if X[i,current.feature] < current.cut:
                    current = current.left
                else:
                    current = current.right
                
            y[i] = current.avg
        
        return y
                
    def pprint(self):
        self.root.print_tree_indented()

In [26]:
ndt = NumericalDecisionTree_regressor()
ndt.fit(trainVal_data, trainVal_values, depth=5, minElem_perLeaf=10)
ndt.pprint()

|    |    |    |    |    Leaf: -0.02019552504390083
|    |    |    |    42 => -0.17872340425531918
|    |    |    |    |    Leaf: -0.3695978146131185
|    |    |    2 => -0.5286024619840696
|    |    |    |    |    Leaf: -0.6520880966297086
|    |    |    |    3 => 0.35317173800928303
|    |    |    |    |    Leaf: -0.41632397433009594
|    |    51 => -0.9955245327332757
|    |    |    |    |    Leaf: -0.45089029447259765
|    |    |    |    58 => -0.9755327545382794
|    |    |    |    |    Leaf: -0.732453525195108
|    |    |    74 => -0.6731010278265229
|    |    |    |    |    Leaf: -0.6230602283608756
|    |    |    |    10 => -0.992874626974923
|    |    |    |    |    Leaf: -0.793282857981379
|    43 => -0.4063429137760157
|    |    |    Leaf: 0.5643506924661275
|    |    100 => -0.33296521258972944
|    |    |    Leaf: -0.08904033288233844
50 => -0.5638025594149909
|    |    |    Leaf: -0.26444300165332085
|    |    69 => 0.5410706545296922
|    |    |    |    |    Leaf: -0.635

In [27]:
pred = ndt.predict(test_data)
rem = Regression_evaluationMetric(test_values, pred)

print("Residual variance: {}".format(np.var(test_values-pred)))

print("Root Mean Square Error: {}".format(rem.rootMeanSquareError()))
print("R^2 score: {}".format(rem.rSquared()))

Residual variance: 0.05715938965530367
Root Mean Square Error: 0.10766353387041577
R^2 score: 0.5838568518087555


In [28]:
ndt = NumericalDecisionTree_regressor()
ndt.fit(trainVal_data, trainVal_values, depth=100, minElem_perLeaf=10, pruning=True)
ndt.pprint()

|    |    |    |    |    Leaf: 0.5063225168166138
|    |    |    |    38 => 0.28077994428969355
|    |    |    |    |    Leaf: 0.08044002184797978
|    |    |    100 => -0.5212589729431253
|    |    |    |    |    |    Leaf: 0.054104036176620345
|    |    |    |    |    88 => 0.38118811881188097
|    |    |    |    |    |    Leaf: -0.26555895642034694
|    |    |    |    96 => 0.6098484848484849
|    |    |    |    |    Leaf: -0.40729551748353676
|    |    50 => -0.14040219378427793
|    |    |    |    Leaf: -0.6968754738759112
|    |    |    8 => -0.49423533401152936
|    |    |    |    |    Leaf: -0.1376979237057943
|    |    |    |    4 => -0.6630680828835103
|    |    |    |    |    |    Leaf: -0.27286860736795154
|    |    |    |    |    71 => -0.8497267443206817
|    |    |    |    |    |    |    |    Leaf: -0.2557267345199134
|    |    |    |    |    |    |    31 => -0.7490651192778852
|    |    |    |    |    |    |    |    Leaf: -0.4761723045938175
|    |    |    |    |    |  

In [29]:
pred = ndt.predict(test_data)
rem = Regression_evaluationMetric(test_values, pred)

print("Residual variance: {}".format(np.var(test_values-pred)))

print("Root Mean Square Error: {}".format(rem.rootMeanSquareError()))
print("R^2 score: {}".format(rem.rSquared()))

Residual variance: 0.05979815613844872
Root Mean Square Error: 0.12097742246932931
R^2 score: 0.4745708761706281


### Decision Tree SkLearn Class

In [30]:
from sklearn.tree import DecisionTreeRegressor

dtr = DecisionTreeRegressor()
dtr.fit(trainVal_data, trainVal_values)
np.flip(np.argsort(dtr.feature_importances_))

array([ 50,  43,  51,   2, 100,  69,  74,  49,  42,   3,   8,  36,  65,
        10,  85,  58,  32,  66,  71,  40,  82,  41,  48,  88,   0,  37,
        46,  92,  44,  47,  86,  99,  28,  24,  39,  22,   9,  52,  84,
        54,  79,  14,  21,  94,  81,  76,  93,  45,  91,  15,  75,  18,
        16,  35,  95,  11,  90,   7,  67,  60,  96,  83,  62,  26,  38,
        61,  98,   5,   1,  33,  29,  77,  31,  73,  53,  27,  19,  59,
        55,  23,  30,  17,  78,  72,  89,  57,  64,   4,  56,  25,  68,
        63,  87,  34,   6,  80,  97, 101,  20,  13,  12,  70])

In [31]:
pred = dtr.predict(test_data)
rem = Regression_evaluationMetric(test_values, pred)

print("Residual variance: {}".format(np.var(test_values-pred)))

print("Root Mean Square Error: {}".format(rem.rootMeanSquareError()))
print("R^2 score: {}".format(rem.rSquared()))

Residual variance: 0.06408277326879126
Root Mean Square Error: 0.11786492244595967
R^2 score: 0.5012594997718978


### Random Forest project class definition

In [32]:
class NumericalRandomForest_regressor: # post-pruning (kinda) with cross-validation or greedy 
    def __init__(self, n_trees):
        self.n_trees = n_trees
        self.trees = []
        self.boot_samplesIdxs = []
        self.oob_error = None
        
    def fit(self, X, y, depth, minElems_perLeaf):
        n = X.shape[0]
        n_learn = int(n/3) # Bootstrap amount to be taken aside
        
        #val_folds = 3
        #params_dict = {"depth":depths, "minElem_perLeaf":minElems_perLeaf}
        
        # Fitting the forest -----------------------------------
        for i in range(self.n_trees):
            print("Fitting #{} tree".format(i+1))
            
            bootstrap_idxs = np.sort(np.random.permutation(n)[:n_learn])
            self.boot_samplesIdxs.append(bootstrap_idxs)
                        
            dt = NumericalDecisionTree_regressor()
            
            # find the best hyp-par for the current setting (bootstrapping)
            #win_params = kFold_crossValidation_selectionGrid(val_folds, params_dict, 
            #                                                 X[~bootstrap_idxs], y[~bootstrap_idxs],
            #                                                 dt, verbose=True)
            #self.trees.append(dt.fit(X[~bootstrap_idxs], y[~bootstrap_idxs],
            #                         depth=win_params[0], minElem_perLeaf=win_params[1]))
            
            self.trees.append(dt.fit(X[~bootstrap_idxs], y[~bootstrap_idxs],
                                     depth=depth, minElem_perLeaf=minElems_perLeaf, pruning=True))
        
        # Out-Of-Bag Estimate for the forest -------------------
        oob_errors = []
        for sampleIdx in range(n):
            missingBoot_TreesIdx = [idx for idx,bootstrap_idxs in enumerate(self.boot_samplesIdxs) 
                                    if sampleIdx not in bootstrap_idxs]

            if len(missingBoot_TreesIdx) == 0: continue
            
            regr_results = np.empty(len(missingBoot_TreesIdx)) # regression estimate of the selected trees
            for i, missing_tree in enumerate(missingBoot_TreesIdx):
                # reshape in order to correctly use the decision_tree.predict(...): it needs a matrix (num,dim)
                # while numpy matrix indexing returns (dim,)
                regr_results[i] = self.trees[missing_tree].predict(X[sampleIdx].reshape(1,-1))
            
            # done at this level of granularity because a sample might end up in 
            # being part of no bootstrap set of any tree (so we cannot predict wich value in y will be used)
            oob_errors.append(np.square(y[sampleIdx]-np.mean(regr_results)))
            #oob_errors.append(r2_score(np.mean(regr_results),y[sampleIdx]))
            #oob_errors.append(explained_variance_score(np.mean(regr_results),y[sampleIdx]))
            
        self.oob_error = np.sqrt(np.mean(oob_errors))
        return self
            
        
    def predict(self,X):
        if len(self.trees)==0:
            raise Exception("Trees not initialised! need to first fit the model")

        n = X.shape[0]
        results = np.empty((self.n_trees,n))
        for row, tree in enumerate(self.trees):
            results[row] = tree.predict(X)
            
        return np.mean(results,axis=0)

In [33]:
nrf = NumericalRandomForest_regressor(3)
# no train and test, cause it's a forest
nrf.fit(data, values, depth=100, minElems_perLeaf=5);
nrf.oob_error

Fitting #1 tree
Fitting #2 tree
Fitting #3 tree


0.11144806836915046

### Random Forest SkLearn class

In [34]:
from sklearn.ensemble import RandomForestRegressor

rfr = RandomForestRegressor()
rfr.fit(trainVal_data, trainVal_values.ravel())
np.flip(np.argsort(rfr.feature_importances_))



array([ 50,  49,   3, 100,  51,  69,  91,  43,   2,  92,  38,  71,  41,
        99,  44,  93,  15,  10,   6,  40,  36,  88,  46,  61,  24,  89,
         4,  27,  63,  42,  75,  23,  96,  74,   8,  35,  26,  77,  68,
        86,  48,  67,  82,  65,  28,  22,  58,  34,  14,  52,  11,  73,
        78,   7,  32,   0,  95,  45,  25,  21,  20,  98,  47,  54,  64,
        90,  30,   5,  13,  60,  55,  94,  80,  17,  29,  59,  97,  18,
       101,  53,  33,  12,  56,   1,  16,  85,   9,  72,  19,  79,  83,
        39,  62,  37,  66,  76,  87,  31,  81,  57,  84,  70])

In [35]:
pred = rfr.predict(test_data)
rem = Regression_evaluationMetric(test_values, pred)

print("Residual variance: {}".format(np.var(test_values-pred)))

print("Root Mean Square Error: {}".format(rem.rootMeanSquareError()))
print("R^2 score: {}".format(rem.rSquared()))

Residual variance: 0.05576825819482322
Root Mean Square Error: 0.09610909195758337
R^2 score: 0.6683847951893649


# Predictors

## 1. Regularised Least Squares
   

In [36]:
class tikhonov_leastSquares:
    def __init__(self, weights = None):
        self.weights = weights
        
    def fit(self, X, y, _lambda):
        inv = np.linalg.inv(np.matmul(X.T, X) + _lambda*np.eye(X.shape[1]))
        self.weights = np.matmul(inv, np.matmul(X.T, y))
        return self
    
    def predict(self, X):
        if self.weights is None:
            raise Exception("weights not initialised! need to first fit the model")
        return np.matmul(X, self.weights)

In [37]:
k = 5
params_dict = {"_lambda":[2,2.05,2.1,2.2,3]}

tls = tikhonov_leastSquares()

win_regulariser = kFold_crossValidation_selectionGrid(k, params_dict, trainVal_data, trainVal_values, tls)
tls.fit(trainVal_data, trainVal_values, win_regulariser)
pred = tls.predict(test_data)

rem = Regression_evaluationMetric(test_values, pred)

print("Residual variance: {}".format(np.var(test_values-pred)))

print("Root Mean Square Error: {}".format(rem.rootMeanSquareError()))
print("R^2 score: {}".format(rem.rSquared()))

Residual variance: 0.009269216592095427
Root Mean Square Error: 0.09711437801726346
R^2 score: 0.6614112274436692


## 2. Random Forest

In [38]:
rf = NumericalRandomForest_regressor(5)
rf.fit(trainVal_data, trainVal_values, depth=100, minElems_perLeaf=10);

pred = rf.predict(test_data)
rem = Regression_evaluationMetric(test_values, pred)

print("Residual variance: {}".format(np.var(test_values-pred)))

print("Root Mean Square Error: {}".format(rem.rootMeanSquareError()))
print("R^2 score: {}".format(rem.rSquared()))

Fitting #1 tree
Fitting #2 tree
Fitting #3 tree
Fitting #4 tree
Fitting #5 tree
Residual variance: 0.053474362312960155
Root Mean Square Error: 0.10121891146292093
R^2 score: 0.6321855288668612


In [39]:
rf.oob_error

0.11119805045257683

## 3. SVM

In [40]:
class linear_SupportVector_regression:
    def __init__(self, weight=None, alpha=None, bias=None):
        self.x = alpha
        self.w = weight
        self.bias = bias
        self.Nabla = None
                
    def SMO2_ab(self, n, H, f, a, LB, UB, maxiter, eps, alpha_s):
        """
        % min_{x} .5 x H x + f' x 
        %         LB <= x <= UB
        %         a' x = b
        % n         grandezza problema length(x)
        % maxiter   max num it
        % eps       precisione
        % alpha_s   punto di inizio valido per x
        % Nabla     ....
        % err       flag di ok
        % x         valore della soluzione ottima
        % bias      ....
        """
        self.x = alpha_s
        self.Nabla = f
        for i in range(n):
            if (self.x[i] != 0.0):
                for j in range(n):
                    self.Nabla[j] += H[j,i] * self.x[i]
        iter_ = 0
        while True:
            minF_up = float("inf");
            maxF_low = float("-inf");
            for i in range(n): 
                F_i = self.Nabla[i]/a[i]
                if (LB[i] < self.x[i]) and (self.x[i] < UB[i]) :
                    if (minF_up > F_i):
                        minF_up = F_i
                        u = i
                    if (maxF_low < F_i):
                        maxF_low = F_i
                        v = i
                elif (((a[i] > 0) and (self.x[i] == LB[i])) or ((a[i] < 0) and (self.x[i] == UB[i]))) : 
                    if (minF_up > F_i):
                        minF_up = F_i
                        u = i
                elif (((a[i] > 0) and (self.x[i] == UB[i])) or ((a[i] < 0) and (self.x[i] == LB[i]))) : 
                    if (maxF_low < F_i):
                        maxF_low = F_i
                        v = i
            if ((maxF_low - minF_up) <= eps):
                err = 0.0
                break

            iter_ += 1
            if (iter_ >= maxiter):
                err = 1.0
                break

            if (a[u] > 0):
                tau_lb = (LB[u]-self.x[u])*a[u] 
                tau_ub = (UB[u]-self.x[u])*a[u] 
            else:
                tau_ub = (LB[u]-self.x[u])*a[u] 
                tau_lb = (UB[u]-self.x[u])*a[u]

            if (a[v] > 0):
                tau_lb = max(tau_lb,(self.x[v]-UB[v])*a[v]) 
                tau_ub = min(tau_ub,(self.x[v]-LB[v])*a[v]) 
            else:
                tau_lb = max(tau_lb,(self.x[v]-LB[v])*a[v]) 
                tau_ub = min(tau_ub,(self.x[v]-UB[v])*a[v])

            tau = (self.Nabla[v]/a[v]-self.Nabla[u]/a[u])/(H[u,u]/(a[u]*a[u])
                                                           +H[v,v]/(a[v]*a[v])
                                                           -2*H[v,u]/(a[u]*a[v]))
            tau = min(max(tau,tau_lb),tau_ub)
            self.x[u] += tau/a[u]
            self.x[v] -= tau/a[v]

            for i in range(n):
                self.Nabla[i] += H[u,i]*tau/a[u] - H[v,i]*tau/a[v]

        tsv = 0
        self.bias = 0.0

        for k in range(n):
            if ((self.x[k] > LB[k]) and (self.x[k] < UB[k])):
                self.bias -= self.Nabla[k]/a[k]
                tsv += 1

        if (tsv > 0):
            self.bias /= tsv
        else:    
            self.bias = -(maxF_low + minF_up)/2.0

        return err
    
    def fit(self, X, y, C):
        n = X.shape[0]
        cov = np.matmul(X, X.T)
        Q = np.matmul(np.matmul(np.diag(y.flatten()), cov),
                      np.diag(y.flatten()))
        
        if self.SMO2_ab(n,Q,-np.ones(n),y.flatten(),
                   np.zeros(n),C*np.ones(n),10000000,.0001,np.zeros(n)):
            print("Problem in SMO")
            
        self.w = np.matmul(np.matmul(X.T, np.diag(y.flatten())),
                           self.x)
        
        return self
    
    def predict(self, X):
        return np.matmul(X, self.w) + self.bias

In [41]:
lsvr = linear_SupportVector_regression()
lsvr.fit(trainVal_data, trainVal_values, C=1.0);

pred = lsvr.predict(test_data)
rem = Regression_evaluationMetric(test_values, pred)

print("Residual variance: {}".format(np.var(test_values-pred)))

print("Root Mean Square Error: {}".format(rem.rootMeanSquareError()))
print("R^2 score: {}".format(rem.rSquared()))

Residual variance: 5.6294441029468105
Root Mean Square Error: 8.921663229705644
R^2 score: -2856.5717143966876


In [42]:
np.where(lsvr.x)

(array([  10,   43,   97,  110,  112,  114,  121,  122,  129,  244,  246,
         250,  258,  267,  299,  319,  373,  382,  387,  421,  459,  499,
         546,  547,  564,  579,  594,  618,  631,  699,  842,  908,  921,
         986, 1002, 1013, 1031, 1076, 1103, 1119, 1121, 1186, 1205, 1260,
        1302, 1420, 1437, 1444, 1469, 1489]),)

In [43]:
from sklearn.svm import SVR
svr = SVR(kernel="linear", tol=.0001, C=1)
svr.fit(trainVal_data, trainVal_values.flatten());

In [44]:
np.where(svr.dual_coef_)[1]

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

In [45]:
pred = svr.predict(test_data)
rem = Regression_evaluationMetric(test_values, pred)

print("Residual variance: {}".format(np.var(test_values-pred)))

print("Root Mean Square Error: {}".format(rem.rootMeanSquareError()))
print("R^2 score: {}".format(rem.rSquared()))

Residual variance: 0.06464753396419612
Root Mean Square Error: 0.1109680542840853
R^2 score: 0.5579194281626387
