In [66]:
import numpy as np
import pandas as pd

# cross validation purposes: create the cartesian product between the chosen values sets
from itertools import product 

#import os
#import seaborn as sns
#import matplotlib.pyplot as plt
#%matplotlib inline

# Read Dataset

In [67]:
cmp = pd.read_csv("commViolUnnormData.txt", na_values='?')

In [68]:
# drop first non predictive features (communityname, state, countyCode, communityCode, "fold")
pred_features = cmp[cmp.columns[5:-18]]
regr_values = cmp[cmp.columns[-18:]]

# Drop features with a lot of missing values

In [69]:
print("Before dropping: {} features".format(str(pred_features.shape[1])))

#drop features that contain at least some threshold (from the total) of NaN values
cut_tresh = 0.75
to_drop = pred_features.columns[pred_features.count() < pred_features.shape[0]*cut_tresh]

pred_features = pred_features.drop(columns=to_drop)

print("After dropping: {} features".format(str(pred_features.shape[1])))

Before dropping: 124 features
After dropping: 102 features


# Imputing on features matrix

In [70]:
from collections import Counter

def value_withStrategy(v, strat):
    if strat == "mean":
        return np.mean(v)
    if strat == "median":
        return np.median(v)
    if strat == "most_frequent":
        return Counter(v).most_common(1)[0][0]
    print("Invalid imputing strategy!")
        
def imputing(df, strategy):
    # for each column that contain at least 1 NaN value...
    for nanCol in np.unique(np.where(pred_features.isna())[1]):
        nanRows = np.where(pred_features.iloc[:,nanCol].isna())[0] #find NaN rows for the current column
        available = df.iloc[~nanRows, nanCol]
        value = value_withStrategy(available, strategy) #compute the filling value
        df.iloc[nanRows, nanCol] = value

In [71]:
imputing(pred_features, "mean");

# Outliers Detection

-- TBD <br>
A thourough study from scratch of outliers detection is needed here, but for now it feels like it exceeds the course final project.

# Choose the Dependent Variable and drop possible missing values rows on it

In [72]:
def drop_naSample(df, vals):
    idxRow = np.where(vals.isna())[0]
    return df.drop(index=idxRow).values, vals.drop(index=idxRow).values.reshape(-1,1)

In [73]:
dep_var = "robbPerPop"
data,values = drop_naSample(pred_features, regr_values[dep_var])

# Normalisation

In [74]:
def normalise(matrix, strat):
    for j in range(matrix.shape[1]):
        mi = np.min(matrix[:,j])
        ma = np.max(matrix[:,j])
        di = ma-mi
        if (di > 1e-6):
            if strat=="0_mean,1_std":
                matrix[:,j] = (matrix[:,j]-np.mean(matrix[:,j]))/np.std(matrix[:,j])
            elif strat=="[0,1]":
                matrix[:,j] = (matrix[:,j]-mi)/di
            elif strat=="[-1,1]":
                matrix[:,j] = 2*((matrix[:,j]-mi)/di)-1
            else:
                print("Invalid normalisation strategy!")
        else:
            matrix[:,j] = 0

In [75]:
strategy = "[-1,1]"
normalise(data,strategy)
normalise(values,strategy)

# Train-Test Split

In [76]:
def trainTest_split(in_matrix, out_vect, train_amount=0.7):
    n,_ = in_matrix.shape

    trVl_Amount = int(n*train_amount) #training-validation amount
    indexes = np.random.permutation(n)
    idxTrVl = np.sort(indexes[0:trVl_Amount])
    idxTs = np.sort(indexes[trVl_Amount:])

    return in_matrix[idxTrVl], in_matrix[idxTs], out_vect[idxTrVl], out_vect[idxTs]

In [77]:
trainVal_data, test_data, trainVal_values, test_values = trainTest_split(data, values, train_amount=0.7)

# Evaluation Metrics

In [78]:
class Regression_evaluationMetric:
    def __init__(self, true, predicted):
        self.true = true.flatten()
        self.predicted = predicted.flatten()
        self.residuals = self.true-self.predicted
    
    def meanSquareError(self):
        return np.mean(np.square(self.residuals))
    
    def rootMeanSquareError(self):
        return np.sqrt(np.mean(np.square(self.residuals)))
    
    def meanAbsoluteError(self):
        return np.mean(np.abs(self.residuals))
    
    def rSquared(self):
        ss_residual = np.sum(np.square(self.residuals))
        ss_total = np.sum(np.square(self.true-np.mean(self.true)))        
        return 1 - ss_residual/ss_total
    
    def adjusted_rSquared(self, p):
        n = self.true.shape[0]
        return 1-(1-self.rSquared)*((n-1)/(n-p-1))


# Variable Selection

## 0. K-fold Cross Validation

In [79]:
from itertools import product

def kFold_crossValidation_selectionGrid(k, parameters_dict, train_data, train_values, predictor, verbose=False):
    nVal,_ = train_data.shape
    
    # Validation indexes adjustment -------------------------------
    elemPerFold, remainder = np.divmod(nVal,k) #the remainder will be distributed across the firsts folds
    valIdxList = []
    start = 0

    # in each fold put as many samples as the division quotient +1 if the remainder is still positive
    # then decrease the division remainder by 1
    for i in range(k): 
        end = start+elemPerFold+int(remainder>0)
        valIdxList.append(np.arange(start,end)) 
        remainder -= 1
        start = end
    
    # Cross validation --------------------------------------------
    params_names = parameters_dict.keys()
    params_product = list(product(*parameters_dict.values())) # build all the hyp-par combination
    val_results = np.empty((len(valIdxList),len(params_product)))
    
    for row, valIdx in enumerate(valIdxList): # for each fold
        if verbose: print("#{} fold:".format(row+1))
        for col, params in enumerate(params_product):
            
            if verbose:
                update = col*100/len(params_product) # just print completion rate
                print("\t["+"#"*(int(update/5))+" "*(int((100-update)/5))+"] {}%".format(update))
                     
            arg_dict = {k:v for k,v in zip(params_names,params)} # {argument_name:argument_value, ... }
            
            
            predictor.fit(train_data[~valIdx], train_values[~valIdx], **arg_dict)
            pred = predictor.predict(train_data[valIdx])
            
            rem = Regression_evaluationMetric(trainVal_values[valIdx], pred)
            #val_results[row,col] = rem.rSquared()
            val_results[row,col] = rem.rootMeanSquareError()
            
    selected = np.argmin(val_results.mean(axis=0))
    return params_product[selected]

## 1. Matching Pursuit

### Project class definition

In [80]:
class matchingPursuit:
    def __init__(self, iterations, weights = None, indexes = None):
        self.iterations = iterations
        self.weights = weights
        self.indexes = indexes
        
    def fit(self, data_matrix, output_vect):
        residual = output_vect.copy()
        self.weights = np.zeros((data_matrix.shape[1], 1))
        self.indexes = []

        #data_2norm = np.sqrt(np.sum(np.square(data_matrix), axis=0))
        data_2norm = np.linalg.norm(data_matrix, ord=2, axis=0).reshape(1,-1)

        for i in range(self.iterations):
            
            # project each column on the current residuals
            projection = np.matmul(residual.T, data_matrix)
            # find the most correlated variable
            k = np.argmax(np.divide(np.square(projection), data_2norm))
            self.indexes.append(k)
            
            distance = projection[0,k]/np.linalg.norm(data_matrix[:,k], ord=2)
            self.weights[k,0] += distance # update the solution vector: canonical basis over the found column
            residual -= np.matmul(data_matrix, self.weights) # update the residual

        return self
    
    def predict(self, X):
        if self.weights is None:
            raise Exception("weights not initialised! need to first fit the model")
        return np.matmul(X, self.weights)

In [81]:
mp = matchingPursuit(iterations=10)
mp.fit(trainVal_data, trainVal_values)
np.where(mp.weights)[0]

array([49, 92])

In [82]:
pred = mp.predict(test_data)
rem = Regression_evaluationMetric(test_values, pred)

print("Residual variance: {}".format(np.var(test_values-pred)))

print("Root Mean Square Error: {}".format(rem.rootMeanSquareError()))
print("R^2 score: {}".format(rem.rSquared()))

Residual variance: 1.3142650125262349e+29
Root Mean Square Error: 4621874818400587.0
R^2 score: -5.8928653264817184e+32


### SkLearn Class

In [83]:
from sklearn.linear_model import orthogonal_mp
omp_coef = orthogonal_mp(trainVal_data, trainVal_values)
np.where(omp_coef)[0]

array([ 3, 11, 34, 38, 50, 76, 77, 92, 93, 94])

In [84]:
pred = np.matmul(test_data, omp_coef)
rem = Regression_evaluationMetric(test_values, pred)

print("Residual variance: {}".format(np.var(test_values-pred)))

print("Root Mean Square Error: {}".format(rem.rootMeanSquareError()))
print("R^2 score: {}".format(rem.rSquared()))

Residual variance: 0.06476593732076837
Root Mean Square Error: 0.10910037408315335
R^2 score: 0.671645752809883


## 2. L1 Penalty (Lasso)

### Project class definition

In [85]:
class lasso_regression: # Iterative Soft Thresholding Algorithm (Proximal Gradient)
    def __init__(self, iterations, weights=None):
        self.iterations = iterations
        self.weights = weights
        
    def fit(self, data_matrix, output_vect, _lambda):
        self.weights = np.zeros((data_matrix.shape[1],1))
        n = data_matrix.shape[0]
        # convergence step-size: n/(2*||X^t*X||_2)
        step = n/(2*np.linalg.norm(np.matmul(data_matrix.T, data_matrix), ord=2))
        softTresh = step*_lambda

        for i in range(self.iterations):
            # gradient step of the lasso formulation
            dist = np.matmul(data_matrix, self.weights) - output_vect
            coord_descent = (step/n)*np.matmul(data_matrix.T, dist)
            self.weights -= coord_descent

            # soft thresholding operator
            upper = self.weights > softTresh  # elem to be reduced
            lower = self.weights < -softTresh # elem to be increased
            self.weights[upper] -= softTresh
            self.weights[lower] += softTresh
            self.weights[~upper & ~lower] = 0

        return self
    
    def predict(self, X):
        if self.weights is None:
            raise Exception("weights not initialised! need to first fit the model")
        return np.matmul(X, self.weights)

In [86]:
lr = lasso_regression(iterations=10)
lr.fit(trainVal_data, trainVal_values, 0.8)
np.where(lr.weights)[0]

array([  0,  10,  27,  49,  51,  71,  91,  92,  98, 101])

In [87]:
pred = lr.predict(test_data)
rem = Regression_evaluationMetric(test_values, pred)

print("Residual variance: {}".format(np.var(test_values-pred)))

print("Root Mean Square Error: {}".format(rem.rootMeanSquareError()))
print("R^2 score: {}".format(rem.rSquared()))

Residual variance: 0.035949252124582956
Root Mean Square Error: 0.8445367975114345
R^2 score: -18.675569556094118


### SkLearn Class

In [88]:
from sklearn.linear_model import Lasso

lasso = Lasso(alpha=0.005)
lasso.fit(trainVal_data, trainVal_values)
np.where(lasso.coef_)[0]

array([  3,  11,  38,  44,  50,  76,  93,  94, 100])

In [89]:
pred = lasso.predict(test_data)
rem = Regression_evaluationMetric(test_values, pred)

print("Residual variance: {}".format(np.var(test_values-pred)))

print("Root Mean Square Error: {}".format(rem.rootMeanSquareError()))
print("R^2 score: {}".format(rem.rSquared()))

Residual variance: 0.05678934411253694
Root Mean Square Error: 0.11321034607324221
R^2 score: 0.6464405946799007


## 3. Random Forest

### Decision Tree project class definition

In [147]:
class NumericalDecisionTree_regressor: # Least Square Regression Tree with either fixed parameter or pruning
    class Node:
        def __init__(self, isLeaf=False, feature=None, feature_importance=None, cut=None, average=None,
                     left=None, right=None):
            self.isLeaf = isLeaf
            self.feature = feature # if internal, on wich feature it executes the split
            self.feature_importance = feature_importance # solution variance reduction
            self.cut = cut # if internal, threahold value for the cut
            self.avg = average # mean of seen training values
            self.left = left
            self.right = right

        def print_tree(self):
            if self.left: self.left.print_tree()
            if self.cut:
                print("Feature: {}, cut: {}\n".format(self.feature, self.cut))
            else:
                print("Leaf => {}\n".format(self.avg))
            if self.right: self.right.print_tree()

        def print_tree_indented(self, level=0):
            if self.right: self.right.print_tree_indented(level+1)
            if self.cut:
                print("|    "*level+"{} => {}".format(self.feature, self.cut))
            else:
                print("|    "*level+"Leaf: {}".format(self.avg))                
            if self.left: self.left.print_tree_indented(level+1)
            
    def __init__(self, root=None, feature_importances=None):
        self.root = root
        self.feature_importances = feature_importances

        
    def fit(self, X, y, depth, minElem_perLeaf, pruning=False):
        
        self.feature_importances = {k:0 for k in range(X.shape[1])}
        
        if not pruning:
            self.root = self.learn(X, y.flatten(), depth, minElem_perLeaf)
        else:
            # train dataset, pruning dataset
            X_trn, X_val, y_trn, y_val = trainTest_split(X, y.flatten(), train_amount=0.7)
            self.root = self.learn(X_trn, y_trn, depth, minElem_perLeaf)
            self.prune(X_val, y_val)
        
        return self
        
    def learn(self, X, y, depth, minElem_perLeaf):
        n, d = X.shape

        if depth==0 or n<=minElem_perLeaf: # leaf # or fraction error of the root node??? 
            return self.Node(isLeaf=True, average=np.mean(y))
            
        best_costDescent = 0 # split that maximise the error descent

        for i1 in range(d):
            sorted_idx = np.argsort(X[:,i1])
            sorted_x, sorted_y = X[sorted_idx, i1], y[sorted_idx]

            s_right, s_left = np.sum(sorted_y), 0
            n_right, n_left = n, 0

            for i2 in range(n-1):
                s_left += sorted_y[i2]
                s_right -= sorted_y[i2]
                n_left += 1
                n_right -= 1
                
                if sorted_x[i2]<sorted_x[i2+1]: # for a different value
                    # try to maximise this value: it is directly correlated 
                    # to the possible split information gain
                    new_costDescent = (s_left**2)/n_left + (s_right**2)/n_right
                    if new_costDescent > best_costDescent:
                        best_costDescent = new_costDescent
                        best_feature = i1
                        best_cut = (sorted_x[i2]+sorted_x[i2+1])/2
                        
        # update the importance for the selected feature
        feature_importance = np.var(y) - (np.sum(np.square(y))-best_costDescent)/n
        self.feature_importances[best_feature] += feature_importance

        left_idxs = X[:,best_feature] < best_cut
        
        return self.Node(feature=best_feature, feature_importance=feature_importance,
                         cut=best_cut, average=np.mean(y),
                         left = self.learn(X[left_idxs], y[left_idxs], depth-1, minElem_perLeaf),
                         right = self.learn(X[~left_idxs], y[~left_idxs], depth-1, minElem_perLeaf))
    
    def prune(self, X, y):
        # for statistics purposes check errors on different dataset portions and average them
        # in order to decide whether to prune or not (same code of k-fold cross-validation)
        n,_ = X.shape
        folds = 5
        elemPerFold, remainder = np.divmod(n, folds)
        foldsIdxsList = []
        start = 0
        for i in range(folds): 
            end = start+elemPerFold+int(remainder>0)
            foldsIdxsList.append(np.arange(start,end)) 
            remainder -= 1
            start = end
        
        # recursive: start checking if the root receive a possible positive pruning from its sons
        self.test_pruning(self.root, X, y, foldsIdxsList)
        return self
    
    def test_pruning(self, node, X, y, foldIdxs):
        if node.isLeaf: # leaf: start point of new possible pruning
            return True
        
        # check sons response: if one of them is negative to be pruned it means that it performs an important
        # predictive split
        if not self.test_pruning(node.left, X, y, foldIdxs) or not self.test_pruning(node.right, X, y, foldIdxs):
            return False
        
        # else proceed with testing the goodness of the current node split
        folds = len(foldIdxs)
        results = np.empty(folds)

        # not pruned errors on different folds
        for i, idxs in enumerate(foldIdxs):
            pred = self.predict(X[idxs])
            results[i] = Regression_evaluationMetric(true=y[idxs], predicted=pred).rootMeanSquareError()

        not_prunErr = np.mean(results)

        # pruned errors on different folds
        node.isLeaf = True
        for i, idxs in enumerate(foldIdxs):
            pred = self.predict(X[idxs])
            results[i] = Regression_evaluationMetric(true=y[idxs], predicted=pred).rootMeanSquareError()

        # if pruning improves the prediction RMSE then keep current node as leaf
        node.isLeaf = np.mean(results) <= not_prunErr
        
        if node.isLeaf:
            # lower feature importance computed during training phase
            self.feature_importances[node.feature] -= node.feature_importance 
            node.left = None
            node.right = None
            
        return node.isLeaf
            
    def predict(self, X):
        if self.root is None:
            raise Exception("Tree not initialised! need to first fit the model")

        n,_ = X.shape
        y = np.empty(n)
        
        for i in range(n):
            current = self.root
            while not current.isLeaf:
                if X[i,current.feature] < current.cut:
                    current = current.left
                else:
                    current = current.right
                
            y[i] = current.avg
        
        return y
                
    def pprint(self):
        self.root.print_tree_indented()
        
    def print_featureImportances(self):
        print([(k,v) for k,v in sorted(self.feature_importances.items(), key=lambda kv: kv[1], reverse=True)])

In [149]:
ndt = NumericalDecisionTree_regressor()
ndt.fit(trainVal_data, trainVal_values, depth=5, minElem_perLeaf=10)
ndt.pprint()

|    |    |    |    |    Leaf: -0.06859882898361261
|    |    |    |    42 => -0.17021276595744683
|    |    |    |    |    Leaf: -0.3795427870865701
|    |    |    74 => -0.857107044372023
|    |    |    |    Leaf: -0.6101972943249725
|    |    44 => -0.5331179321486268
|    |    |    |    Leaf: 0.1907227549262237
|    |    |    37 => -0.6700290647974012
|    |    |    |    Leaf: 0.766500451240285
|    100 => -0.7800478556966685
|    |    |    |    |    Leaf: -0.4643401529895524
|    |    |    |    18 => -0.46325802615933415
|    |    |    |    |    Leaf: -0.2054652338867468
|    |    |    38 => 0.16768802228412272
|    |    |    |    |    Leaf: -0.35726011021157494
|    |    |    |    96 => 0.7990967365967365
|    |    |    |    |    Leaf: -0.6019697884586385
|    |    100 => -0.9556414503957298
|    |    |    |    Leaf: -0.5748053336160026
|    |    |    41 => 0.26237113402061873
|    |    |    |    |    Leaf: -0.8909879291383445
|    |    |    |    76 => 0.22916666666666663
|    | 

In [150]:
pred = ndt.predict(test_data)
rem = Regression_evaluationMetric(test_values, pred)

print("Residual variance: {}".format(np.var(test_values-pred)))

print("Root Mean Square Error: {}".format(rem.rootMeanSquareError()))
print("R^2 score: {}".format(rem.rSquared()))

Residual variance: 0.06540013440039279
Root Mean Square Error: 0.11599508981488645
R^2 score: 0.6288329888004036


In [151]:
ndt.print_featureImportances()

[(44, 0.06962163735211466), (37, 0.06215999166988643), (100, 0.06128364452888896), (74, 0.024432119208364744), (42, 0.024153879925208857), (50, 0.023355295640304405), (18, 0.01562792597514728), (38, 0.010954559491003331), (94, 0.007536197202554325), (0, 0.0073930942201939475), (96, 0.005234294939369014), (41, 0.004621503536219432), (91, 0.004310264244827965), (29, 0.004279966735905912), (51, 0.004264621889466011), (76, 0.0027973363054363144), (99, 0.0015393606728029072), (2, 0.001076901202835631), (3, 0.0009037672266171033), (93, 0.0005371235402911339), (40, 0.00036856784794784777), (1, 0), (4, 0), (5, 0), (6, 0), (7, 0), (8, 0), (9, 0), (10, 0), (11, 0), (12, 0), (13, 0), (14, 0), (15, 0), (16, 0), (17, 0), (19, 0), (20, 0), (21, 0), (22, 0), (23, 0), (24, 0), (25, 0), (26, 0), (27, 0), (28, 0), (30, 0), (31, 0), (32, 0), (33, 0), (34, 0), (35, 0), (36, 0), (39, 0), (43, 0), (45, 0), (46, 0), (47, 0), (48, 0), (49, 0), (52, 0), (53, 0), (54, 0), (55, 0), (56, 0), (57, 0), (58, 0), (59

In [152]:
ndt = NumericalDecisionTree_regressor()
ndt.fit(trainVal_data, trainVal_values, depth=100, minElem_perLeaf=10, pruning=True)
ndt.pprint()

|    |    |    |    Leaf: 0.19314924496384922
|    |    |    51 => -0.8367909142541375
|    |    |    |    |    Leaf: -0.646243811088586
|    |    |    |    52 => -0.24731684554363048
|    |    |    |    |    |    Leaf: -0.08567087578893451
|    |    |    |    |    17 => -0.3599910394265233
|    |    |    |    |    |    |    Leaf: -0.27978075463864716
|    |    |    |    |    |    101 => -0.5957886044591247
|    |    |    |    |    |    |    Leaf: -0.40994680391269817
|    |    69 => 0.3112267013437364
|    |    |    |    |    |    Leaf: -0.68162899951269
|    |    |    |    |    71 => -0.8584454530718106
|    |    |    |    |    |    Leaf: -0.8320970968981463
|    |    |    |    44 => 0.22065158858373723
|    |    |    |    |    |    |    Leaf: -0.5084631477285609
|    |    |    |    |    |    41 => 0.12268041237113414
|    |    |    |    |    |    |    Leaf: -0.6319842254837154
|    |    |    |    |    87 => -0.515451174289246
|    |    |    |    |    |    |    Leaf: -0.6670630661666

In [153]:
pred = ndt.predict(test_data)
rem = Regression_evaluationMetric(test_values, pred)

print("Residual variance: {}".format(np.var(test_values-pred)))

print("Root Mean Square Error: {}".format(rem.rootMeanSquareError()))
print("R^2 score: {}".format(rem.rSquared()))

Residual variance: 0.062098107613760584
Root Mean Square Error: 0.12414235461417542
R^2 score: 0.5748618135584248


In [154]:
ndt.print_featureImportances()

[(3, 0.08437559577034112), (100, 0.0719042140165036), (51, 0.05852751161056811), (50, 0.028449060341569465), (69, 0.023300662350671436), (49, 0.022838974518344147), (99, 0.01753242368465365), (38, 0.014861822507211856), (40, 0.01414864455260557), (18, 0.011671790902563834), (52, 0.01151880208301218), (44, 0.009497764506940141), (41, 0.007625530995009112), (17, 0.00585882344972834), (14, 0.00525367998654249), (101, 0.005179557155882869), (9, 0.005150846144585731), (71, 0.004923200784036352), (74, 0.004450117218989524), (2, 0.004295468469618582), (10, 0.00380481445178369), (86, 0.003765862203045354), (87, 0.0036370332167295315), (43, 0.0032514493548001545), (4, 0.003220307316960427), (34, 0.0029576817075927355), (89, 0.0027264630517829652), (63, 0.0026430998865404065), (68, 0.002213887521839729), (92, 0.0018548821129251252), (35, 0.0017368667150074425), (90, 0.001709278979930168), (36, 0.0016885144270813615), (24, 0.001526559610022017), (1, 0.0012000668319733843), (8, 0.00111978352565845

### Decision Tree SkLearn Class

In [156]:
from sklearn.tree import DecisionTreeRegressor

dtr = DecisionTreeRegressor()
dtr.fit(trainVal_data, trainVal_values)
np.flip(np.argsort(dtr.feature_importances_))

array([ 50, 100,  44,  51,   3,  74,   6,  42,  37,  66,  38,  40,  18,
        91,   2,  99,  41,   0,  93,  46,  96,  49,  73,  78,  36,   5,
        29,  14,  28,  88,  23,  98,  26,  52,   8,  24,  31,  95,  34,
        89,  11,  58,  76,  43,  59,   7,   9,  94,  82,  71,  86,  77,
        25,  45,   4,  69,  90,  83,  12,  72,  27,  48,  13,  22,  64,
        60,  67,  85,  92,   1,  57,  39,  75,  47,  21,  17,  55,  79,
        53,  65,  56,  35,  87,  10,  16,  15,  97,  32,  54,  62,  20,
        30,  63,  80,  33,  68,  61,  19,  81,  84,  70, 101])

In [121]:
pred = dtr.predict(test_data)
rem = Regression_evaluationMetric(test_values, pred)

print("Residual variance: {}".format(np.var(test_values-pred)))

print("Root Mean Square Error: {}".format(rem.rootMeanSquareError()))
print("R^2 score: {}".format(rem.rSquared()))

Residual variance: 0.07434363826362839
Root Mean Square Error: 0.12263888489662965
R^2 score: 0.5850970289177095


### Random Forest project class definition

In [288]:
class NumericalRandomForest_regressor: # post-pruning (kinda) with cross-validation or greedy 
    def __init__(self, n_trees):
        self.n_trees = n_trees
        self.trees = []
        self.bootSubSp_samplesIdxs = []
        self.oob_error = None
        self.feature_importances = None
        
    def fit(self, X, y, depth, minElems_perLeaf):
        n,d = X.shape
        
        self.feature_importances = {k:0 for k in range(d)}
        
        n_learn = int(n/3) # Bootstrap amount to be taken aside
        d_learn = int(np.sqrt(d)) # random subspace method amount
        
        #val_folds = 3
        #params_dict = {"depth":depths, "minElem_perLeaf":minElems_perLeaf}
        
        # Fitting the forest -----------------------------------
        for i in range(self.n_trees):
            print("Fitting #{} tree".format(i+1))
            
            bootstrap_idxs = np.sort(np.random.permutation(n)[:n_learn])
            subspace_idxs = np.sort(np.random.permutation(d)[:d_learn])
            self.bootSubSp_samplesIdxs.append((bootstrap_idxs, subspace_idxs))
                        
            dt = NumericalDecisionTree_regressor()
            
            # find the best hyp-par for the current setting (bootstrapping)
            #win_params = kFold_crossValidation_selectionGrid(val_folds, params_dict, 
            #                                                 X[~bootstrap_idxs], y[~bootstrap_idxs],
            #                                                 dt, verbose=True)
            #self.trees.append(dt.fit(X[~bootstrap_idxs], y[~bootstrap_idxs],
            #                         depth=win_params[0], minElem_perLeaf=win_params[1]))
            
            self.trees.append(dt.fit(X[~bootstrap_idxs][:,subspace_idxs], y[~bootstrap_idxs],
                                     depth=depth, minElem_perLeaf=minElems_perLeaf, pruning=True))
            
            for k,v in dt.feature_importances.items():
                self.feature_importances[subspace_idxs[k]] += v
        
        # Out-Of-Bag Estimate for the forest -------------------
        oob_errors = []
        for sampleIdx in range(n):
            missingBoot_TreesIdx = [(idx,subspace_idxs) for idx,(bootstrap_idxs,subspace_idxs) in enumerate(self.bootSubSp_samplesIdxs) 
                                    if sampleIdx not in bootstrap_idxs]

            if len(missingBoot_TreesIdx) == 0: continue
            
            regr_results = np.empty(len(missingBoot_TreesIdx)) # regression estimate of the selected trees
            for i, (missing_tree, tree_subSpace) in enumerate(missingBoot_TreesIdx):
                # reshape in order to correctly use the decision_tree.predict(...): it needs a matrix (num,dim)
                # while numpy matrix indexing returns (dim,)
                regr_results[i] = self.trees[missing_tree].predict(X[sampleIdx,tree_subSpace].reshape(1,-1))
            
            # done at this level of granularity because a sample might end up in 
            # being part of no bootstrap set of any tree (so we cannot predict wich value in y will be used)
            oob_errors.append(np.square(y[sampleIdx]-np.mean(regr_results)))
            #oob_errors.append(r2_score(np.mean(regr_results),y[sampleIdx]))
            #oob_errors.append(explained_variance_score(np.mean(regr_results),y[sampleIdx]))
            
        self.oob_error = np.sqrt(np.mean(oob_errors))
        return self
            
        
    def predict(self,X):
        if len(self.trees)==0:
            raise Exception("Trees not initialised! need to first fit the model")

        n,_ = X.shape
        results = np.empty((self.n_trees,n))
        for row, (tree,(_,subspace_idxs)) in enumerate(zip(self.trees, self.bootSubSp_samplesIdxs)):
            results[row] = tree.predict(X[:,subspace_idxs])
            
        return np.mean(results,axis=0)
    
    def print_featureImportances(self):
        print([(k,v) for k,v in sorted(self.feature_importances.items(), key=lambda kv: kv[1], reverse=True)])

In [289]:
a = np.random.normal(size=(5,6))

In [290]:
a[1,[1,2]].shape

(2,)

In [291]:
nrf = NumericalRandomForest_regressor(3)
# no train and test, cause it's a forest
nrf.fit(data, values, depth=100, minElems_perLeaf=5);
nrf.oob_error

Fitting #1 tree
Fitting #2 tree
Fitting #3 tree


0.13055868094831646

In [292]:
nrf.print_featureImportances()

[(99, 0.15566421653666093), (3, 0.13049725352514754), (57, 0.11493047410018982), (34, 0.11322429125451826), (51, 0.1124435063719215), (26, 0.10047303639577089), (23, 0.09100090823803786), (2, 0.0879900563867197), (12, 0.083404047467246), (14, 0.07836381086034548), (21, 0.07694193082487932), (47, 0.06306424837671218), (76, 0.05937313885296492), (43, 0.05530174488231513), (66, 0.049198681127203894), (77, 0.047164211684424), (73, 0.044430035564181763), (37, 0.04200230295640027), (8, 0.038138077527618365), (40, 0.026636182973730108), (52, 0.026078021628253574), (48, 0.01921726533042986), (13, 0.01698531580510679), (90, 0.014093436553970752), (28, 0.00648772489582577), (60, 0.005963099507035082), (59, 0.0034920098813538522), (53, 0.0034677762544725035), (95, 0.001961266998716618), (0, 0), (1, 0), (4, 0), (5, 0), (6, 0), (7, 0), (9, 0), (10, 0), (11, 0), (15, 0), (16, 0), (17, 0), (18, 0), (19, 0), (20, 0), (22, 0), (24, 0), (25, 0), (27, 0), (29, 0), (30, 0), (31, 0), (32, 0), (33, 0), (35,

### Random Forest SkLearn class

In [293]:
from sklearn.ensemble import RandomForestRegressor

rfr = RandomForestRegressor()
# oob error not working, need to perform evaluation on test split
rfr.fit(trainVal_data, trainVal_values.ravel())
np.flip(np.argsort(rfr.feature_importances_))



array([ 50, 100,   3,  49,  46,  44,  10,  99,  51,   2,  68,  38,  78,
        40,  65,  93,  32,  69,  83,  15,  14,  27,  84,  41,  20,  34,
        96,  66,  59,  71,  74,  94,   6,  39,  70,  92,   4,  73,  75,
        89,  28,  43,  90,  33,  61,  72,  35,  13,  30,  25,  88,  22,
        76,  47,   5,  53,  86,   1,  67,  23,  62,   8,  29,  60,  17,
         7, 101,  16,  52,  95,  45,  54,  24,  48,  56,  11,  63,  37,
        77,  12,   0,  82,  80,  26,  36,  18,  57,  55,  21,  87,  19,
        98,  97,  31,  42,  91,  64,  81,  85,   9,  79,  58])

In [294]:
pred = rfr.predict(test_data)
rem = Regression_evaluationMetric(test_values, pred)

print("Residual variance: {}".format(np.var(test_values-pred)))

print("Root Mean Square Error: {}".format(rem.rootMeanSquareError()))
print("R^2 score: {}".format(rem.rSquared()))

Residual variance: 0.06386462647128814
Root Mean Square Error: 0.09876585539075922
R^2 score: 0.7309061219268678


# Predictors

## 1. Regularised Least Squares
   

In [295]:
class tikhonov_leastSquares:
    def __init__(self, weights = None):
        self.weights = weights
        
    def fit(self, X, y, _lambda):
        inv = np.linalg.inv(np.matmul(X.T, X) + _lambda*np.eye(X.shape[1]))
        self.weights = np.matmul(inv, np.matmul(X.T, y))
        return self
    
    def predict(self, X):
        if self.weights is None:
            raise Exception("weights not initialised! need to first fit the model")
        return np.matmul(X, self.weights)

In [296]:
k = 5
params_dict = {"_lambda":[2,2.05,2.1,2.2,3]}

tls = tikhonov_leastSquares()

win_regulariser = kFold_crossValidation_selectionGrid(k, params_dict, trainVal_data, trainVal_values, tls)
tls.fit(trainVal_data, trainVal_values, win_regulariser)
pred = tls.predict(test_data)

rem = Regression_evaluationMetric(test_values, pred)

print("Residual variance: {}".format(np.var(test_values-pred)))

print("Root Mean Square Error: {}".format(rem.rootMeanSquareError()))
print("R^2 score: {}".format(rem.rSquared()))

Residual variance: 0.010685591460581137
Root Mean Square Error: 0.10352228359071869
R^2 score: 0.704363640444624


## 2. Random Forest

In [299]:
rf = NumericalRandomForest_regressor(100)
rf.fit(trainVal_data, trainVal_values, depth=100, minElems_perLeaf=10);

pred = rf.predict(test_data)
rem = Regression_evaluationMetric(test_values, pred)

print("Residual variance: {}".format(np.var(test_values-pred)))

print("Root Mean Square Error: {}".format(rem.rootMeanSquareError()))
print("R^2 score: {}".format(rem.rSquared()))

Fitting #1 tree
Fitting #2 tree
Fitting #3 tree
Fitting #4 tree
Fitting #5 tree
Fitting #6 tree
Fitting #7 tree
Fitting #8 tree
Fitting #9 tree
Fitting #10 tree
Fitting #11 tree
Fitting #12 tree
Fitting #13 tree
Fitting #14 tree
Fitting #15 tree
Fitting #16 tree
Fitting #17 tree
Fitting #18 tree
Fitting #19 tree
Fitting #20 tree
Fitting #21 tree
Fitting #22 tree
Fitting #23 tree
Fitting #24 tree
Fitting #25 tree
Fitting #26 tree
Fitting #27 tree
Fitting #28 tree
Fitting #29 tree
Fitting #30 tree
Fitting #31 tree
Fitting #32 tree
Fitting #33 tree
Fitting #34 tree
Fitting #35 tree
Fitting #36 tree
Fitting #37 tree
Fitting #38 tree
Fitting #39 tree
Fitting #40 tree
Fitting #41 tree
Fitting #42 tree
Fitting #43 tree
Fitting #44 tree
Fitting #45 tree
Fitting #46 tree
Fitting #47 tree
Fitting #48 tree
Fitting #49 tree
Fitting #50 tree
Fitting #51 tree
Fitting #52 tree
Fitting #53 tree
Fitting #54 tree
Fitting #55 tree
Fitting #56 tree
Fitting #57 tree
Fitting #58 tree
Fitting #59 tree
Fittin

In [300]:
rf.oob_error

0.09992977321704354

In [301]:
rf.print_featureImportances()

[(30, 1.2178680227941372), (39, 0.6701647017143142), (49, 0.6659491660238277), (44, 0.6382392500428182), (50, 0.6225635842194808), (62, 0.6218716434568194), (100, 0.5572697943487023), (71, 0.5560475815877705), (27, 0.509087921641871), (34, 0.5068620548994166), (45, 0.49411111055825846), (43, 0.4917319479706463), (91, 0.43180366015995286), (40, 0.4313690956894794), (67, 0.41333198252668646), (73, 0.4123353935077691), (0, 0.3881577499798254), (2, 0.381733763164473), (51, 0.37671500836115956), (29, 0.37548844897422123), (14, 0.37230806386790255), (68, 0.35467432904184865), (12, 0.32640732654711296), (3, 0.322538820036979), (63, 0.32083296700478364), (74, 0.31307657002969724), (99, 0.29823137874641725), (94, 0.2760106153577257), (69, 0.27337023063383514), (28, 0.26955794730674826), (78, 0.2621264762543446), (93, 0.2533951070725696), (26, 0.25077491854916667), (19, 0.24619182029119308), (54, 0.24183945314492106), (21, 0.2395214577318557), (53, 0.23637711194631805), (88, 0.23190499277464596)

## 3. SVM

In [183]:
class linear_SupportVector_regression:
    def __init__(self, weight=None, alpha=None, bias=None):
        self.x = alpha
        self.w = weight
        self.bias = bias
        self.Nabla = None
                
    def SMO2_ab(self, n, H, f, a, LB, UB, maxiter, eps, alpha_s):
        """
        % min_{x} .5 x H x + f' x 
        %         LB <= x <= UB
        %         a' x = b
        % n         grandezza problema length(x)
        % maxiter   max num it
        % eps       precisione
        % alpha_s   punto di inizio valido per x
        % Nabla     ....
        % err       flag di ok
        % x         valore della soluzione ottima
        % bias      ....
        """
        self.x = alpha_s
        self.Nabla = f
        for i in range(n):
            if (self.x[i] != 0.0):
                for j in range(n):
                    self.Nabla[j] += H[j,i] * self.x[i]
        iter_ = 0
        while True:
            minF_up = float("inf");
            maxF_low = float("-inf");
            for i in range(n): 
                F_i = self.Nabla[i]/a[i]
                if (LB[i] < self.x[i]) and (self.x[i] < UB[i]) :
                    if (minF_up > F_i):
                        minF_up = F_i
                        u = i
                    if (maxF_low < F_i):
                        maxF_low = F_i
                        v = i
                elif (((a[i] > 0) and (self.x[i] == LB[i])) or ((a[i] < 0) and (self.x[i] == UB[i]))) : 
                    if (minF_up > F_i):
                        minF_up = F_i
                        u = i
                elif (((a[i] > 0) and (self.x[i] == UB[i])) or ((a[i] < 0) and (self.x[i] == LB[i]))) : 
                    if (maxF_low < F_i):
                        maxF_low = F_i
                        v = i
            if ((maxF_low - minF_up) <= eps):
                err = 0.0
                break

            iter_ += 1
            if (iter_ >= maxiter):
                err = 1.0
                break

            if (a[u] > 0):
                tau_lb = (LB[u]-self.x[u])*a[u] 
                tau_ub = (UB[u]-self.x[u])*a[u] 
            else:
                tau_ub = (LB[u]-self.x[u])*a[u] 
                tau_lb = (UB[u]-self.x[u])*a[u]

            if (a[v] > 0):
                tau_lb = max(tau_lb,(self.x[v]-UB[v])*a[v]) 
                tau_ub = min(tau_ub,(self.x[v]-LB[v])*a[v]) 
            else:
                tau_lb = max(tau_lb,(self.x[v]-LB[v])*a[v]) 
                tau_ub = min(tau_ub,(self.x[v]-UB[v])*a[v])

            tau = (self.Nabla[v]/a[v]-self.Nabla[u]/a[u])/(H[u,u]/(a[u]*a[u])
                                                           +H[v,v]/(a[v]*a[v])
                                                           -2*H[v,u]/(a[u]*a[v]))
            tau = min(max(tau,tau_lb),tau_ub)
            self.x[u] += tau/a[u]
            self.x[v] -= tau/a[v]

            for i in range(n):
                self.Nabla[i] += H[u,i]*tau/a[u] - H[v,i]*tau/a[v]

        tsv = 0
        self.bias = 0.0

        for k in range(n):
            if ((self.x[k] > LB[k]) and (self.x[k] < UB[k])):
                self.bias -= self.Nabla[k]/a[k]
                tsv += 1

        if (tsv > 0):
            self.bias /= tsv
        else:    
            self.bias = -(maxF_low + minF_up)/2.0

        return err
    
    def fit(self, X, y, C):
        n = X.shape[0]
        cov = np.matmul(X, X.T)
        Q = np.matmul(np.matmul(np.diag(y.flatten()), cov),
                      np.diag(y.flatten()))
        
        if self.SMO2_ab(n,Q,-np.ones(n),y.flatten(),
                   np.zeros(n),C*np.ones(n),10000000,.0001,np.zeros(n)):
            print("Problem in SMO")
            
        self.w = np.matmul(np.matmul(X.T, np.diag(y.flatten())),
                           self.x)
        
        return self
    
    def predict(self, X):
        return np.matmul(X, self.w) + self.bias

In [184]:
lsvr = linear_SupportVector_regression()
lsvr.fit(trainVal_data, trainVal_values, C=1.0);

pred = lsvr.predict(test_data)
rem = Regression_evaluationMetric(test_values, pred)

print("Residual variance: {}".format(np.var(test_values-pred)))

print("Root Mean Square Error: {}".format(rem.rootMeanSquareError()))
print("R^2 score: {}".format(rem.rSquared()))

Residual variance: 5.944963947364811
Root Mean Square Error: 9.099746716634384
R^2 score: -2283.277002086332


In [185]:
np.where(lsvr.x)

(array([  63,   84,   95,   98,  109,  110,  148,  230,  278,  299,  356,
         364,  367,  404,  435,  482,  536,  564,  580,  598,  609,  622,
         684,  716,  811,  832,  833,  900,  909,  993, 1004, 1020, 1091,
        1097, 1113, 1115, 1188, 1204, 1255, 1303, 1318, 1433, 1449, 1479,
        1493]),)

In [186]:
from sklearn.svm import SVR
svr = SVR(kernel="linear", tol=.0001, C=1)
svr.fit(trainVal_data, trainVal_values.flatten());

In [187]:
np.where(svr.dual_coef_)[1]

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

In [188]:
pred = svr.predict(test_data)
rem = Regression_evaluationMetric(test_values, pred)

print("Residual variance: {}".format(np.var(test_values-pred)))

print("Root Mean Square Error: {}".format(rem.rootMeanSquareError()))
print("R^2 score: {}".format(rem.rSquared()))

Residual variance: 0.07147236509248299
Root Mean Square Error: 0.11212491216741184
R^2 score: 0.6531877808500957
