In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import r2_score, explained_variance_score

#import os
#import seaborn as sns
#import matplotlib.pyplot as plt
#%matplotlib inline

# Read Dataset

In [2]:
cmp = pd.read_csv("commViolUnnormData.txt", na_values='?')

In [3]:
# drop first non predictive features (communityname, state, countyCode, communityCode, "fold")
pred_features = cmp[cmp.columns[5:-18]]
regr_values = cmp[cmp.columns[-18:]]

# Drop features with a lot of missing values

In [4]:
print("Before dropping: {} features".format(str(pred_features.shape[1])))

#drop features that contain at least some threshold (from the total) of NaN values
cut_tresh = 0.75
to_drop = pred_features.columns[pred_features.count() < pred_features.shape[0]*cut_tresh]

pred_features = pred_features.drop(columns=to_drop)

print("After dropping: {} features".format(str(pred_features.shape[1])))

Before dropping: 124 features
After dropping: 102 features


# Imputing on features matrix

In [5]:
from collections import Counter

def value_withStrategy(v, strat):
    if strat == "mean":
        return np.mean(v)
    if strat == "median":
        return np.median(v)
    if strat == "most_frequent":
        return Counter(v).most_common(1)[0][0]
    print("Invalid imputing strategy!")
        
def imputing(df, strategy):
    nanRows, nanCols = np.where(df.isna())
    for j in nanCols:
        available = df.iloc[~nanRows, j]
        value = value_withStrategy(available, strategy)
        df.iloc[nanRows,j] = value

In [6]:
imputing(pred_features, "mean");

# Choose the Dependent Variable and drop possible missing values on it

In [7]:
def drop_sample(df, vals):
    idxRow = np.where(vals.isna())[0]
    return df.drop(index=idxRow).values, vals.drop(index=idxRow).values.reshape(-1,1)

In [8]:
data,values = drop_sample(pred_features, regr_values["robbPerPop"])

# Normalisation

In [9]:
def normalise(matrix, strat):
    for j in range(matrix.shape[1]):
        mi = np.min(matrix[:,j])
        ma = np.max(matrix[:,j])
        di = ma-mi
        if (di > 1e-6):
            if strat=="0_mean,1_std":
                matrix[:,j] = (matrix[:,j]-np.mean(matrix[:,j]))/np.std(matrix[:,j])
            elif strat=="[0,1]":
                matrix[:,j] = (matrix[:,j]-mi)/di
            elif strat=="[-1,1]":
                matrix[:,j] = 2*((matrix[:,j]-mi)/di)-1
            else:
                print("Invalid normalisation strategy!")
        else:
            matrix[:,j] = 0

In [10]:
strategy = "[-1,1]"
normalise(data,strategy)
normalise(values,strategy)

# Train-Test Split

In [11]:
n = data.shape[0]

trVl_Amount = int(n*0.7)
indexes = np.random.permutation(n)
idxTrVl = np.sort(indexes[0:trVl_Amount])
idxTs = np.sort(indexes[trVl_Amount:])

trainVal_data = data[idxTrVl]
test_data = data[idxTs]
trainVal_values = values[idxTrVl]
test_values = values[idxTs]

# Variable Selection

## 1. Matching Pursuit

### Project class definition

In [12]:
class matchingPursuit:
    def __init__(self, iterations, weights = None, indexes = None):
        self.iterations = iterations
        self.weights = weights
        self.indexes = indexes
        
    def fit(self, data_matrix, output_vect):
        residual = output_vect.copy()
        self.weights = np.zeros((data_matrix.shape[1], 1))
        self.indexes = []

        #data_2norm = np.sqrt(np.sum(np.square(data_matrix), axis=0))
        data_2norm = np.linalg.norm(data_matrix, ord=2, axis=0).reshape(1,-1)

        for i in range(self.iterations):
            projection = np.matmul(residual.T, data_matrix)
            k = np.argmax(np.divide(np.square(projection), data_2norm))
            self.indexes.append(k)

            distance = projection[0,k]/np.linalg.norm(data_matrix[:,k], ord=2)
            self.weights[k,0] += distance
            residual -= np.matmul(data_matrix, self.weights)

        return self
    
    def predict(self, X):
        if self.weights is None:
            raise Exception("weights not initialised! need to first fit the model")
        return np.matmul(X, self.weights)

In [13]:
mp = matchingPursuit(iterations=10)
mp.fit(trainVal_data, trainVal_values)
np.where(mp.weights)[0]

array([92])

In [14]:
pred = mp.predict(test_data)
print("Mean Square Error: {}".format(np.mean(np.square(test_values-pred))))
print("R^2 score: {}".format(r2_score(test_values, pred)))
print("Explained Variance Score: {}".format(explained_variance_score(test_values, pred)))

Mean Square Error: 2.163497390032889e+31
R^2 score: -4.8861325862267744e+32
Explained Variance Score: -3.8568662080651845e+29


### SkLearn Class

In [15]:
from sklearn.linear_model import orthogonal_mp
omp_coef = orthogonal_mp(trainVal_data, trainVal_values)
np.where(omp_coef)[0]

array([ 3, 11, 38, 48, 50, 74, 76, 77, 92, 94])

In [16]:
pred = np.matmul(test_data, omp_coef)
print("Mean Square Error: {}".format(np.mean(np.square(test_values-pred))))
print("R^2 score: {}".format(r2_score(test_values, pred)))
print("Explained Variance Score: {}".format(explained_variance_score(test_values, pred)))

Mean Square Error: 0.07356269296755365
R^2 score: 0.668290263545612
Explained Variance Score: 0.6693114540463263


## 2. L1 Penalty (Lasso)

### Project class definition

In [17]:
class lasso_regression:
    def __init__(self, iterations, weights=None):
        self.iterations = iterations
        self.weights = weights
        
    def fit(self, data_matrix, output_vect, _lambda):
        self.weights = np.zeros((data_matrix.shape[1],1))
        n = float(data_matrix.shape[0])
        step = n/(2*np.linalg.norm(np.matmul(data_matrix.T, data_matrix), ord=2))
        softTresh = step*_lambda

        for i in range(self.iterations):
            dist = np.matmul(data_matrix, self.weights) - output_vect
            coord_descent = (step/n)*np.matmul(data_matrix.T, dist)
            self.weights -= coord_descent

            upper = self.weights > softTresh
            lower = self.weights < -softTresh

            self.weights[upper] -= softTresh
            self.weights[lower] += softTresh
            self.weights[~upper & ~lower] = 0

        return self
    
    def predict(self, X):
        if self.weights is None:
            raise Exception("weights not initialised! need to first fit the model")
        return np.matmul(X, self.weights)

In [18]:
lr = lasso_regression(iterations=10)
lr.fit(trainVal_data, trainVal_values, 0.8)
np.where(lr.weights)[0]

array([  0,  10,  27,  49,  51,  71,  91,  92,  98, 101])

In [19]:
pred = lr.predict(test_data)
print("Mean Square Error: {}".format(np.mean(np.square(test_values-pred))))
print("R^2 score: {}".format(r2_score(test_values, pred)))
print("Explained Variance Score: {}".format(explained_variance_score(test_values, pred)))

Mean Square Error: 0.7144181261141582
R^2 score: -15.13471641925296
Explained Variance Score: 0.005766194848579209


### SkLearn Class

In [20]:
from sklearn.linear_model import Lasso

lasso = Lasso(alpha=0.005)
lasso.fit(trainVal_data, trainVal_values)
np.where(lasso.coef_)[0]

array([  3,  11,  38,  44,  50,  69,  74,  76,  94, 100])

In [21]:
pred = lasso.predict(test_data)
print("Mean Square Error: {}".format(np.mean(np.square(test_values-pred))))
print("R^2 score: {}".format(r2_score(test_values, pred)))
print("Explained Variance Score: {}".format(explained_variance_score(test_values, pred)))

Mean Square Error: 0.06650733267764729
R^2 score: 0.6407210418367291
Explained Variance Score: 0.641124679881149


## 3. Random Forest

### Decision Tree project class definition

In [22]:
class NumericalDecisionTree_regression:
    class Node:
        def __init__(self, value, isLeaf=False, feature=None, left=None, right=None):
            self.value = value
            self.isLeaf = isLeaf
            self.feature = feature
            self.left = left
            self.right = right

        def print_tree(self):
            if self.left: self.left.print_tree()
            print("Feature: {}, cut: {}\n".format(self.feature, self.value))
            if self.right: self.right.print_tree()

        def print_tree_indented(self, level=0):
            if self.right: self.right.print_tree_indented(level+1)
            print('\t'*level+"{} => {}".format(self.feature, self.value))
            if self.left: self.left.print_tree_indented(level+1)
            
    def __init__(self):
        self.root = None
        
    def fit(self, X, y, depth, minElem_perLeaf):        
        self.root = self.learn(X, y, depth, minElem_perLeaf)
        return self
        
    def learn(self, X, y, depth, minElem_perLeaf):      
        n, d = X.shape

        if depth==0 or n<=minElem_perLeaf: #or other condition
            return self.Node(value=np.mean(y), isLeaf=True)
            
        best_costDescent = 0 #split that maximise the error descent

        for i1 in range(d):
            sorted_idx = np.argsort(X[:,i1])
            sorted_x, sorted_y = X[sorted_idx,i1], y[sorted_idx]

            s_right, s_left = np.sum(sorted_y), 0
            n_right, n_left = n, 0

            for i2 in range(n-1):
                s_left += sorted_y[i2]
                s_right -= sorted_y[i2]
                n_left += 1
                n_right -= 1

                if sorted_x[i2]<sorted_x[i2+1]:
                    new_costDescent = (s_left**2)/n_left + (s_right**2)/n_right
                    if new_costDescent > best_costDescent:
                        best_costDescent = new_costDescent
                        best_feature = i1
                        best_cut = (sorted_x[i2]+sorted_x[i2+1])/2

        left_idxs = X[:,best_feature] < best_cut

        return self.Node(value=best_cut, feature=best_feature,
                        left=self.learn(X[left_idxs],y[left_idxs],depth-1,minElem_perLeaf),
                        right=self.learn(X[~left_idxs],y[~left_idxs],depth-1,minElem_perLeaf))
    
    def predict(self, X):
        n = X.shape[0]
        y = np.zeros(n)
        
        for i in range(n):
            current = self.root
            while not current.isLeaf:
                if X[i,current.feature] < current.value:
                    current = current.left
                else:
                    current = current.right
                
            y[i] = current.value
        
        return y
                
    def pprint(self):
        self.root.print_tree_indented()

In [23]:
dt = NumericalDecisionTree_regression()
dt.fit(trainVal_data, trainVal_values, depth=3, minElem_perLeaf=1)
dt.pprint()

			None => 0.5287888357411749
		100 => -0.3436407141542426
			None => -0.1669125969718251
	50 => 0.026325411334552018
			None => -0.5177077695660993
		100 => -0.9379716547027425
			None => -0.768165576743706
50 => -0.6329067641681901
			None => -0.8224806053982927
		73 => -0.7151827200803716
			None => -0.019698515544602224
	50 => -0.7736745886654479
			None => -0.8384565441845723
		51 => -0.9924216637196692
			None => -0.9490712734109973


In [24]:
pred = dt.predict(test_data)
print("Mean Square Error: {}".format(np.mean(np.square(test_values-pred))))
print("R^2 score: {}".format(r2_score(test_values, pred)))
print("Explained Variance Score: {}".format(explained_variance_score(test_values, pred)))

Mean Square Error: 0.07585356691562414
R^2 score: 0.531303436542665
Explained Variance Score: 0.5313166120306662


### Decision Tree SkLearn Class

In [25]:
from sklearn.tree import DecisionTreeRegressor

dtr = DecisionTreeRegressor()
dtr.fit(trainVal_data, trainVal_values)
np.flip(np.argsort(dtr.feature_importances_))

array([ 50, 100,   3,  66,  92,   2,  51,  71,  52,  88,  10,  26,  46,
        99,  37,  58,   6,  86,   7,  34,   0,  93,  40,  42,  56,  18,
        27,  87,  43,  35,  55,  73,  38,  39,  75,  83,  15,  48,  96,
        17,  69,  61,  25,  64,  91,  76,  97,  23,  44,  13,  19,  89,
        22,  45,  36,  95,   8,  32,  74,  90,  57,   9,  21,  94,  84,
         5,  41,  78,  68,  82,  29,  16,  49,  65,  14, 101,  24,  77,
        47,  11,  80,  28,  98,  31,  33,  60,   4,  72,  54,  67,  70,
         1,  59,  79,  12,  20,  30,  63,  62,  53,  85,  81])

In [26]:
pred = dtr.predict(test_data)
print("Mean Square Error: {}".format(np.mean(np.square(test_values-pred))))
print("R^2 score: {}".format(r2_score(test_values, pred)))
print("Explained Variance Score: {}".format(explained_variance_score(test_values, pred)))

Mean Square Error: 0.08746174867533872
R^2 score: 0.6186693471494426
Explained Variance Score: 0.619272778810366


### Random Forest project class definition

### Random Forest SkLearn class

In [27]:
from sklearn.ensemble import RandomForestRegressor

rfr = RandomForestRegressor()
rfr.fit(trainVal_data, trainVal_values.ravel())
np.flip(np.argsort(rfr.feature_importances_))



array([ 50, 100,  49,  51,   3,  46,  92,  99,   2,  44,  74,  40,  14,
         4,  78,  41,  59,  32,  63,  93,  71,   6,  17,  96,  48,  69,
        15,  13,  37,  31,   1,  36,  54,  61,  30,  73,   0,  72,  66,
        27,  98,  58,  35,  25,  24,  10,  65,   7,  67,  88,  89,  18,
        20,   8,  91,  23,  77,  90,  39,  80,  97,  28,  38,  21,  60,
        47,  22,  43,  82,  42,  26,  62,   5,  94,   9,  52,  87,  33,
        57,  68,  16, 101,  76,  84,  56,  45,  95,  75,  53,  34,  86,
        29,  19,  11,  55,  79,  64,  83,  85,  12,  81,  70])

In [28]:
pred = rfr.predict(test_data)
print("Mean Square Error: {}".format(np.mean(np.square(test_values-pred))))
print("R^2 score: {}".format(r2_score(test_values, pred)))
print("Explained Variance Score: {}".format(explained_variance_score(test_values, pred)))

Mean Square Error: 0.07471129166374027
R^2 score: 0.7314293001541479
Explained Variance Score: 0.731829832097239


# Predictors

## 0. K-fold Cross Validation

In [29]:
from itertools import product

def crossValidation_selectionGrid(k, parameters_dict, train_data, train_values, predictor):
    nVal = train_data.shape[0]
    
    # Validation indexes adjustment
    elemPerFold, remainder = np.divmod(nVal,k)
    valIdxList = []
    start = 0

    for i in range(k):
        end = start+elemPerFold+int(remainder>0)
        valIdxList.append(np.arange(start,end)) 
        remainder -= 1
        start = end
    
    # Cross validation
    params_names = parameters_dict.keys()
    params_product = list(product(*parameters_dict.values()))
    val_results = np.empty((len(valIdxList),len(params_product)))
    
    for rowVal, valIdx in enumerate(valIdxList):
        for colVal, params in enumerate(params_product):
                     
            arg_dict = {k:v for k,v in zip(params_names,params)}
            
            predictor.fit(train_data[~valIdx], train_values[~valIdx], **arg_dict)
            pred = predictor.predict(train_data[valIdx])
            
            #val_results[rowVal,colVal] = r2_score(trainVal_values[valIdx],pred)
            #val_results[rowVal,colVal] = explained_variance_score(trainVal_values[valIdx],pred)
            val_results[rowVal,colVal] = np.mean(np.square(train_values[valIdx]-pred))
            
    selected = np.argmin(val_results.mean(axis=0))
    return params_product[selected]

## 1. Regularised Least Squares
   

In [30]:
class tikhonov_leastSquares:
    def __init__(self, weights = None):
        self.weights = weights
        
    def fit(self, X, y, _lambda):
        inv = np.linalg.inv(np.matmul(X.T, X) + _lambda*np.eye(X.shape[1]))
        self.weights = np.matmul(inv, np.matmul(X.T, y))
        return self
    
    def predict(self, X):
        if self.weights is None:
            raise Exception("weights not initialised! need to first fit the model")
        return np.matmul(X, self.weights)

In [31]:
k = 5
params_dict = {"_lambda":[2,2.05,2.1,2.2,3]}

tls = tikhonov_leastSquares()

win_regulariser = crossValidation_selectionGrid(k, params_dict, trainVal_data, trainVal_values, tls)
tls.fit(trainVal_data, trainVal_values, win_regulariser)
pred = tls.predict(test_data)

print("Mean Square Error: {}".format(np.mean(np.square(test_values-pred))))
print("R^2 score: {}".format(r2_score(test_values, pred)))
print("Explained Variance Score: {}".format(explained_variance_score(test_values, pred)))

Mean Square Error: 0.01245992452487423
R^2 score: 0.7185998766470033
Explained Variance Score: 0.7193535535276633


## 2. Random Forest

In [32]:
k = 3 
params_dict = {"depth":[10,15,20,30],"minElem_perLeaf":[5,10,20,30]}

dt = NumericalDecisionTree_regression()
win_params = crossValidation_selectionGrid(k, params_dict, trainVal_data, trainVal_values, dt)
dt.fit(trainVal_data, trainVal_values, win_regulariser, depth=win_params[0], minElem_perLeaf=win_params[1])
pred = dt.predict(test_data)
print("Mean Square Error: {}".format(np.mean(np.square(test_values-pred))))
print("R^2 score: {}".format(r2_score(test_values, pred)))
print("Explained Variance Score: {}".format(explained_variance_score(test_values, pred)))

TypeError: fit() got multiple values for argument 'depth'