In [20]:
from sklearn.datasets import load_boston
from sklearn.linear_model import (LinearRegression, Ridge, 
                                  Lasso, RandomizedLasso)
from sklearn.feature_selection import RFE, f_regression
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
import sklearn
import numpy as np
import pandas as pd

In [10]:
np.random.seed(0)
 
size = 750
X = np.random.uniform(0, 1, (size, 14))
 
#"Friedamn #1” regression problem
Y = (10 * np.sin(np.pi*X[:,0]*X[:,1]) + 20*(X[:,2] - .5)**2 +
     10*X[:,3] + 5*X[:,4] + np.random.normal(0,1))
#Add 3 additional correlated variables (correlated with X1-X3)
X[:,10:] = X[:,:4] + np.random.normal(0, .025, (size,4))
 
names = ["x%s" % i for i in range(1,15)]
X.shape

(750, 14)

In [5]:
x_train,x_val,x_test = X[:400],X[400:600],X[600:750]
y_train,y_val,y_test = Y[:400],Y[400:600],Y[600:750]

In [6]:
rf = RandomForestRegressor()
rf.fit(x_train,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [7]:
def get_scores(model,x_test,y_test):
    '''
    Takes in x_test and y_test and produces precision / recall scores for the random forest
    
    Parameters:
    -----------
    x_test, y_test -- numpy arrays
    
    Returns: 
    --------
    precision_score, recall_score -- int
    '''

    forest_test_pred = model.predict(x_test)    
    
    precision_score = sklearn.metrics.precision_score(y_test, forest_test_pred)
    recall_score = sklearn.metrics.recall_score(y_test, forest_test_pred)
    y_scores_forest = model.predict_proba(x_test)[:,1]
    
    roc_score = sklearn.metrics.roc_auc_score(y_test, y_scores_forest)
    return precision_score, recall_score, roc_score

def generate_feature_scores(model, base_precision_score, base_recall_score, base_roc_score, as_percentage=False):
    scores = []
    
    for i in range(30):
        for i in range(x_test.shape[1]):
            x_t = x_test.copy()
            np.random.shuffle(x_t[:,i])
            shuff_precision_score, shuff_recall_score, shuff_roc_score = get_scores(model, x_t,y_test)
            
            recall_score = base_recall_score - shuff_recall_score
            precision_score = base_precision_score - shuff_precision_score
            roc_score = base_roc_score - shuff_roc_score
            
            if as_percentage:
                recall_score = recall_score / base_recall_score
                precision_score = precision_score / base_precision_score
                roc_score = roc_score / base_roc_score
            
            
            scores.append({'feature':ordered_feature_names[i],'recall_score':recall_score,'precision_score':precision_score, 'roc_score':roc_score})
        print((i/30)*100)
    df = pd.DataFrame(scores)
    return df 


In [27]:
def get_regression_scores(model,x_test,y_test):
    '''
    Takes in x_test and y_test and produces precision / recall scores for the random forest
    
    Parameters:
    -----------
    x_test, y_test -- numpy arrays
    
    Returns: 
    --------
    accuracy -- int
    '''

    
    return sklearn.metrics.r2_score(y_test, rf.predict(x_test))

def generate_regression_feature_scores(model, ordered_feature_names,base_accuracy, as_percentage=False):
    scores = []
    
    for i in range(30):
        for i in range(x_test.shape[1]):
            x_t = x_test.copy()
            np.random.shuffle(x_t[:,i])
            shuff_acc = get_regression_scores(model, x_t,y_test)
            
            accuracy_score = base_accuracy - shuff_acc
            
            if as_percentage:
                accuracy_score = accuracy_score / base_accuracy
            
            
            scores.append({'feature':ordered_feature_names[i],'accuracy':accuracy_score})
    df = pd.DataFrame(scores)
    return df 



In [28]:
#print(y_test)
forest_test_pred = rf.predict(x_test)  
#print(forest_test_pred)

base_accuracy_score = get_regression_scores(rf,x_test,y_test)
print("Base Model: Accuracy {0}".format(base_accuracy_score))

important_features = pd.Series(data=rf.feature_importances_,index=names)
important_features.sort_values(ascending=False,inplace=True)

print ('---------------------------------------------------')
print(important_features.head(10))
    
print('---------------------------------------------------')
df = generate_regression_feature_scores(rf, names, base_accuracy_score, as_percentage = False)
df = df.groupby('feature').mean()
print(df.sort_values(by='accuracy',ascending=False).head(10))

Base Model: Accuracy 0.8278109824639719
---------------------------------------------------
x4     0.258157
x2     0.197320
x11    0.161981
x12    0.083428
x14    0.068744
x1     0.057981
x5     0.056410
x3     0.028748
x13    0.023012
x7     0.016375
dtype: float64
---------------------------------------------------
         accuracy
feature          
x4       0.349530
x2       0.244435
x11      0.231150
x5       0.084688
x14      0.051676
x1       0.046133
x12      0.042515
x13      0.029588
x3       0.026349
x8       0.004360
