In [187]:

import numpy as np

from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier

from sklearn.metrics import accuracy_score, mean_squared_error


class MyRandomForestClassifier:
    def __init__(self, n_estimators=250, criterion="gini",max_depth=None,min_samples_split=2,min_samples_leaf=1,min_weight_fraction_leaf=0.,max_features="auto",max_leaf_nodes=None,min_impurity_decrease=0.0,min_impurity_split=None,bootstrap=True,oob_score=True,n_jobs=None,random_state=None,verbose=0,warm_start=False,class_weight=None):
        self.n_estimators = n_estimators
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_features = max_features
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_decrease = min_impurity_decrease
        self.min_impurity_split = min_impurity_split
         
    
    def fit(self, X, y):
        self.n_classes = len(np.unique(y))
        self.n_samples = X.shape[0]
        index_set = set(range(self.n_samples))
        self.bootstrapped_indices = [np.random.choice(self.n_samples, size=self.n_samples) for _ in range(self.n_estimators)]
        self.oob_indices = [list(index_set - set(b)) for b in self.bootstrapped_indices]
        self.estimators = [DecisionTreeClassifier("criterion", "max_depth", "min_samples_split","min_samples_leaf", "min_weight_fraction_leaf","max_features", "max_leaf_nodes","min_impurity_decrease", "min_impurity_split","random_state").fit(X[b],y[b]) for b in self.bootstrapped_indices]

    def predict_proba(self, X):
        class_probs = np.zeros((X.shape[0], self.n_classes))
        for tree in self.estimators:
            class_probs += tree.predict_proba(X)
        return class_probs / self.n_estimators
    
    def predict(self, X):
        return self.predict_proba(X).argmax(axis=1)

# class MyRandomForestRegressor:
#     def __init__(self, n_trees=50, max_features='sqrt', **kwargs):
#         self.n_trees = n_trees
#         self.params = kwargs
#         self.params['max_features'] = max_features
    
#     def fit(self, X, y):
#         '''X & y must be numpy arrays'''
#         self.n_classes = len(np.unique(y))
#         self.n_samples = X.shape[0]
#      # generate bootstrap sample indices
#         index_set = set(range(self.n_samples))
#         self.bootstrapped_indices = [np.random.choice(self.n_samples, size=self.n_samples) for _ in range(self.n_trees)]
#         self.oob_indices = [list(index_set - set(b)) for b in self.bootstrapped_indices]

#        # fit all the trees!
#         self.estimators = [DecisionTreeRegressor(**self.params).fit(X[b],y[b]) for b in self.bootstrapped_indices]

#     def predict(self, X):
#         return sum(tree.predict(X) for tree in self.estimators)/self.n_trees



def shuffle_column(X, feature_index):
    X_new = X.copy()
    np.random.shuffle(X_new[:,feature_index])
    return X_new   

def permutation_importance(model, X_test, y_test, scorer):
    feat_importances = np.zeros(X_test.shape[1])
    test_score = scorer(model.predict(X_test), y_test)
    for i in range(X_test.shape[1]):
        X_test_shuffled = shuffle_column(X_test, i)
        test_score_permuted = abs(scorer(y_test, model.predict(X_test_shuffled)))
        feat_importances[i] = test_score - test_score_permuted
    return feat_importances

def my_oob_permutation_importance(my_rf_model, X, y, scorer):
    feat_importances = np.zeros(X.shape[1])
    for oob, est in zip(my_rf_model.oob_indices, my_rf_model.estimators):
        feat_importances += permutation_importance(est, X[oob], y[oob], scorer=scorer)
    for i in range(X.shape[1]):
        return feat_importances / my_rf_model.n_trees


  



In [170]:
# from abc import ABCMeta, abstractmethod
# from sklearn.tree import DecisionTreeClassifier
# class MyRandomForestClassifier:
#     def __init__(self, n_estimators='warn',criterion="gini",max_depth=None,min_samples_split=2,min_samples_leaf=1, min_weight_fraction_leaf=0.0,max_features="auto",max_leaf_nodes=None,min_impurity_decrease=0.0,min_impurity_split=None,bootstrap=True,oob_score=False,n_jobs=None,random_state=None,verbose=0,warm_start=False,class_weight=None):
#         #super(MyRandomForestClassifier, self).__init__(base_estimator=DecisionTreeClassifier(),n_estimators=n_estimators,estimator_params=("criterion", "max_depth", "min_samples_split","min_samples_leaf", "min_weight_fraction_leaf","max_features", "max_leaf_nodes","min_impurity_decrease", "min_impurity_split","random_state"),bootstrap=bootstrap,oob_score=oob_score,n_jobs=n_jobs,random_state=random_state,verbose=verbose,warm_start=warm_start,class_weight=class_weight)
#         self.n_estimators = n_estimators
#         self.criterion = criterion
#         self.max_depth = max_depth
#         self.min_samples_split = min_samples_split
#         self.min_samples_leaf = min_samples_leaf
#         self.min_weight_fraction_leaf = min_weight_fraction_leaf
#         self.max_features = max_features
#         self.max_leaf_nodes = max_leaf_nodes
#         self.min_impurity_decrease = min_impurity_decrease
#         self.min_impurity_split = min_impurity_split


        
#     def fit(self, X, y):
#         self.n_classes = len(np.unique(y))
#         self.n_samples = X.shape[0]
#         index_set = set(range(self.n_samples))
#         self.bootstrapped_indices = [np.random.choice(self.n_samples, size=self.n_samples) for _ in range(self.n_estimators)]
#         self.oob_indices = [list(index_set - set(b)) for b in self.bootstrapped_indices]
#         self.estimators = [DecisionTreeClassifier("criterion", "max_depth", "min_samples_split","min_samples_leaf", "min_weight_fraction_leaf","max_features", "max_leaf_nodes","min_impurity_decrease", "min_impurity_split","random_state").fit(X[b],y[b]) for b in self.bootstrapped_indices]

#     def predict_proba(self, X):
#         class_probs = np.zeros((X.shape[0], self.n_classes))
#         for tree in self.estimators:
#             class_probs += tree.predict_proba(X)
#         return class_probs / self.n_estimators
    
#     def predict(self, X):
#         return self.predict_proba(X).argmax(axis=1)


In [188]:
import pandas as pd
df=pd.read_csv("/Users/ralagianambi/Desktop/df_aws_version1.csv")


In [189]:
df.head()

Unnamed: 0,failure,attribute1,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute8,attribute9,month,device_short_S1F0,device_short_S1F1,device_short_W1F0,device_short_W1F1,device_short_Z1F0,device_short_Z1F1,device_short_Z1F2
0,0,2,56,0,52,6,2,0,0,7,1,1,0,0,0,0,0,0
1,0,1,0,3,0,6,2,0,0,0,1,1,0,0,0,0,0,0
2,0,2,0,0,0,12,1,0,0,0,1,1,0,0,0,0,0,0
3,0,1,0,0,0,6,2,0,0,0,1,1,0,0,0,0,0,0
4,0,2,0,0,0,15,2,0,0,3,1,1,0,0,0,0,0,0


In [190]:
y=df.failure.values
df.drop("failure", axis=1, inplace=True)

In [191]:
df.columns[0]

'attribute1'

In [192]:
X=df.values

In [193]:
for i in range(X.shape[1]):
    print(i)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16


In [194]:
rf=MyRandomForestClassifier(max_depth=5, max_features=30,min_samples_leaf=2, min_samples_split=3,n_estimators=450) 


In [195]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.33, random_state=42)

In [196]:
rf

<__main__.MyRandomForestClassifier at 0x1083a7320>

In [197]:


rf_fit=rf.fit(X_train,y_train)

ValueError: 'min_impurity_decrease' cannot be used to seed a numpy.random.RandomState instance

In [24]:
# from sklearn.metrics import log_loss
# permutation_importance(rf, X_train,y_train, log_loss)

In [23]:
# from sklearn.model_selection import KFold, cross_val_score
my_oob_permutation_importance(rf, X_train,y_train, scorer=log_loss)

(array([-1.27237731e-03, -4.23813105e-02, -6.42370301e-03, -5.36797755e-02,
        -4.61372060e-02, -1.82855658e-02, -1.27411278e-02, -1.42893829e-02,
        -1.30027522e-02, -2.38639963e-02, -1.06224256e-02, -8.02302624e-03,
        -1.18929123e-02, -6.64236095e-03, -2.88790307e-03, -3.00818262e-03,
         4.26518659e-07]), 'attribute1')