In [1]:
import pandas as pd
import numpy as np

from collections import Counter
from sklearn.preprocessing import label_binarize, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble.forest import _generate_unsampled_indices
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import f1_score, confusion_matrix
from imblearn.over_sampling import SMOTE



In [2]:
n_classes = 3

data = pd.read_csv('../data/features_vec_all.csv')

feature_names = ['num_pre', 'num_post', 'mean_time', 'len_description',
       'num_movies', 'num_images', 'is_action', 'is_adventure',
       'is_casual', 'is_mmo', 'is_racing', 'is_rpg', 'is_simulation',
       'is_sports', 'is_strategy']

X = data[feature_names]

# log transform on the data and rescale
for i in X.index:
    X.at[i, 'num_pre'] = np.log(1 + 100*X.at[i, 'num_pre'])
    X.at[i,'num_post'] = np.log(1 + 200*X.at[i,'num_post'])
    X.at[i,'mean_time'] = np.log(1 + 200*X.at[i,'mean_time'])
    X.at[i,'len_description'] = np.log(1+X.at[i,'len_description'])

X = X.values
Y = data['sentiment'].values

# rescale only the continuous variables by (x- mu) / std
scaler = StandardScaler(copy=False)
scaler.fit_transform(X[:,0:5])

array([[ 1.44021359,  1.6902414 , -0.02933151, -0.98009167, -0.48452405],
       [ 1.66325794, -0.07596626,  0.17343286,  1.03378327, -0.48452405],
       [-0.81503802, -1.30951611, -4.73423637,  0.52779312, -0.48452405],
       ...,
       [ 1.78386201, -0.91052048, -0.73385516,  0.48007219,  2.37872215],
       [ 0.89260491,  0.45522693, -0.63960993,  1.19740062, -0.48452405],
       [ 1.11499354, -0.22785182, -0.07240274,  0.92677964,  1.42430675]])

In [3]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, stratify=Y, test_size=0.3)

X_resampled, Y_resampled = SMOTE().fit_resample(X_train, Y_train)

rf = RandomForestClassifier(n_estimators=10,
                            max_depth=6)

In [6]:
rf.fit(X_resampled, Y_resampled)

Y_pred = rf.predict(X_resampled)
print('confusion matrix on training data')
print(confusion_matrix(Y_resampled, Y_pred))
print(f1_score(Y_resampled, Y_pred, average='macro'))

Y_pred = rf.predict(X_test)

print('confusion matrix on test data')
print(confusion_matrix(Y_test, Y_pred))
print(f1_score(Y_test, Y_pred, average='macro'))


confusion matrix on training data
[[4186  279  629]
 [1643 1410 2041]
 [ 493  607 3994]]
0.5981068946853596
confusion matrix on test data
[[  36   47  112]
 [ 126  213  590]
 [ 212  250 1721]]
0.3899542833348451


In [32]:
def f1(rf, X, Y):
    
    classes = {0:-1, 1:0, 2:1}
    
    n_samples = len(X)
    n_classes = len(np.unique(Y))
    
    for tree in rf.estimators_:
        unsampled_indices = _generate_unsampled_indices(tree.random_state, n_samples)
        tree_preds = tree.predict(X[unsampled_indices, :])

    # need to relabel because the predict method produces class labels (0, 1, 2)
    Y_preds = [classes[tree_preds[i]] for i in range(len(tree_preds))]
        
    return f1_score(Y_preds, Y[unsampled_indices], average='macro')
        

# X, Y are training data/target
def permutation_importance(rf, X, Y, metric):
    
    if not hasattr(rf, 'estimators_'):
        rf.fit(X, Y)
    
    baseline_score = metric(rf, X, Y)
    importances = []
    
    num_cols = X.shape[1] # number of columns/features
    
    X_train = X.copy()
    
    for k in range(num_cols):
        save = X_train[:,k:k+1].copy()
        X_train[:,k:k+1] = np.random.permutation(X_train[:,k:k+1]) # permute value of this
        
        permuted_score = metric(rf, X_train, Y)
        X_train[:,k:k+1] = save
        
        importances.append(baseline_score - permuted_score)
    
    return importances

In [33]:
imp = permutation_importance(rf, X_resampled, Y_resampled, f1)
print('permutation importance ={} +/-{}'.format(np.mean(imp), np.std(imp)))
print(imp)

permutation importance =0.014397522498943941 +/-0.016817341826894483
[0.009503373250978553, 0.049537309251046435, 0.009478763677066349, 0.0005967343138573167, 0.004322480217065328, 0.017245943281857967, 0.009034151035248572, 0.012238645697513262, 0.03524830120367689, 0.0, 0.0, 0.006721092081611202, 0.009133614617474473, 0.0, 0.05290242885676277]


In [17]:
from eli5.sklearn import PermutationImportance
from eli5.permutation_importance import get_score_importances
from sklearn.metrics import accuracy_score

In [20]:
A = PermutationImportance(rf, scoring='f1_macro').fit(X_test, Y_test)
A.feature_importances_

array([ 0.00066194,  0.01007265,  0.00035529, -0.00053539,  0.00285996,
       -0.00352   ,  0.01365503,  0.00801022,  0.00631834,  0.00577723,
        0.00065584,  0.00054276,  0.01931771, -0.00106626,  0.01529574])