In [159]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score
import numpy as np
import scipy.sparse
import collections
# Training split: feature matrix from a SciPy sparse .npz archive, labels from a NumPy .npy file.
X_train, y_train = scipy.sparse.load_npz("X_train.npz"), np.load("y_train.npy")
# Test split, stored the same way; used below only for prediction and scoring.
X_test, y_test = scipy.sparse.load_npz("X_test.npz"), np.load("y_test.npy")

In [160]:
# Baseline: a 100-tree random forest fitted on the training data as-is, with no
# class weighting or resampling applied.
# random_state pins the forest's bootstrap draws and feature sub-sampling so that
# re-running the cell reproduces the same predictions; the output recorded under
# this cell came from an unseeded run and may shift slightly on re-execution.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rfPred = rf.predict(X_test)
# How many test rows were assigned to each class.
print(collections.Counter(rfPred))
# Balanced accuracy (mean of per-class recall) on the test split.
print(balanced_accuracy_score(y_test, rfPred))

Counter({0: 120, 1: 11})
0.8666666666666667


In [161]:
# Same forest as the baseline, but with class_weight="balanced", which asks
# scikit-learn to weight each class inversely to its frequency in y_train.
# random_state makes the fit repeatable; the output recorded under this cell
# came from an unseeded run and may shift slightly on re-execution.
rfWeighted = RandomForestClassifier(
    n_estimators=100,
    class_weight="balanced",
    random_state=42,
)
rfWeighted.fit(X_train, y_train)
rfWeightedPred = rfWeighted.predict(X_test)
# Predicted-class counts, then balanced accuracy on the test split.
print(collections.Counter(rfWeightedPred))
print(balanced_accuracy_score(y_test, rfWeightedPred))

Counter({0: 115, 1: 16})
0.9580459770114942


In [162]:
from sklearn.utils import resample
# Split the training rows by class so that class 1 can be resampled up to the
# class-0 count in the following cell.
# Dense copy of the training matrix: the per-class pieces are later stacked with
# np.concatenate, which needs ndarrays.
X_train_np = X_train.toarray()
# Row positions of each class, found with a vectorised lookup instead of a
# Python-level loop over the boolean mask. Kept as plain lists of ints.
# Assumes y_train is a 1-D label vector -- TODO confirm.
class_0_indices = np.flatnonzero(y_train == 0).tolist()
class_1_indices = np.flatnonzero(y_train == 1).tolist()
# Number of class-0 rows, as a plain int (one index per matching row).
size_class_0 = len(class_0_indices)
X_train_class_0 = X_train_np[class_0_indices, :]
# Labels are kept as Python lists because they are joined with `+` later on.
y_train_class_0 = [0] * size_class_0
X_train_class_1 = X_train_np[class_1_indices, :]

In [163]:
# Draw class-1 rows with replacement until there are as many as class-0 rows.
# random_state fixes which rows are drawn, so the oversampled training set (and
# everything fitted on it) is the same on every run.
X_train_class_1_resampled = resample(
    X_train_class_1,
    replace=True,
    n_samples=size_class_0,
    random_state=42,
)
# Every resampled row belongs to class 1; a list so it can be joined with `+`.
y_train_class_1_resampled = [1] * size_class_0

In [164]:
# Stack the class-0 rows on top of the resampled class-1 rows; the label list is
# assembled in the same order so rows and labels stay aligned.
X_train_resampled = np.concatenate((X_train_class_0, X_train_class_1_resampled), axis=0)
y_train_resampled = [*y_train_class_0, *y_train_class_1_resampled]

In [165]:
from scipy import sparse
# Store the stacked oversampled training matrix in CSR sparse form before fitting.
X_train_resampled = scipy.sparse.csr_matrix(X_train_resampled)

In [166]:
# Unweighted 100-tree forest fitted on the oversampled training set
# (class 1 resampled with replacement up to the class-0 count).
# random_state makes the fit repeatable; the output recorded under this cell
# came from an unseeded run and may shift slightly on re-execution.
rfResampled = RandomForestClassifier(n_estimators=100, random_state=42)
rfResampled.fit(X_train_resampled, y_train_resampled)
rfResampledPred = rfResampled.predict(X_test)
# Predicted-class counts, then balanced accuracy on the untouched test split.
print(collections.Counter(rfResampledPred))
print(balanced_accuracy_score(y_test, rfResampledPred))

Counter({0: 114, 1: 17})
0.9913793103448276


In [167]:
# Rebuild the per-class split, this time to shrink class 0 down to the class-1 count.
# Dense copy of the training matrix: the pieces are stacked with np.concatenate later.
X_train_np = X_train.toarray()
# Row positions of each class via a vectorised lookup rather than a Python loop
# over the boolean mask. Assumes y_train is a 1-D label vector -- TODO confirm.
class_0_indices = np.flatnonzero(y_train == 0).tolist()
class_1_indices = np.flatnonzero(y_train == 1).tolist()
# Number of class-1 rows, as a plain int (one index per matching row).
size_class_1 = len(class_1_indices)
X_train_class_1 = X_train_np[class_1_indices, :]
# Labels are kept as Python lists because they are joined with `+` later on.
y_train_class_1 = [1] * size_class_1
X_train_class_0 = X_train_np[class_0_indices, :]
# Draw class-0 rows without replacement until only as many remain as class-1 rows.
# random_state fixes which rows are kept, so the downsampled set is repeatable.
X_train_class_0_downsampled = resample(
    X_train_class_0,
    replace=False,
    n_samples=size_class_1,
    random_state=42,
)
y_train_class_0_downsampled = [0] * size_class_1

In [168]:
# Stack all class-1 rows on top of the downsampled class-0 rows; the label list
# is assembled in the same order so rows and labels stay aligned.
X_train_downsampled = np.concatenate((X_train_class_1, X_train_class_0_downsampled), axis=0)
y_train_downsampled = [*y_train_class_1, *y_train_class_0_downsampled]

In [169]:
# Store the stacked downsampled training matrix in CSR sparse form before fitting.
X_train_downsampled = scipy.sparse.csr_matrix(X_train_downsampled)

In [170]:
# Unweighted 100-tree forest fitted on the downsampled training set
# (class 0 cut down, without replacement, to the class-1 count).
# random_state makes the fit repeatable; the output recorded under this cell
# came from an unseeded run and may shift slightly on re-execution.
rfDownsampled = RandomForestClassifier(n_estimators=100, random_state=42)
rfDownsampled.fit(X_train_downsampled, y_train_downsampled)
rfDownsampledPred = rfDownsampled.predict(X_test)
# Predicted-class counts, then balanced accuracy on the untouched test split.
print(collections.Counter(rfDownsampledPred))
print(balanced_accuracy_score(y_test, rfDownsampledPred))

Counter({0: 112, 1: 19})
0.9827586206896552


In [173]:
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.tree import DecisionTreeClassifier
# Bagging ensemble of decision trees from imbalanced-learn, fitted directly on
# the original training data; the classifier does its own per-bag rebalancing
# according to sampling_strategy / replacement.
# The base learner is passed positionally: imbalanced-learn renamed this keyword
# from `base_estimator` to `estimator` (recent releases reject the old name),
# while the first positional slot has kept the same meaning throughout.
# random_state makes the sampling and the trees repeatable; the output recorded
# under this cell came from an unseeded run and may shift on re-execution.
BBC = BalancedBaggingClassifier(
    DecisionTreeClassifier(),
    sampling_strategy='auto',
    replacement=False,
    random_state=42,
)
BBC.fit(X_train, y_train)
BBCPred = BBC.predict(X_test)
# Predicted-class counts, then balanced accuracy on the test split.
print(collections.Counter(BBCPred))
print(balanced_accuracy_score(y_test, BBCPred))

Counter({0: 111, 1: 20})
0.978448275862069
