In [1]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import shuffle
from sklearn.externals import joblib
import matplotlib.pyplot as plt
import os

%matplotlib inline
%load_ext autoreload
%autoreload 2

from acf import *

  from numpy.core.umath_tests import inner1d


## Generate data

In [2]:
# Locations of the INRIA Person training images: `pos` holds the positive
# samples and `neg` the negative ones. Both live under one dataset root.
inria_root = os.path.join("E:/", "data", "INRIAPerson")
pos_dir = os.path.join(inria_root, "train_64x128_H96", "pos")
neg_dir = os.path.join(inria_root, "Train", "neg")

In [3]:
# Build the training set from the positive and negative image directories.
# `generate_data` is presumably provided by `from acf import *` -- confirm.
# The captured progress output shows this step takes over a minute.
X, y = generate_data(pos_dir, neg_dir)

2416it [00:25, 96.55it/s] 
100%|██████████████████████████████████████████████████████████████████████████████| 5000/5000 [01:07<00:00, 73.68it/s]


In [4]:
# Sanity check: X is 2-D (one row per sample) and y is 1-D with the same
# number of entries.
print(X.shape, y.shape)

(7416, 5120) (7416,)


In [5]:
# Persist the extracted features and labels with joblib. Nothing in this
# notebook reads these files back, so they are presumably a cache for later
# sessions. joblib.dump returns the list of written file names, which is
# what the cell displays.
joblib.dump(X, "X_train.pkl")
joblib.dump(y, "y_train.pkl")

['y_train.pkl']

## Train

In [6]:
# Hyperparameters under evaluation. The weak learner below is a depth-2
# tree, so `stump_num` is the number of boosting rounds rather than a count
# of true (depth-1) stumps.
stump_num = 200
lr = 0.1
# Fixed seed so the shuffled fold assignment, and therefore the reported
# scores, are repeatable across runs.
seed = 0

# 3-fold stratified cross-validation: every fold keeps the class ratio of y.
# StratifiedKFold comes from the import cell at the top of the notebook.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)
fold_accs = []
for train_index, val_index in skf.split(X, y):
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]
    clf = AdaBoostClassifier(
        base_estimator=DecisionTreeClassifier(max_depth=2),
        n_estimators=stump_num,
        learning_rate=lr,
        random_state=seed,
    )
    clf.fit(X_train, y_train)
    acc = clf.score(X_val, y_val)
    fold_accs.append(acc)

# The best fold alone is an optimistic number; the mean over folds is the
# figure to compare when trying other hyperparameter values.
best_acc = max(fold_accs)
print("best accuracy: ", best_acc)
print("mean accuracy: ", sum(fold_accs) / len(fold_accs))

In [9]:
# Final model: boosted depth-2 trees fitted on the complete training set
# (no validation split is held out here).
weak_learner = DecisionTreeClassifier(max_depth=2)
clf = AdaBoostClassifier(
    base_estimator=weak_learner,
    n_estimators=200,
    learning_rate=0.1,
)
clf.fit(X, y)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=0.1, n_estimators=200, random_state=None)

In [10]:
# Accuracy on the same data the model was just fitted on. This is a
# training (resubstitution) score, not an estimate of generalization.
clf.score(X, y)

0.9990560949298813

In [19]:
# Save the fitted classifier. The Test, Hard negative and Test efficiency
# sections reload it from this file name.
joblib.dump(clf, "adaboost.v1.pkl")

['adaboost.v1.pkl']

## Test

In [11]:
# Locations of the INRIA Person test images. This rebinds the `pos_dir` and
# `neg_dir` names that earlier pointed at the training directories.
inria_root = os.path.join("E:/", "data", "INRIAPerson")
pos_dir = os.path.join(inria_root, "test_64x128_H96", "pos")
neg_dir = os.path.join(inria_root, "Test", "neg")

In [12]:
# Build the test set with the same helper used for the training data, now
# pointed at the test directories.
X_test, y_test = generate_data(pos_dir, neg_dir)

1126it [00:07, 143.81it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 5000/5000 [01:01<00:00, 81.53it/s]


In [13]:
# Sanity check: the second dimension of X_test must match the training
# matrix X, otherwise the classifier cannot score it.
print(X_test.shape, y_test.shape)

(6126, 5120) (6126,)


In [7]:
# Reload the classifier saved in the Train section. joblib.load unpickles
# the file, so only load model files from a trusted source.
clf = joblib.load("adaboost.v1.pkl")

In [14]:
# Mean accuracy on the test set, which was not used for fitting.
clf.score(X_test, y_test)

0.9809010773751224

## Hard negative mining

In [3]:
# Point neg_dir back at the *training* negatives (the Test section rebound
# it to the test negatives) and reload the saved classifier.
neg_dir = os.path.join("E:/", "data", "INRIAPerson", "Train", "neg")
clf = joblib.load("adaboost.v1.pkl")

In [None]:
# Collect hard-negative samples from the training negatives using the
# current classifier.
# NOTE(review): presumably these are windows that clf wrongly scores as
# positive -- confirm against generate_hard_neg in acf.
X_hn, y_hn = generate_hard_neg(neg_dir, clf)
print(X_hn.shape, y_hn.shape)

## Test efficiency

In [None]:
# Reload the saved classifier for the timing measurement below.
# NOTE(review): X_test is not created in this section; it must already
# exist from the Test section.
clf = joblib.load("adaboost.v1.pkl")

In [18]:
import time

# Measure the average prediction time per sample over the whole test set.
# time.clock() was deprecated in Python 3.3 and removed in 3.8, where it
# raises AttributeError. time.perf_counter() is the documented replacement
# for timing short intervals.
start = time.perf_counter()
# To time a single-sample call instead, predict on
# X_test[0].reshape(1, -1) and drop the division below.
clf.predict(X_test)
end = time.perf_counter()

# Total elapsed seconds divided by the number of test rows.
print('spent time: {0:.5f}s'.format((end-start)/X_test.shape[0]))

spent time: 0.00065s
