In [7]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import KFold

from utils.metrics import ndcg_scorer

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Read only the first 10 rows of the preprocessed training users and
# replace every missing value with the sentinel -1.
train_users = pd.read_csv(
    '../data/processed/ohe_count_processed_train_users.csv', nrows=10
).fillna(-1)

# Split off the target column, then remove it and the 'id' column from the
# feature frame before converting the features to a plain array.
y_train = train_users['country_destination']
train_users = train_users.drop(['country_destination', 'id'], axis=1)
x_train = train_users.values

# Map the destination labels to integer codes.
label_encoder = LabelEncoder()
encoded_y_train = label_encoder.fit_transform(y_train)

# Base binary learner wrapped by the custom one-vs-one classifier.
# NOTE: CustomOneVsOneClassifier is defined in a later cell of this notebook.
rf = RandomForestClassifier(random_state=41)
clf = CustomOneVsOneClassifier(rf)

In [9]:
# Distinct encoded labels present in the loaded sample, and how many there are.
# NOTE(review): in the notebook as shown, `np` is first imported in a later
# cell (the one defining CustomOneVsOneClassifier), so this cell depends on
# that cell having been executed beforehand.
classes_ = np.unique(encoded_y_train)
n_classes = classes_.shape[0]
# Python 2 print statement: emits the label, the count and the class array.
print 'Classes:', n_classes, classes_

Classes: 3 [0 1 2]


In [11]:
# kf = KFold(len(x_train), n_folds=5, random_state=42)

# score = cross_val_score(clf, x_train, encoded_y_train,
#                         cv=kf, scoring=ndcg_scorer)

In [40]:
# Fit on the loaded sample and score the very same rows (in-sample check,
# no hold-out split).
# CustomOneVsOneClassifier.predict_proba returns its decision_function
# output, so the rows shown below are per-class scores rather than
# probabilities: values can fall outside [0, 1] and rows do not sum to 1.
clf.fit(x_train, encoded_y_train)
clf.predict_proba(x_train)

array([[ 1.83333333,  1.06666667,  0.1       ],
       [ 1.86666667,  1.        ,  0.13333333],
       [ 0.56666667,  2.26666667,  0.16666667],
       [-0.5       ,  1.        ,  2.5       ],
       [-0.46666667,  2.2       ,  1.26666667],
       [ 0.6       ,  2.33333333,  0.06666667],
       [ 0.53333333,  2.3       ,  0.16666667],
       [-0.5       ,  2.3       ,  1.2       ],
       [ 0.5       ,  2.33333333,  0.16666667],
       [ 0.5       ,  2.33333333,  0.16666667]])

In [39]:
from __future__ import division

import numpy as np

from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import _fit_binary, check_is_fitted, is_regressor
from sklearn.externals.joblib import Parallel, delayed
from utils.unbalanced_dataset import SMOTE
from utils.unbalanced_dataset import SMOTEENN

def _predict_binary(estimator, X):
    """Return one score per sample from a single fitted binary estimator.

    Regressors contribute their ``predict`` output directly.  Any other
    estimator contributes its flattened ``decision_function`` output; when
    that call raises ``AttributeError`` or ``NotImplementedError`` the
    second column of ``predict_proba`` (the positive class) is used instead.
    """
    if is_regressor(estimator):
        scores = estimator.predict(X)
    else:
        try:
            scores = np.ravel(estimator.decision_function(X))
        except (AttributeError, NotImplementedError):
            # Fall back to the probability of the positive class.
            positive_column = 1
            scores = estimator.predict_proba(X)[:, positive_column]
    return scores


def _ovr_decision_function(predictions, confidences, n_classes):
    n_samples = predictions.shape[0]
    votes = np.zeros((n_samples, n_classes))
    sum_of_confidences = np.zeros((n_samples, n_classes))

    k = 0
    for i in range(n_classes):
        for j in range(i + 1, n_classes):
            sum_of_confidences[:, i] -= confidences[:, k]
            sum_of_confidences[:, j] += confidences[:, k]
            votes[predictions[:, k] == 0, i] += 1
            votes[predictions[:, k] == 1, j] += 1
            k += 1
            
    max_confidences = sum_of_confidences.max()
    min_confidences = sum_of_confidences.min()

    if max_confidences == min_confidences:
        return votes

    # Scale the sum_of_confidences to (-0.5, 0.5) and add it with votes.
    # The motivation is to use confidence levels as a way to break ties in
    # the votes without switching any decision made based on a difference
    # of 1 vote.
    eps = np.finfo(sum_of_confidences.dtype).eps
    max_abs_confidence = max(abs(max_confidences), abs(min_confidences))
    scale = (0.5 - eps) / max_abs_confidence
    return votes + sum_of_confidences * scale

def _fit_ovo_binary(estimator, X, y, i, j, sampling=None, verbose=False):
    """Fit a single binary estimator (one-vs-one) on classes ``i`` and ``j``.

    Only the rows of ``X`` whose label is ``i`` or ``j`` are kept; their
    labels are recoded as ``0`` (class ``i``) and ``1`` (class ``j``).

    Parameters
    ----------
    estimator : object
        Base estimator handed to ``_fit_binary``.
    X : array-like, indexable by an integer array along the first axis
        Training samples.
    y : numpy array
        Class label of every row of ``X``.
    i, j : label values
        The two classes of this pair.
    sampling : {None, 'SMOTE', 'SMOTEENN'}
        Optional resampling applied to the pair's samples before fitting.
        Any falsy value disables resampling.
    verbose : bool
        Forwarded to the resampler.

    Returns
    -------
    The result of ``_fit_binary(estimator, X_pair, y_pair, classes=[i, j])``.

    Raises
    ------
    ValueError
        If ``sampling`` is truthy but not one of the supported names.
    """
    cond = np.logical_or(y == i, y == j)
    y = y[cond]
    # Builtin ``int`` instead of ``np.int``: the alias was deprecated in
    # NumPy 1.20 and removed in 1.24, and always meant the builtin type.
    y_binary = np.empty(y.shape, int)
    y_binary[y == i] = 0
    y_binary[y == j] = 1
    ind = np.arange(X.shape[0])
    X_values = X[ind[cond]]
    y_values = y_binary

    if sampling:
        ones = np.count_nonzero(y_values == 1)
        zeros = np.count_nonzero(y_values == 0)
        # Relative size gap between the two classes.  ``float`` keeps this a
        # true division even without ``from __future__ import division``.
        # NOTE(review): the gap is 0 when the pair is already balanced --
        # confirm the resamplers accept a ratio of 0.
        imbalance = abs(ones - zeros) / float(min(ones, zeros))

        if sampling == 'SMOTE':
            smote = SMOTE(ratio=imbalance, verbose=verbose)
        elif sampling == 'SMOTEENN':
            smote = SMOTEENN(ratio=imbalance * 0.3, verbose=verbose)
        else:
            # Previously an unknown name fell through both checks and failed
            # below on the unbound ``smote`` local.
            raise ValueError(
                "Unknown sampling %r; expected None, 'SMOTE' or 'SMOTEENN'."
                % (sampling,))

        X_values, y_values = smote.fit_transform(X_values, y_values)

    return _fit_binary(estimator, X_values, y_values, classes=[i, j])


class CustomOneVsOneClassifier(OneVsOneClassifier):
    """One-vs-one multiclass classifier with optional per-pair resampling.

    One binary estimator is fitted for every pair of classes through
    ``_fit_ovo_binary``, which can resample the pair's data before fitting.

    Parameters
    ----------
    estimator : object
        Base estimator fitted once per class pair.
    n_jobs : int, default 1
        Number of jobs given to joblib ``Parallel`` while fitting the pairs.
    sampling : {None, 'SMOTE', 'SMOTEENN'}, default None
        Resampling strategy forwarded to ``_fit_ovo_binary``.
    verbose : bool, default False
        Forwarded to ``_fit_ovo_binary``.
    """

    def __init__(self, estimator, n_jobs=1, sampling=None, verbose=False):
        self.estimator = estimator
        self.n_jobs = n_jobs
        self.sampling = sampling
        self.verbose = verbose

    def predict_proba(self, X):
        """Return per-class scores for ``X``.

        This is an alias of ``decision_function``.  The values are vote
        counts adjusted by a confidence-based tie-breaker, NOT normalised
        probabilities: they can be negative or greater than 1 and rows do
        not sum to 1.  Use them for ranking classes only.
        """
        return self.decision_function(X)

    def fit(self, X, y):
        """Fit one binary estimator per pair of classes found in ``y``.

        Sets ``classes_`` (sorted unique labels) and ``estimators_`` (one
        fitted estimator per pair ``(classes_[i], classes_[j])`` with
        ``i < j``, ``i`` varying slowest).

        Raises
        ------
        ValueError
            If ``y`` contains fewer than two distinct classes.

        Returns
        -------
        self
        """
        y = np.asarray(y)

        classes = np.unique(y)
        n_classes = classes.shape[0]
        if n_classes < 2:
            # Without this check no pairwise estimator would be fitted and
            # decision_function would later fail on an empty np.vstack with
            # an unrelated error message.
            raise ValueError(
                "CustomOneVsOneClassifier can not be fit when fewer than two "
                "classes are present (got %d)." % n_classes)
        self.classes_ = classes

        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_ovo_binary)(
                self.estimator, X, y, self.classes_[i], self.classes_[j],
                sampling=self.sampling, verbose=self.verbose
            ) for i in range(n_classes) for j in range(i + 1, n_classes))

        return self

    def decision_function(self, X):
        """Return an (n_samples, n_classes) array of per-class scores.

        Every pairwise estimator is queried twice: once for its hard
        prediction and once for its confidence via ``_predict_binary``.
        Column ``k`` of both stacked arrays belongs to the k-th fitted pair,
        the same pair order ``_ovr_decision_function`` iterates over.
        """
        check_is_fitted(self, 'estimators_')

        predictions = np.vstack([est.predict(X) for est in self.estimators_]).T
        confidences = np.vstack([_predict_binary(est, X) for est in self.estimators_]).T
        return _ovr_decision_function(predictions, confidences,
                                      len(self.classes_))