In [84]:
import pandas as pd
from utils.data_loading import train_users
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

In [85]:
# Keep the experiment fast: work on the first 100 rows only.
train_users = train_users.head(100)

# The target is the destination country; pull it out before it is removed
# from the feature frame.
y_train = train_users['country_destination']

# Features: everything except the target and the row identifier, with
# missing values replaced by the sentinel -1.
train_users = train_users.drop(['country_destination', 'id'], axis=1).fillna(-1)
x_train = train_users.values

# Map the string country labels to consecutive integer codes.
label_encoder = LabelEncoder()
encoded_y_train = label_encoder.fit_transform(y_train)

In [120]:
"""Multiclass and multilabel classification strategies.

This module implements a one-vs-one multiclass learning algorithm that uses
the SMOTE algorithm to oversample the minority class in each fit.
"""
from __future__ import division

import numpy as np

from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import _fit_binary, _predict_binary, _ovr_decision_function
from sklearn.multiclass import check_is_fitted
from sklearn.externals.joblib import Parallel, delayed
from utils.unbalanced_dataset import SMOTE
from utils.unbalanced_dataset import SMOTEENN


def _fit_ovo_binary(estimator, X, y, i, j, sampling=None, verbose=False):
    """Fit a single binary estimator (one-vs-one) on classes ``i`` and ``j``.

    Only the samples labelled ``i`` or ``j`` are kept. Their targets are
    re-encoded as 0 (class ``i``) and 1 (class ``j``), optionally resampled,
    and handed to sklearn's ``_fit_binary``.

    Parameters
    ----------
    estimator : estimator object
        Base estimator, passed through to ``_fit_binary``.
    X : array-like or sparse matrix, shape = [n_samples, n_features]
        Training data; must support ``X.shape`` and integer row indexing.
    y : numpy array, shape = [n_samples]
        Multi-class targets.
    i, j : labels
        The two class labels this estimator separates.
    sampling : str or None, optional, default: None
        Resampling applied to the pair's samples before fitting. Can be
        ``'SMOTE'`` or ``'SMOTEENN'``; any falsy value disables resampling.
    verbose : bool, optional, default: False
        Forwarded to the sampler's ``verbose`` argument.

    Returns
    -------
    The value returned by ``_fit_binary`` for this class pair.

    Raises
    ------
    ValueError
        If ``sampling`` is truthy but is neither ``'SMOTE'`` nor
        ``'SMOTEENN'``.
    """
    # Fail fast on a misspelled sampler name instead of crashing later on an
    # unbound local variable.
    if sampling and sampling not in ('SMOTE', 'SMOTEENN'):
        raise ValueError(
            "sampling must be None, 'SMOTE' or 'SMOTEENN'; got %r"
            % (sampling,))

    cond = np.logical_or(y == i, y == j)
    y = y[cond]
    # Builtin ``int`` rather than the ``np.int`` alias, which recent NumPy
    # releases no longer provide; the resulting dtype is the same.
    y_binary = np.empty(y.shape, int)
    y_binary[y == i] = 0
    y_binary[y == j] = 1
    ind = np.arange(X.shape[0])

    # Select rows through integer indices rather than the boolean mask
    # (presumably to stay compatible with sparse matrices -- confirm).
    X_values = X[ind[cond]]
    y_values = y_binary

    if sampling:
        ones = np.count_nonzero(y_values == 1)
        zeros = np.count_nonzero(y_values == 0)
        minority = min(ones, zeros)

        # With an empty class there is nothing to resample from (and the
        # ratio below would divide by zero), so the data is fitted as-is.
        if minority:
            # (majority - minority) / minority, as a true division regardless
            # of the interpreter's default for ``/`` on integers.
            imbalance = abs(ones - zeros) / float(minority)

            if sampling == 'SMOTE':
                sampler = SMOTE(ratio=imbalance, verbose=verbose)
            else:
                # SMOTEENN is driven with only 30% of the imbalance ratio.
                sampler = SMOTEENN(ratio=imbalance * 0.3, verbose=verbose)

            X_values, y_values = sampler.fit_transform(X_values, y_values)

    return _fit_binary(estimator, X_values, y_values, classes=[i, j])


class CustomOneVsOneClassifier(OneVsOneClassifier):
    """One-vs-one multiclass strategy with optional per-pair resampling.

    This strategy consists in fitting one classifier per class pair, each
    trained only on the samples of its two classes (optionally resampled
    with SMOTE or SMOTEENN first). At prediction time the pairwise votes and
    confidences are combined into one score per class.

    Requires to fit `n_classes * (n_classes - 1) / 2` classifiers.

    Attributes
    ----------
    estimators_ : list of `n_classes * (n_classes - 1) / 2` estimators
        Estimators used for predictions.
    classes_ : numpy array of shape [n_classes]
        Array containing labels.
    """

    def __init__(self, estimator, n_jobs=1, sampling=None, verbose=False):
        """Init method.

        Parameters
        ----------
        estimator : estimator object
            An estimator object implementing fit and one of decision_function
            or predict_proba.
        n_jobs : int, optional, default: 1
            The number of jobs to use. If -1 all CPUs are used.
            If 1 is given, no parallel computing code is used at all, which is
            useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs)
            are used. Thus for n_jobs = -2, all CPUs but one are used.
        sampling : str, optional, default: None
            Sampling method to use when fitting each estimator.
            Can be 'SMOTE' or 'SMOTEENN'.
        verbose : bool, optional, default: False
            Forwarded to the sampler when `sampling` is set.
        """
        self.estimator = estimator
        self.n_jobs = n_jobs
        self.sampling = sampling
        self.verbose = verbose

    def predict_proba(self, X):
        """Return per-class scores for X.

        NOTE: despite the name, the returned values are not probabilities.
        This is exactly the output of `decision_function` (vote counts plus a
        scaled confidence term), so rows do not sum to one and entries may be
        negative. The scores are only meaningful for ranking classes.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]

        Returns
        -------
        p : array of shape = [n_samples, n_classes]
            The class scores of the input samples.
        """
        return self.decision_function(X)

    def fit(self, X, y):
        """Fit underlying estimators.

        Parameters
        ----------
        X : (sparse) array-like, shape = [n_samples, n_features]
            Data.
        y : array-like, shape = [n_samples]
            Multi-class targets.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If `y` contains fewer than two distinct classes.
        """
        y = np.asarray(y)

        classes = np.unique(y)
        n_classes = classes.shape[0]

        # Without at least one class pair no estimator would be fitted and
        # the failure would only surface later, obscurely, at prediction
        # time. Reject the input before any fitted state is stored.
        if n_classes < 2:
            raise ValueError(
                "CustomOneVsOneClassifier can not be fit when fewer than "
                "two classes are present.")

        self.classes_ = classes

        # One job per unordered class pair (i < j), in the same order that
        # _ovr_decision_function later walks the pairs.
        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_ovo_binary)(
                self.estimator, X, y, self.classes_[i], self.classes_[j],
                sampling=self.sampling, verbose=self.verbose
            ) for i in range(n_classes) for j in range(i + 1, n_classes))

        return self

    def decision_function(self, X):
        """Decision function for the one-vs-one classifier.

        The decision values for the samples are computed by adding the
        normalized sum of pair-wise classification confidence levels to the
        votes in order to disambiguate between the decision values when the
        votes for all the classes are equal leading to a tie.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        Y : array-like, shape = [n_samples, n_classes]
        """
        check_is_fitted(self, 'estimators_')
        # Each binary estimator was trained on targets 0 (first class of the
        # pair) and 1 (second class); one column per estimator.
        predictions = np.vstack([est.predict(X) for est in self.estimators_]).T
        confidences = np.vstack([_predict_binary(est, X) for est in self.estimators_]).T
        return _ovr_decision_function(predictions, confidences,
                                      len(self.classes_))

def _ovr_decision_function(predictions, confidences, n_classes):
    """Compute a continuous, tie-breaking ovr decision function.
    It is important to include a continuous value, not only votes,
    to make computing AUC or calibration meaningful.
    Parameters
    ----------
    predictions : array-like, shape (n_samples, n_classifiers)
        Predicted classes for each binary classifier.
    confidences : array-like, shape (n_samples, n_classifiers)
        Decision functions or predicted probabilities for positive class
        for each binary classifier.
    n_classes : int
        Number of classes. n_classifiers must be
        ``n_classes * (n_classes - 1 ) / 2``
    """
    n_samples = predictions.shape[0]
    votes = np.zeros((n_samples, n_classes))
    sum_of_confidences = np.zeros((n_samples, n_classes))

    k = 0
    for i in range(n_classes):
        for j in range(i + 1, n_classes):
            sum_of_confidences[:, i] -= confidences[:, k]
            sum_of_confidences[:, j] += confidences[:, k]
            votes[predictions[:, k] == 0, i] += 1
            votes[predictions[:, k] == 1, j] += 1
            k += 1

    max_confidences = sum_of_confidences.max()
    min_confidences = sum_of_confidences.min()

    if max_confidences == min_confidences:
        return votes

    # Scale the sum_of_confidences to (-0.5, 0.5) and add it with votes.
    # The motivation is to use confidence levels as a way to break ties in
    # the votes without switching any decision made based on a difference
    # of 1 vote.
    eps = np.finfo(sum_of_confidences.dtype).eps
    max_abs_confidence = max(abs(max_confidences), abs(min_confidences))
    scale = (0.5 - eps) / max_abs_confidence
    print votes
    return votes + sum_of_confidences * scale

In [121]:
from sklearn.ensemble import RandomForestClassifier
# Base estimator for every class pair; n_jobs=-1 asks the forest itself to
# use all available CPUs (sklearn convention).
rf = RandomForestClassifier(n_jobs=-1)
# Wrap it in the one-vs-one strategy with the wrapper's defaults: pairs are
# fitted sequentially (n_jobs=1) and no resampling is applied (sampling=None).
clf = CustomOneVsOneClassifier(rf)

In [122]:
# Fit one binary random forest per pair of encoded destination classes.
clf.fit(x_train, encoded_y_train)

CustomOneVsOneClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
             n_jobs=1, sampling=None, verbose=False)

In [123]:
# Score the first ten training rows. The result holds decision-function
# scores (votes plus a scaled confidence term), not normalised probabilities.
# As a side effect _ovr_decision_function prints the raw vote matrix.
preds = clf.predict_proba(x_train[0:10])

[[ 4.  0.  5.  2.  3.  7.  6.  1.]
 [ 4.  0.  5.  3.  1.  7.  6.  2.]
 [ 3.  0.  5.  3.  3.  6.  7.  1.]
 [ 2.  0.  5.  1.  3.  7.  5.  5.]
 [ 4.  0.  5.  2.  3.  6.  7.  1.]
 [ 5.  0.  4.  2.  2.  6.  7.  2.]
 [ 3.  1.  5.  3.  1.  6.  7.  2.]
 [ 3.  1.  5.  3.  1.  6.  7.  2.]
 [ 3.  1.  5.  4.  1.  6.  7.  1.]
 [ 3.  0.  4.  4.  3.  6.  7.  1.]]


In [113]:
# Compare the true labels (raw strings and their integer codes) with the
# predicted class ranking for the same ten rows. The single-argument call
# form of print behaves identically on Python 2 and Python 3.
print(y_train.values[0:10])
print(encoded_y_train[0:10])
# Class indices ordered from highest to lowest score for each row.
np.argsort(-preds)

['NDF' 'NDF' 'US' 'other' 'US' 'US' 'US' 'US' 'US' 'US']
[5 5 6 7 6 6 6 6 6 6]


array([[5, 6, 2, 0, 3, 7, 1, 4],
       [5, 6, 2, 0, 7, 3, 1, 4],
       [6, 5, 2, 0, 7, 3, 1, 4],
       [7, 5, 6, 2, 0, 3, 4, 1],
       [6, 5, 2, 7, 3, 0, 4, 1],
       [6, 5, 2, 3, 0, 7, 1, 4],
       [6, 5, 2, 0, 7, 1, 3, 4],
       [6, 5, 2, 0, 7, 1, 3, 4],
       [6, 5, 2, 0, 7, 1, 3, 4],
       [6, 5, 2, 7, 3, 0, 1, 4]])