In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import KFold

from utils.metrics import ndcg_scorer

In [3]:
"""Multiclass and multilabel classification strategies.

This module implements a one-vs-one multiclass learning algorithm that uses
the SMOTE algorithm to oversample the minority class in each fit.
"""
from __future__ import division

import numpy as np

from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import _fit_binary, check_is_fitted
from sklearn.multiclass import _ovr_decision_function, _predict_binary
from sklearn.externals.joblib import Parallel, delayed
from utils.unbalanced_dataset import SMOTE
from utils.unbalanced_dataset import SMOTEENN


def _score_matrix(confidences, n_classes):
    """Create a probability matrix of confidences."""
    # Make empty matrix
    matrix = np.zeros((n_classes, n_classes))

    # Fill upper triangle with v
    matrix[np.triu_indices(n_classes, 1)] = confidences

    # Fill lower triangle with the inverse of v
    for i in range(n_classes):
        for j in range(i, n_classes):
            matrix[j][i] = 1 - matrix[i][j]

    np.fill_diagonal(matrix, 0)

    return matrix


def _sample_values(X, y, method=None, ratio=1, verbose=False):
    """Performs any kind of sampling(over and under) with X and y."""
    if method == 'SMOTE':
        sampler = SMOTE(ratio=ratio, verbose=verbose)

    if method == 'SMOTEENN':
        ratio = ratio * 0.3
        sampler = SMOTEENN(ratio=ratio, verbose=verbose)

    return sampler.fit_transform(X, y)


def _fit_ovo_binary(estimator, X, y, i, j, sampling=None, verbose=False):
    """Fit a single binary estimator (one-vs-one) for the class pair (i, j).

    Parameters
    ----------
    estimator : estimator object
        Estimator handed to ``_fit_binary``.
    X : indexable with a ``shape`` attribute
        Training rows; only those whose label is ``i`` or ``j`` are used.
    y : numpy array, shape = [n_samples]
        Multi-class targets.
    i, j : labels
        The two classes of this pair. ``i`` is encoded as 0 and ``j`` as 1.
    sampling : str or None, optional, default: None
        Sampling method forwarded to ``_sample_values``. No resampling when
        falsy.
    verbose : bool, optional, default: False
        Forwarded to ``_sample_values``.

    Returns
    -------
    The result of ``_fit_binary`` on the (optionally resampled) pair subset.
    """
    pair_mask = np.logical_or(y == i, y == j)
    y_pair = y[pair_mask]

    # Builtin int: the `np.int` alias is deprecated/removed in recent NumPy.
    y_binary = np.empty(y_pair.shape, int)
    y_binary[y_pair == i] = 0
    y_binary[y_pair == j] = 1

    # Select rows through integer positions rather than the boolean mask
    # (kept from the original; presumably for sparse X -- TODO confirm).
    row_positions = np.arange(X.shape[0])
    X_values = X[row_positions[pair_mask]]
    y_values = y_binary

    if sampling:
        ones = np.count_nonzero(y_values == 1)
        zeros = np.count_nonzero(y_values == 0)
        minority = min(ones, zeros)
        # With an empty class there is nothing to resample from and the
        # ratio is undefined, so resampling is skipped in that case.
        if minority > 0:
            # float() keeps this a true division even without the
            # file-level `from __future__ import division`.
            ratio = abs(ones - zeros) / float(minority)
            X_values, y_values = _sample_values(
                X_values, y_values, method=sampling, ratio=ratio,
                verbose=verbose)

    return _fit_binary(estimator, X_values, y_values, classes=[i, j])


class CustomOneVsOneClassifier(OneVsOneClassifier):
    """One-vs-one multiclass strategy with optional per-pair resampling.

    This strategy consists in fitting one classifier per class pair. Each
    pairwise fit can optionally resample its two-class subset ('SMOTE' or
    'SMOTEENN'). ``decision_function`` combines the pairwise outputs
    according to ``strategy``.

    Requires to fit `n_classes * (n_classes - 1) / 2` classifiers.

    Attributes
    ----------
    estimators_ : list of `n_classes * (n_classes - 1) / 2` estimators
        Estimators used for predictions.
    classes_ : numpy array of shape [n_classes]
        Array containing labels.
    """

    def __init__(self, estimator, n_jobs=1, sampling=None,
                 strategy='vote', verbose=False):
        """Init method.

        Parameters
        ----------
        estimator : estimator object
            An estimator object implementing fit and one of decision_function
            or predict_proba.
        n_jobs : int, optional, default: 1
            The number of jobs to use. If -1 all CPUs are used.
            If 1 is given, no parallel computing code is used at all, which is
            useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs)
            are used. Thus for n_jobs = -2, all CPUs but one are used.
        sampling : str, optional, default: None
            Sampling method to use when fitting each estimator.
            Can be 'SMOTE' or 'SMOTEENN'. None disables resampling.
        strategy : str, optional, default: 'vote'
            How pairwise outputs are combined in ``decision_function``.
            One of 'vote', 'weighted_vote', 'dynamic_vote' or
            'relative_competence'. The last two are accepted by ``fit`` but
            raise NotImplementedError in ``decision_function``.
        verbose : bool, optional, default: False
            Passed to each pairwise fit.
        """
        self.estimator = estimator
        self.n_jobs = n_jobs
        self.sampling = sampling
        self.verbose = verbose
        self.strategy = strategy

    def predict_proba(self, X):
        """Return per-class scores for X.

        This is an alias of ``decision_function``: the values are decision
        scores, not normalised probabilities (rows need not sum to 1).

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]

        Returns
        -------
        p : array of shape = [n_samples, n_classes]
            The per-class scores of the input samples.
        """
        return self.decision_function(X)

    def fit(self, X, y):
        """Fit underlying estimators.

        Parameters
        ----------
        X : (sparse) array-like, shape = [n_samples, n_features]
            Data.
        y : array-like, shape = [n_samples]
            Multi-class targets.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``strategy`` or ``sampling`` is not an allowed value.
        """

        valid_strategies = ('vote', 'weighted_vote',
                            'dynamic_vote', 'relative_competence')
        if self.strategy not in valid_strategies:
            raise ValueError('Strategy %s is not valid. '
                             'Allowed values are: vote, weighted_vote,'
                             ' dynamic_vote and relative_competence.'
                             % (self.strategy))

        if self.sampling not in ('SMOTE', 'SMOTEENN', None):
            raise ValueError('Sampling %s is not valid. '
                             'Allowed values are: SMOTE, SMOTEENN.'
                             % (self.sampling))
        y = np.asarray(y)

        self.classes_ = np.unique(y)
        n_classes = self.classes_.shape[0]

        # One estimator per unordered class pair (i < j), in the same order
        # as the strict upper triangle of an n_classes x n_classes matrix.
        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_ovo_binary)(
                self.estimator, X, y, self.classes_[i], self.classes_[j],
                sampling=self.sampling, verbose=self.verbose
            ) for i in range(n_classes) for j in range(i + 1, n_classes))

        return self

    def decision_function(self, X):
        """Decision function for the CustomOneVsOneClassifier.

        With ``strategy='vote'`` the result is whatever
        ``_ovr_decision_function`` returns for the pairwise predictions and
        confidences. With ``strategy='weighted_vote'`` each class score is the
        column sum of the sample's pairwise score matrix.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        Y : array-like, shape = [n_samples, n_classes]

        Raises
        ------
        NotImplementedError
            For the 'dynamic_vote' and 'relative_competence' strategies.
        ValueError
            If ``strategy`` is not a known value (e.g. changed after fit).
        """
        check_is_fitted(self, 'estimators_')

        # Reject unsupported strategies before running any estimator, so no
        # prediction work is thrown away.
        if self.strategy == 'dynamic_vote':
            raise NotImplementedError('Strategy dynamic_vote not implemented.')
        if self.strategy == 'relative_competence':
            raise NotImplementedError(
                'Strategy relative_competence not implemented.')
        if self.strategy not in ('vote', 'weighted_vote'):
            # Previously an unknown strategy silently returned None.
            raise ValueError('Strategy %s is not valid. '
                             'Allowed values are: vote, weighted_vote,'
                             ' dynamic_vote and relative_competence.'
                             % (self.strategy))

        n_classes = len(self.classes_)

        # Row per sample, column per pairwise estimator.
        confidences = np.vstack([_predict_binary(est, X)
                                 for est in self.estimators_]).T

        if self.strategy == 'weighted_vote':
            return np.vstack([_score_matrix(c, n_classes).sum(axis=0)
                              for c in confidences])

        # 'vote': hard predictions are only needed on this path.
        predictions = np.vstack([est.predict(X) for est in self.estimators_]).T
        return _ovr_decision_function(predictions, confidences, n_classes)


In [8]:
# Read the first 50,000 pre-processed training rows; missing values become -1.
train_users = pd.read_csv(
    '../data/processed/ohe_count_processed_train_users.csv', nrows=50000
).fillna(-1)

# Pull out the target, then keep every column except the target and 'id'
# as the feature matrix.
y_train = train_users['country_destination']
train_users = train_users.drop(['country_destination', 'id'], axis=1)
x_train = train_users.values

# Encode the destination labels as integers.
label_encoder = LabelEncoder()
encoded_y_train = label_encoder.fit_transform(y_train)

# Random forest base estimator wrapped in the one-vs-one classifier, with
# SMOTE resampling per class pair and weighted voting.
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=41)
clf = CustomOneVsOneClassifier(rf, strategy='weighted_vote', sampling='SMOTE')

In [9]:
# Distinct encoded labels present in the loaded training target.
classes_ = np.unique(encoded_y_train)
n_classes = classes_.shape[0]
# A single pre-formatted argument prints the same text under the Python 2
# print statement and the Python 3 print function.
print('Classes: %s %s' % (n_classes, classes_))

Classes: 12 [ 0  1  2  3  4  5  6  7  8  9 10 11]


In [None]:
# Fit on the loaded rows and display the per-class scores for those same
# rows (in-sample scores). fit() returns the estimator, so the call chains.
clf.fit(x_train, encoded_y_train).predict_proba(x_train)

array([[  1.8,   4.2,   4.2, ...,   2. ,   9.3,   6.3],
       [  1.3,   5.1,   3.9, ...,   2.5,   9.7,   6.5],
       [  3.3,   4.9,   3.7, ...,   2.2,  10.7,   7.7],
       ..., 
       [  3.1,   4.5,   4.6, ...,   0.7,  10.2,   8.1],
       [  3.2,   4.1,   4.3, ...,   1.5,   9.7,   7.4],
       [  3. ,   6.2,   3.5, ...,   1.2,   9.7,   7.1]])

In [None]:
# 10-fold cross-validation over the loaded training rows.
# NOTE(review): shuffling is not enabled here, so random_state presumably has
# no effect on the split -- confirm against the installed scikit-learn version.
kf = KFold(len(x_train), n_folds=10, random_state=42)

score = cross_val_score(clf, x_train, encoded_y_train,
                        cv=kf, scoring=ndcg_scorer)
# Parenthesised single argument: identical output on Python 2 and Python 3.
print(score.mean())