In [200]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import KFold

from utils.metrics import ndcg_scorer

In [201]:
# Load a small sample (first 15 rows only) of the preprocessed training data.
train_users = pd.read_csv('../data/processed/ohe_count_processed_train_users.csv', nrows=15)
# Replace every missing value with the sentinel -1.
train_users.fillna(-1, inplace=True)
# Keep the target column aside, then remove it and the `id` column so that
# only feature columns remain in `train_users`.
y_train = train_users['country_destination']
train_users.drop('country_destination', axis=1, inplace=True)
train_users.drop('id', axis=1, inplace=True)
x_train = train_users.values
# Encode the target labels as integer class ids.
label_encoder = LabelEncoder()
encoded_y_train = label_encoder.fit_transform(y_train)

from sklearn.ensemble import RandomForestClassifier
# Random forest with a fixed seed, used as the base estimator that the
# one-vs-one wrapper fits for each pair of classes.
rf = RandomForestClassifier(random_state=41)
# NOTE: CustomOneVsOneClassifier is defined in another cell of this notebook
# (In [199], further down); that cell has to be executed before this one.
clf = CustomOneVsOneClassifier(rf)

In [202]:
# Distinct encoded labels present in the loaded sample, and how many there are.
# NOTE: in the visible notebook `np` is imported only in the cell that defines
# CustomOneVsOneClassifier (In [199]); this cell relies on that import.
classes_ = np.unique(encoded_y_train)
n_classes = classes_.shape[0]
print 'Classes:', n_classes, classes_

Classes: 4 [0 1 2 3]


In [194]:
# kf = KFold(len(x_train), n_folds=5, random_state=42)

# score = cross_val_score(clf, x_train, encoded_y_train,
#                         cv=kf, scoring=ndcg_scorer)
# print score.mean()

In [203]:
# Fit the pairwise classifiers on the sample and score the same rows.
# NOTE: despite its name, CustomOneVsOneClassifier.predict_proba returns the
# output of decision_function, so the values displayed below are scores, not
# probabilities (they can be negative or greater than 1).
clf.fit(x_train, encoded_y_train)
clf.predict_proba(x_train)

array([[-0.48,  3.14,  2.18,  1.16],
       [-0.46,  3.14,  2.12,  1.2 ],
       [-0.46,  1.98,  3.26,  1.22],
       [-0.48,  1.92,  2.18,  2.38],
       [-0.42,  2.96,  2.24,  1.22],
       [ 0.56,  1.96,  3.4 ,  0.08],
       [-0.5 ,  1.98,  3.38,  1.14],
       [ 1.6 ,  0.86,  3.38,  0.16],
       [-0.5 ,  2.  ,  3.36,  1.14],
       [ 0.56,  1.96,  3.38,  0.1 ],
       [-0.5 ,  2.04,  3.34,  1.12],
       [ 0.62,  3.14,  2.12,  0.12],
       [ 2.84,  1.98,  1.14,  0.04],
       [ 0.6 ,  3.14,  2.14,  0.12],
       [-0.44,  3.18,  2.12,  1.14]])

In [199]:
from __future__ import division

import numpy as np

from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import _fit_binary, check_is_fitted, _predict_binary, _ovr_decision_function
from sklearn.externals.joblib import Parallel, delayed
from utils.unbalanced_dataset import SMOTE
from utils.unbalanced_dataset import SMOTEENN


def _confidence_matrix(confidences, n_classes):
    confidence_matrix = np.zeros((n_classes, n_classes))
    
    # Sum by antimsimetrical matrix columns

    return confidence_matrix

def _generate_instances(X_values, y_values, sampling=None, ratio=1, verbose=False):
        if sampling == 'SMOTE':
            smote = SMOTE(ratio=ratio, verbose=verbose)

        if sampling == 'SMOTEENN':
            ratio = ratio * 0.3
            smote = SMOTEENN(ratio=ratio, verbose=verbose)

        return smote.fit_transform(X_values, y_values)

def _fit_ovo_binary(estimator, X, y, i, j, sampling=None, verbose=False):
    """Fit a single binary estimator (one-vs-one)."""
    cond = np.logical_or(y == i, y == j)
    y = y[cond]
    y_binary = np.empty(y.shape, np.int)
    y_binary[y == i] = 0
    y_binary[y == j] = 1
    ind = np.arange(X.shape[0])
    X_values = X[ind[cond]]
    y_values = y_binary

    if sampling:
        ones = np.count_nonzero(y_values == 1)
        zeros = np.count_nonzero(y_values == 0)
        ratio = abs(ones - zeros) / min(ones, zeros)
        X_values, y_values = _generate_instances(X_values, y_values, sampling=sampling, ratio=ratio)
        
    return _fit_binary(estimator, X_values, y_values, classes=[i, j])


class CustomOneVsOneClassifier(OneVsOneClassifier):
    """One-vs-one multiclass classifier with optional per-pair resampling.

    One copy of ``estimator`` is fitted for every pair of classes. Before
    fitting, each pair's training data can be resampled with SMOTE or
    SMOTEENN.

    Parameters
    ----------
    estimator : base estimator fitted on every pair of classes.
    n_jobs : int, default 1
        Number of jobs given to ``Parallel`` when fitting the pairs.
    sampling : None, 'SMOTE' or 'SMOTEENN', default None
        Resampling applied to each pairwise training set; ``None`` disables
        resampling.
    strategy : str, default 'vote'
        How pairwise outputs are combined. ``fit`` accepts 'vote',
        'weighted_vote', 'dynamic_vote' and 'relative_competence', but only
        'vote' is implemented in ``decision_function``.
    verbose : bool, default False
        Forwarded to the per-pair fitting helper.
    """

    def __init__(self, estimator, n_jobs=1, sampling=None, strategy='vote', verbose=False):
        self.estimator = estimator
        self.n_jobs = n_jobs
        self.sampling = sampling
        self.verbose = verbose
        self.strategy = strategy

    def predict_proba(self, X):
        """Return per-class scores for ``X``.

        NOTE: this simply returns ``decision_function(X)``. The values are
        not normalised probabilities (they may be negative or exceed 1), so
        they are only suitable for ranking classes.
        """
        return self.decision_function(X)

    def fit(self, X, y):
        """Fit one binary estimator per pair of classes found in ``y``.

        Raises
        ------
        ValueError
            If ``strategy`` or ``sampling`` is set to an unknown value
            (``None`` is not rejected here for either of them).

        Returns
        -------
        self
        """
        if self.strategy is not None:
            valid_presets = ('vote', 'weighted_vote', 'dynamic_vote', 'relative_competence')
            if self.strategy not in valid_presets:
                # 1-tuple so that a tuple-valued strategy is still formatted
                # instead of breaking the % operator.
                raise ValueError('Strategy %s is not valid. '
                                 'Allowed values are: vote, weighted_vote,'
                                 ' dynamic_vote and relative_competence.'
                                 % (self.strategy,))

        if self.sampling is not None:
            valid_presets = ('SMOTE', 'SMOTEENN')
            if self.sampling not in valid_presets:
                raise ValueError('Sampling %s is not valid. '
                                 'Allowed values are: SMOTE, SMOTEENN.'
                                 % (self.sampling,))

        y = np.asarray(y)

        self.classes_ = np.unique(y)
        n_classes = self.classes_.shape[0]

        # One estimator per unordered pair (i < j) of classes.
        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_ovo_binary)(
                self.estimator, X, y, self.classes_[i], self.classes_[j],
                sampling=self.sampling, verbose=self.verbose
            ) for i in range(n_classes) for j in range(i + 1, n_classes))

        return self

    def decision_function(self, X):
        """Combine the pairwise estimators into per-class scores.

        Raises
        ------
        NotImplementedError
            For the strategies 'weighted_vote', 'dynamic_vote' and
            'relative_competence', which are accepted by ``fit`` but not
            implemented yet.
        ValueError
            For any other strategy than 'vote' (including ``None``).
            Previously this case silently returned ``None``.

        Returns
        -------
        The result of ``_ovr_decision_function`` applied to the pairwise
        predictions and confidences (presumably one column per class --
        confirm against the scikit-learn version in use).
        """
        check_is_fitted(self, 'estimators_')

        # Reject unusable strategies before running every pairwise
        # estimator on X.
        #   weighted_vote: compute a matrix for each confidence and flatten
        #                  the confidences along axis 1 with a sum.
        #   dynamic_vote / relative_competence: no design yet.
        if self.strategy in ('weighted_vote', 'dynamic_vote', 'relative_competence'):
            raise NotImplementedError('Strategy %s not implemented.'
                                      % (self.strategy,))
        if self.strategy != 'vote':
            raise ValueError('Strategy %s is not valid. '
                             'Allowed values are: vote, weighted_vote,'
                             ' dynamic_vote and relative_competence.'
                             % (self.strategy,))

        predictions = np.vstack([est.predict(X) for est in self.estimators_]).T
        confidences = np.vstack([_predict_binary(est, X) for est in self.estimators_]).T

        return _ovr_decision_function(predictions, confidences,
                                      len(self.classes_))