In [None]:
import numpy as np
import pandas as pd

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
from sklearn.metrics import confusion_matrix, roc_curve
from sklearn.base import clone

## Data Loading

In [None]:
import zipfile
import pandas as pd

# Read train.csv straight out of the zip archive. Both the archive and the
# member file handle are closed when the with-blocks exit.
with zipfile.ZipFile('data/train.csv.zip') as data_archive:
    with data_archive.open('train.csv') as train_file:
        dataset = pd.read_csv(train_file, index_col='id')

## Analyzing Dataset

In [None]:
# Preview the first rows of the loaded table.
dataset.head()

* **id** -- anonymous identifier;
* **feat_1, ..., feat_93** -- anonymous features;
* **target** -- label

In [None]:
# Number of rows for each distinct value of the 'target' column.
dataset['target'].value_counts()

In [None]:
# Summary statistics, transposed so every described column becomes a row;
# show a reproducible random sample of 10 of those rows.
dataset.describe().T.sample(10, random_state=42)

Split the target values from the features. **LabelEncoder** transforms string labels into integers from $0$ to $K-1$, where $K$ is the number of classes.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Feature matrix: every column except 'target', as a float array.
X = dataset.drop(columns='target').to_numpy(dtype=float)

# Raw labels and their integer encoding; the fitted encoder is kept so the
# integer codes can be mapped back to the original labels.
target = dataset['target']
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(target)

Split the dataset into training and validation parts.

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 50/50 split with a fixed seed. `split` keeps the raw result,
# which is unpacked into the four arrays in the same statement.
train_X, test_X, train_y, test_y = split = train_test_split(
    X, y, test_size=0.5, random_state=42, stratify=y)

## Several Classes

There are two types of such problems:

* **Multiclass** -- **single** label for every object
    * Digit recognition, type of the product

* **Multilabel** -- possibly **several** labels for every element
    * tags, list of objects in a picture

Today we will mostly work with **Multiclass**.

## Meta Algorithms

Take a look at **LinearSVC**.
This model works with **binary** classification.

In [None]:
from sklearn.svm import LinearSVC
# Base linear SVM with a fixed seed; the multiclass wrappers below work on
# clones of it.
model = LinearSVC(random_state=0)

<img src="figures/ovr.png" title="one-vs-rest"/>

* **Fit**: for every $k \in \{1.. K\}$ fit a classifier
 $h_k$, which can separate $k$ from other labels
$y \neq k$;

```python
def fit(X, y):
    """Sketch of one-vs-rest fitting: one binary classifier per class.

    Relies on `classes` (the class labels) and `classifier` (an unfitted
    base estimator) being provided by the surrounding context.

    Returns the list of fitted binary classifiers, ordered like `classes`.
    """
    classifiers = []
    for label in classes:
        # 1 for the current class, 0 for every other class
        y_binary = np.where(y == label, 1, 0)
        classifiers.append(clone(classifier).fit(X, y_binary))
    return classifiers
```

* **Predict**: apply all $K$ classifiers for an element  $x$:
    * select the classifier with the biggest confidence

```python
def predict(X):
    """Sketch of one-vs-rest prediction.

    Each fitted binary classifier in `classifiers` scores every row with its
    positive-class probability; the class whose classifier scores highest wins.
    `classifiers` and `classes` come from the surrounding context.
    """
    # One column of positive-class scores per classifier.
    per_class_scores = [clf.predict_proba(X)[:, 1] for clf in classifiers]
    score_matrix = np.stack(per_class_scores, axis=1)

    best_column = np.argmax(score_matrix, axis=1)
    return classes[best_column]
```

In [None]:
from sklearn.multiclass import OneVsRestClassifier

# One-vs-rest wrapper around a fresh copy of the base model; n_jobs=-1 asks
# for all available cores when fitting the per-class classifiers.
ovr_classifier = OneVsRestClassifier(clone(model), n_jobs=-1)
ovr_classifier.fit(train_X, train_y)

## Confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Class predictions of the one-vs-rest model on the held-out half.
predict_y = ovr_classifier.predict(test_X)

# confusion_matrix is called as (true labels, predicted labels); wrapping it
# in a DataFrame gives the notebook's table rendering.
pd.DataFrame(confusion_matrix(test_y, predict_y))

In [None]:
# score() is scaled by 100 so it reads as a percentage.
print("Accuracy %.3f%%" % (100 * ovr_classifier.score(test_X, test_y)))

<img src="figures/ovo.png" title="one-vs-one"/>

* **Fit**: for every pair $i, j \in \{1.. K\}$, $i < j$,
fit a classifier $h_{ij}$, which separates $j$ from $i$;

```python
def fit(X, y):
    """Sketch of one-vs-one fitting: one binary classifier per pair of classes.

    Relies on `classes` (the class labels) and `classifier` (an unfitted
    base estimator) being provided by the surrounding context.

    Returns a dict mapping the index pair (i, j), i < j, to the classifier
    trained only on rows of classes[i] and classes[j].
    """
    classifiers, n_classes = {}, len(classes)
    for i in range(n_classes):
        for j in range(i + 1, n_classes):
            # Compare against the labels themselves, not their indices, so
            # the sketch also holds when labels are not 0..K-1.
            mask = (y == classes[i]) | (y == classes[j])

            # classes[j] -- 1, classes[i] -- 0
            y_ij = np.where(y[mask] == classes[j], 1, 0)
            classifiers[(i, j)] = clone(classifier).fit(X[mask], y_ij)
    return classifiers
```

* **Predict**: apply all $\frac12K (K-1)$ classifiers for an element $x$ and choose the class with the largest number of votes

```python
def predict(X):
    """Sketch of one-vs-one prediction by majority vote.

    Every pairwise classifier in `classifiers` (keyed by (i, j), i < j) votes
    for class i when it predicts 0 and for class j when it predicts 1; the
    class with the most votes is returned for each row.
    `classifiers` and `classes` come from the surrounding context.
    """
    # Sizes are derived here so the sketch does not depend on undefined names.
    n_samples, n_classes = len(X), len(classes)

    votes = np.zeros((n_samples, n_classes))
    for i in range(n_classes):
        for j in range(i + 1, n_classes):
            predicted = classifiers[(i, j)].predict(X)

            votes[predicted == 0, i] += 1
            votes[predicted == 1, j] += 1

    return classes[np.argmax(votes, axis=1)]
```

In [None]:
from sklearn.multiclass import OneVsOneClassifier

# One-vs-one wrapper around a fresh copy of the base model.
ovo_classifier = OneVsOneClassifier(clone(model))

# Train on the training half of the split.
ovo_classifier.fit(train_X, train_y)

In [None]:
# Class predictions of the one-vs-one model on the held-out half.
# Note: this reuses (overwrites) the name `predict_y`.
predict_y = ovo_classifier.predict(test_X)

# confusion_matrix is called as (true labels, predicted labels).
pd.DataFrame(confusion_matrix(test_y, predict_y))

In [None]:
# score() is scaled by 100 so it reads as a percentage.
print("Accuracy %.3f%%" % (100 * ovo_classifier.score(test_X, test_y)))

## Label Encoding

| Class | C_1 | C_2 | C_3 | C_4 | C_5 | C_6 | ... | C_L |
|:-----:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
| 0     |  1  |  1  |  0  |  0  |  0  |  0  | ... |  0  |
| 1     |  0  |  0  |  1  |  1  |  1  |  1  | ... |  0  |
| 2     |  1  |  0  |  0  |  1  |  0  |  0  | ... |  1  |
| ...   | ... | ... | ... | ... | ... | ... | ... | ... |
| 8     |  1  |  1  |  0  |  1  |  0  |  1  | ... |  1  |
| 9     |  0  |  1  |  1  |  1  |  0  |  0  | ... |  0  |

* **Preprocessing**: assign every label in $\{1..K\}$ a unique binary string of length $L$ -- the class **code**;
* **Fit**: train a binary ($0$/$1$) classifier for every position in the binary string;

```python
def fit(X, y):
    """Sketch of fitting with per-class binary codes.

    `code_book` is left as a placeholder (`...`). The lines below use it as
    an array indexed by class along axis 0 and by code position along the
    second axis.
    NOTE(review): `classes`, `classifier` and `n_code_size` are not defined in
    this snippet, and `classifiers` is not returned -- presumably all of that
    is supplied/kept by the surrounding context.
    """
    classifiers, n_classes = [], len(classes)
    code_book = ... 
                    

    # Map raw labels to integer indices, then look up each row's code.
    label_map = LabelEncoder().fit(y)
    class_index = label_map.transform(y)
    encoding = code_book.take(class_index, axis=0)

    # One binary problem per code position: the target is that position's bit.
    for i in range(n_code_size):
        y_i = encoding[:, i]
        classifiers.append(clone(classifier).fit(X, y_i))
```

## Imbalanced Classification

Let's take a deeper look at the one-vs-rest approach and do it manually.

In [None]:
# Class singled out for the manual one-vs-rest experiment.
target_class = 0

# Sizes of the "target" and "everything else" groups in the training labels.
target_class_count = np.sum(train_y == target_class)
others_count = np.sum(train_y != target_class)

In [None]:
# Group sizes and how many "other" rows there are per target-class row.
print('{} target elements'.format(target_class_count))
print('{} others'.format(others_count))
print('{:.2f} imbalanced ratio'.format(others_count/target_class_count))

$$L(y, t) = \max(0, 1 - t \cdot y)$$

$$Loss = \frac{n_1}{N}\sum_{i, y_i =1}L(1, f(x_i)) + \frac{n_{-1}}{N}\sum_{i, y_i=-1}L(-1, f(x_i))$$

In [None]:
# Fresh binary model; seeded like the base model earlier in the notebook so
# that re-running the notebook gives the same fit.
model = LinearSVC(random_state=0)

Select only one class 

In [None]:
# Binary problem: the label is True for rows of `target_class`, False otherwise.
model.fit(train_X, train_y==target_class)

In [None]:
# Binary predictions of the single-class model on the held-out half.
predictions = model.predict(test_X)

In [None]:
# Share of test rows where the prediction agrees with the "is target_class"
# indicator the model was trained on, scaled to a percentage to match the
# '%%' in the format string.
print('Accuracy %.3f%%' % (100 * np.mean(predictions == (test_y == target_class))))

In [None]:
# Split predictions by the true membership in `target_class` -- the same
# class the model was trained to detect.
positive_predictions = predictions[test_y == target_class]
negative_predictions = predictions[test_y != target_class]

In [None]:
# mean() of the negative subset is the share predicted positive, so
# 1 - mean is the share predicted negative; shown as a percentage.
print("Negative elements accuracy %.3f%%" % (100 * (1 - negative_predictions.mean())))

In [None]:
# mean() of the positive subset is the share predicted positive; shown as a
# percentage.
print("Positive elements accuracy %.3f%%" % (100 * positive_predictions.mean()))

In [None]:
# Raw decision-function scores (not thresholded labels) for the test rows.
decision_values = model.decision_function(test_X)

In [None]:
from sklearn.metrics import hinge_loss

In [None]:
# Hinge loss against the "is target_class" indicator -- the label the model
# was actually trained on.
print("Hinge loss %.3f" % hinge_loss(test_y == target_class, decision_values))

In [None]:
# Rows that are NOT `target_class`: their hinge term is max(0, 1 + f(x)).
negative_decision_values = decision_values[test_y != target_class]
hinge_negative = np.mean(np.maximum(0, 1 + negative_decision_values))
print("Negative elements hinge loss %.3f" % hinge_negative)

In [None]:
# Rows that ARE `target_class`: their hinge term is max(0, 1 - f(x)).
positive_decision_values = decision_values[test_y == target_class]
hinge_positive = np.mean(np.maximum(0, 1 - positive_decision_values))
print("Positive elements hinge loss %.3f" % hinge_positive)

## Resampling Strategies

In [None]:
# Use code below to install imbalanced-learn package or do it manually with conda install -c conda-forge imbalanced-learn  
!pip install --upgrade pip 
!pip install PyHamcrest
!pip install imbalanced-learn

In [None]:
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Random over-sampling draws rows at random, so seed it to keep the notebook
# reproducible under Restart & Run All.
balancer = RandomOverSampler(random_state=42)

In [None]:
# `fit_resample` is the current imbalanced-learn API (the older `fit_sample`
# alias has been removed).
# NOTE(review): this section hard-codes class 1, whereas `target_class` is 0 --
# confirm which class is intended before changing it here and in the
# evaluation cells below together.
balanced_train_x, balanced_train_y = balancer.fit_resample(train_X, train_y == 1)

In [None]:
# Refit a fresh binary model on the rebalanced training set; seeded like the
# base model earlier in the notebook for reproducibility.
model = LinearSVC(random_state=0)
model.fit(balanced_train_x, balanced_train_y)

In [None]:
# Binary predictions of the model trained on the rebalanced data.
predictions = model.predict(test_X)

In [None]:
# Share of test rows where the prediction agrees with the "is class 1"
# indicator, as a percentage.
print('Accuracy %.3f%%' % (100 * np.mean(predictions == (test_y == 1))))

In [None]:
# Split predictions by whether the true label is class 1.
# NOTE(review): class 1 is hard-coded here while `target_class` is 0 -- confirm.
positive_predictions = predictions[test_y == 1]
negative_predictions = predictions[test_y != 1]

In [None]:
# 1 - mean of the negative subset = share predicted negative, as a percentage.
print("Negative elements accuracy %.3f%%" % (100 * (1 - negative_predictions.mean())))

In [None]:
# mean of the positive subset = share predicted positive, as a percentage.
print("Positive elements accuracy %.3f%%" % (100 * positive_predictions.mean()))

## Rebalanced Multiclass Classification

In [None]:
from sklearn.base import BaseEstimator, ClassifierMixin

In [None]:
class BalancedLearner(BaseEstimator, ClassifierMixin):
    """Classifier wrapper that rebalances the training set before fitting.

    Parameters
    ----------
    classifier : estimator instance or estimator class
        Model trained on the resampled data. An instance is fitted in place;
        a class (such as the default) is instantiated with no arguments at
        fit time.
    balancer : sampler instance or sampler class
        Object exposing `fit_resample(X, y)` (or the older `fit_sample`).
        A class (such as the default) is instantiated with no arguments at
        fit time.

    Attributes set by `fit`
    -----------------------
    classifier_ : the fitted classifier
    classes_ : sorted unique labels seen in `y`
    """

    def __init__(self, classifier=LinearSVC, balancer=SMOTE):
        # Stored untouched so get_params()/clone() from BaseEstimator work.
        self.classifier = classifier
        self.balancer = balancer

    @staticmethod
    def _as_instance(component):
        """Return `component` itself, or a new instance if a class was given."""
        return component() if isinstance(component, type) else component

    def fit(self, X, y):
        """Resample (X, y) with the balancer, fit the classifier, return self."""
        balancer = self._as_instance(self.balancer)
        # imbalanced-learn renamed fit_sample to fit_resample; accept either.
        resample = getattr(balancer, 'fit_resample', None) or balancer.fit_sample
        balanced_X, balanced_y = resample(X, y)

        self.classifier_ = self._as_instance(self.classifier)
        self.classifier_.fit(balanced_X, balanced_y)
        self.classes_ = np.unique(y)
        return self

    def decision_function(self, X):
        """Decision scores of the fitted classifier for the rows of X."""
        return self.classifier_.decision_function(X)

    def predict(self, X):
        """Class predictions of the fitted classifier for the rows of X."""
        return self.classifier_.predict(X)

In [None]:
# SMOTE generates synthetic samples at random, so seed it to keep the
# notebook reproducible under Restart & Run All.
rebalancer = SMOTE(random_state=42)

In [None]:
# Wrap a fresh copy of the current `model` together with the SMOTE rebalancer.
model_b = BalancedLearner(clone(model), rebalancer)

In [None]:
# One-vs-rest over the rebalancing wrapper. Note: this reassigns the name
# `ovr_classifier`, replacing the earlier one-vs-rest model.
ovr_classifier = OneVsRestClassifier(clone(model_b))
ovr_classifier.fit(train_X, train_y)

In [None]:
# Predictions of the rebalanced one-vs-rest model on the held-out half,
# followed by their confusion matrix (true labels, predicted labels).
predict_y_balanced = ovr_classifier.predict(test_X)
pd.DataFrame(confusion_matrix(test_y, predict_y_balanced))

In [None]:
# Use the public IPython.display module; the `ICD` alias is kept so the calls
# below are unchanged.
from IPython import display as ICD

# Show both confusion matrices from one cell for comparison.
print('Balanced')
ICD.display(pd.DataFrame(confusion_matrix(test_y, predict_y_balanced)))
# NOTE(review): `predict_y` was last assigned from `ovo_classifier`, so the
# "Original" table is the one-vs-one model rather than the unbalanced
# one-vs-rest model -- confirm that this is the intended baseline.
print('Original')
ICD.display(pd.DataFrame(confusion_matrix(test_y, predict_y)))
