Source article: https://machinelearningmastery.com/undersampling-algorithms-for-imbalanced-classification/

## Libraries

In [1]:
from collections import Counter

from sklearn.datasets import make_classification

from matplotlib import pyplot

from numpy import where

import imblearn

Using TensorFlow backend.


## [ 01 ] Methods that select examples to keep

### [ 01.1 ] Near Miss Undersampling

In [13]:
from imblearn.under_sampling import NearMiss

In [17]:
# Synthetic imbalanced dataset: 10,000 samples with 2 features (none redundant),
# one cluster per class, about 99% of samples in class 0 (weights=[0.99]),
# no label noise (flip_y=0) and a fixed seed so the data is reproducible.
X, y = make_classification(
    n_samples=10000,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    weights=[0.99],
    flip_y=0,
    random_state=1,
)

In [18]:
# NearMiss sampler, variant 3, using a neighbourhood of 3 samples.
undersample = NearMiss(version=3, n_neighbors=3)

In [19]:
# Apply the sampler. X and y are overwritten with the resampled data, so
# re-run the data-generation cell before re-running this one.
X, y = undersample.fit_resample(X, y)

In [20]:
# Class distribution after undersampling (label -> number of samples).
Counter(y)

(200, 2)

### [ 01.2 ] Condensed Nearest Neighbor ( CNN )

In [49]:
from imblearn.under_sampling import CondensedNearestNeighbour

In [50]:
# Regenerate the synthetic imbalanced dataset (10,000 samples, 2 features,
# about 99% in class 0, no label noise, fixed seed) so this section starts
# from the full data rather than from a previously resampled X, y.
X, y = make_classification(
    n_samples=10000,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    weights=[0.99],
    flip_y=0,
    random_state=1,
)

In [51]:
# Condensed Nearest Neighbour sampler using a 1-nearest-neighbour rule.
# The algorithm picks its starting samples at random, so random_state is
# fixed to make the resulting class counts reproducible across runs.
undersample = CondensedNearestNeighbour(n_neighbors=1, random_state=1)

In [52]:
# Apply the sampler. X and y are overwritten with the resampled data, so
# re-run the data-generation cell before re-running this one.
X, y = undersample.fit_resample(X, y)

In [53]:
# Class distribution after undersampling (label -> number of samples).
Counter(y)

Counter({0: 188, 1: 100})

## [ 02 ] Methods that select examples to delete

### [ 02.1 ] Tomek Links for Undersampling

In [44]:
from imblearn.under_sampling import TomekLinks

In [45]:
# Regenerate the synthetic imbalanced dataset (10,000 samples, 2 features,
# about 99% in class 0, no label noise, fixed seed) so this section starts
# from the full data rather than from a previously resampled X, y.
X, y = make_classification(
    n_samples=10000,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    weights=[0.99],
    flip_y=0,
    random_state=1,
)

In [46]:
# Tomek Links sampler with the library's default settings.
undersample = TomekLinks()

In [47]:
# Apply the sampler. X and y are overwritten with the resampled data, so
# re-run the data-generation cell before re-running this one.
X, y = undersample.fit_resample(X, y)

In [48]:
# Class distribution after undersampling (label -> number of samples).
Counter(y)

Counter({0: 9874, 1: 100})

### [ 02.2 ] Edited Nearest Neighbors ( ENN )

In [39]:
from imblearn.under_sampling import EditedNearestNeighbours

In [40]:
# Regenerate the synthetic imbalanced dataset (10,000 samples, 2 features,
# about 99% in class 0, no label noise, fixed seed) so this section starts
# from the full data rather than from a previously resampled X, y.
X, y = make_classification(
    n_samples=10000,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    weights=[0.99],
    flip_y=0,
    random_state=1,
)

In [41]:
# Edited Nearest Neighbours sampler using a neighbourhood of 3 samples.
undersample = EditedNearestNeighbours(n_neighbors=3)

In [42]:
# Apply the sampler. X and y are overwritten with the resampled data, so
# re-run the data-generation cell before re-running this one.
X, y = undersample.fit_resample(X, y)

In [43]:
# Class distribution after undersampling (label -> number of samples).
Counter(y)

Counter({0: 9806, 1: 100})

## [ 03 ] Combinations of Keep and Delete Methods

### [ 03.1 ] One-Sided Selection ( OSS )

In [54]:
from imblearn.under_sampling import OneSidedSelection

In [55]:
# Regenerate the synthetic imbalanced dataset (10,000 samples, 2 features,
# about 99% in class 0, no label noise, fixed seed) so this section starts
# from the full data rather than from a previously resampled X, y.
X, y = make_classification(
    n_samples=10000,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    weights=[0.99],
    flip_y=0,
    random_state=1,
)

In [56]:
# One-Sided Selection sampler: 1-nearest-neighbour rule, 200 seed samples.
# The seed samples are drawn at random, so random_state is fixed to make the
# resulting class counts reproducible across runs.
undersample = OneSidedSelection(n_neighbors=1, n_seeds_S=200, random_state=1)

In [57]:
# Apply the sampler. X and y are overwritten with the resampled data, so
# re-run the data-generation cell before re-running this one.
X, y = undersample.fit_resample(X, y)

In [58]:
# Class distribution after undersampling (label -> number of samples).
Counter(y)

Counter({0: 1054, 1: 100})

### [ 03.2 ] Neighborhood Cleaning Rule ( NCR )

In [59]:
from imblearn.under_sampling import NeighbourhoodCleaningRule

In [60]:
# Regenerate the synthetic imbalanced dataset (10,000 samples, 2 features,
# about 99% in class 0, no label noise, fixed seed) so this section starts
# from the full data rather than from a previously resampled X, y.
X, y = make_classification(
    n_samples=10000,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    weights=[0.99],
    flip_y=0,
    random_state=1,
)

In [61]:
# Neighbourhood Cleaning Rule sampler: neighbourhood of 3 samples and a
# cleaning threshold of 0.5.
undersample = NeighbourhoodCleaningRule(
    n_neighbors=3,
    threshold_cleaning=0.5,
)

In [62]:
# Apply the sampler. X and y are overwritten with the resampled data, so
# re-run the data-generation cell before re-running this one.
X, y = undersample.fit_resample(X, y)

In [63]:
# Class distribution after undersampling (label -> number of samples).
Counter(y)

Counter({0: 9786, 1: 100})