# Supervised Learning

In [1]:
# %load utils/imports.py
%matplotlib inline

import numpy as np
import pandas as pd

from utils.styles import *

# Balancing Datasets

In [74]:
def balance_plot(df,labels='index',values='categories'):
    value_counts = df[values].value_counts()
    balanced = len(set(value_counts)) == 1
    title= "{} Dataset".format(["Unbalanced","Balanced"][balanced])
    n = value_counts.sum()
    value_counts.reset_index().iplot(
        kind='pie',labels=labels,values=values,
        title='<b>{}</b><br>{} records'.format(title, n),
        theme='white')

In [75]:
# Number of records
n = 100
# Generate sample data
df = cf.datagen.bubble(n_categories=2,n=int(n/2))
# Create unbalanced classes
df.categories = (np.random.rand(n) > 0.7).astype(int)

In [76]:
balance_plot(df)

In [121]:
def under_sample(df,labels="categories"):
    """Only works for binary classes"""
    value_counts = df[labels].value_counts()
    
    max_cls = value_counts.idxmax()
    min_cls = value_counts.idxmin()
    min_size = value_counts[min_cls]
    
    min_idx = df[df[labels]== min_cls].index
    max_idx = df[df[labels] == max_cls].sample(min_size).index
    new_idx = np.r_[min_idx, max_idx]
    
    return df.loc[new_idx]

In [122]:
balance_plot(under_sample(df))

In [125]:
def over_sample(df,labels="categories"):
    """Only works for binary classes"""
    value_counts = df[labels].value_counts()
    
    max_cls = value_counts.idxmax()
    min_cls = value_counts.idxmin()
    max_size = value_counts[max_cls]
    
    min_idx = df[df[labels] == min_cls].sample(max_size, replace=True).index
    max_idx = df[df[labels]== max_cls].index 
    new_idx = np.r_[min_idx, max_idx]
    
    return df.loc[new_idx]

In [126]:
balance_plot(over_sample(df))

### Resampling

Scikit-learn provides us with a simple way of performing resampling on our datasets through their [`cross-validation module`](http://scikit-learn.org/stable/modules/classes.html#module-sklearn.cross_validation).

Cross-validation is simply a way of breaking up your features into training and test sets. There are a number of different cross-validation tools provided. The simplest is [`KFold`](http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.KFold.html#sklearn.cross_validation.KFold) which simply breaks your training and test sets into `k`-folds or groups of training and test sets.

Another common tool that is useful for resolving unbalanced datasets is the [`StratifiedKFold`](http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedKFold.html#sklearn.cross_validation.StratifiedKFold). Stratified K-Folds cross-validation also splits your data into `k`-folds, but it ensures that the folds are made by preserving the percentage of samples from **each class**. In other words, if you have a 4:1 split between your classes it will ensure that each fold has a 4:1 split instead of some folds possibly getting a 10:1 split and some getting a 1:1 split.

In [221]:
from sklearn.cross_validation import KFold, StratifiedKFold

In [222]:
def print_folds(df, sampler, labels='categories', verbose=True, graph=False):
    """Print each of the folds from a SciKit-Learn sampling method"""
    for idx, (train_index, test_index) in enumerate(list(sampler)):
        if verbose:
            print("TRAIN:", train_index, "TEST:", test_index)
            print("Y@TRAIN:", df.ix[train_index,labels].values,
                  "Y@TEST:", df.ix[test_index,labels].values)
        if graph:
            print('Fold {}'.format(idx))
            balance_plot(df.iloc[train_index])
        else:
            train = df.iloc[train_index]
            print('TRAIN - Fold {}'.format(idx))
            display(train)
            test  = df.iloc[test_index]
            print('TEST - Fold {}'.format(idx))
            display(test)

#### KFold

In [230]:
n_folds = 3
n = 5

sample_kfold = KFold(n=n, n_folds=n_folds, shuffle=True, random_state=0)

In [231]:
print_folds(df, sample_kfold)

TRAIN: [1 3 4] TEST: [0 2]
Y@TRAIN: [1 1 0] Y@TEST: [0 0]
TRAIN - Fold 0


Unnamed: 0,categories,size,text,x,y
1,1,70,OSX.NC,-0.224588,-0.586702
3,1,31,KAD.XY,-0.492182,0.791428
4,0,77,JHC.DF,-0.334214,0.961898


TEST - Fold 0


Unnamed: 0,categories,size,text,x,y
0,0,21,BIX.WD,0.832503,-0.103626
2,0,64,WTR.NP,-0.720996,-0.048732


TRAIN: [0 2 4] TEST: [1 3]
Y@TRAIN: [0 0 0] Y@TEST: [1 1]
TRAIN - Fold 1


Unnamed: 0,categories,size,text,x,y
0,0,21,BIX.WD,0.832503,-0.103626
2,0,64,WTR.NP,-0.720996,-0.048732
4,0,77,JHC.DF,-0.334214,0.961898


TEST - Fold 1


Unnamed: 0,categories,size,text,x,y
1,1,70,OSX.NC,-0.224588,-0.586702
3,1,31,KAD.XY,-0.492182,0.791428


TRAIN: [0 1 2 3] TEST: [4]
Y@TRAIN: [0 1 0 1] Y@TEST: [0]
TRAIN - Fold 2


Unnamed: 0,categories,size,text,x,y
0,0,21,BIX.WD,0.832503,-0.103626
1,1,70,OSX.NC,-0.224588,-0.586702
2,0,64,WTR.NP,-0.720996,-0.048732
3,1,31,KAD.XY,-0.492182,0.791428


TEST - Fold 2


Unnamed: 0,categories,size,text,x,y
4,0,77,JHC.DF,-0.334214,0.961898


#### StratifiedKFold

In [232]:
y = df.categories
sample_strat_kfold = StratifiedKFold(y=y, n_folds=n_folds, shuffle=True, random_state=0)

Let's first have another look at what KFold could end up doing

In [233]:
print_folds(df, sample_kfold, verbose=False, graph=True)

Fold 0


Fold 1


Fold 2


Whoops! That's one fold which is going to be trained on a single class - not very helpful in learning anything about the second class.

With Stratified KFolds this is resolved.

In [234]:
print_folds(df, sample_strat_kfold, verbose=False, graph=True)

Fold 0


Fold 1


Fold 2
