## Creating Train and Test Sets by Splitting the Data into Learn and Test Sets

In [1]:
import numpy as np
from sklearn.datasets import load_iris

# Load the Iris dataset that ships with scikit-learn. The cells below read
# the feature matrix from `iris.data` and the class labels from `iris.target`.
iris = load_iris()

In [2]:
# Display the integer class labels. The output lists them grouped by class
# (all 0s, then all 1s, then all 2s), so taking the head/tail of the data
# without shuffling first would not give a mix of all three classes.
iris.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [3]:
# Shuffle the sample indices so that a later head/tail split mixes the classes.
# A seeded Generator makes the permutation identical on every
# "Restart Kernel -> Run All"; the unseeded np.random.permutation used before
# produced a different split (and different printed output) on each run.
rng = np.random.default_rng(42)
indices = rng.permutation(len(iris.data))
indices

array([ 18,  28,  86,  93, 102,  30,  56,  43,  46,  98,  83, 141,  88,
       120,  53,  27,   9,  95, 106, 134, 116,   6, 121, 101,  64, 139,
       110, 123,  60,  66,  80,  85,  71, 138,  62,  59,  76,  49,  50,
        65,  13,  47, 119,  57,  16,  70,  41, 140, 142,  63, 132,  29,
        58,  99, 130, 148,  81,  15, 117,  25, 126,  90,  94, 122,  14,
       146,  52,  78,  35,   8,  42,  39,  38,   7, 118,  23,  33, 111,
        97, 125,  34,  17,   4,  61,  54, 124, 107,  69,  72,  21,  10,
        32, 131,  67,  84,  20,  87,  48, 103, 109,   0,   3,  82, 112,
        77,  79,  31, 113,   1,  36,  19, 144, 143,  75,  44, 108,  45,
        92, 129,   2, 114,  96, 145,  12, 128,   5, 127, 105,  40, 137,
        74,  11,  22, 100, 149,  37, 133,  68,  91, 104,  26,  24,  73,
       135,  55, 115, 136,  51,  89, 147])

In [4]:
# Manual learn/test split: the last `n_test_samples` shuffled indices form the
# test set, everything before them forms the learn set.
n_test_samples = 12
assert 0 <= n_test_samples <= len(indices), "n_test_samples out of range"

# Split at an explicit position instead of slicing with `-n_test_samples`:
# for n_test_samples == 0, `indices[:-0]` is empty and `indices[-0:]` is the
# whole array, which would silently swap the learn and test sets.
split_at = len(indices) - n_test_samples
learn_indices = indices[:split_at]
test_indices = indices[split_at:]

learnset_data = iris.data[learn_indices]
learnset_labels = iris.target[learn_indices]

testset_data = iris.data[test_indices]
testset_labels = iris.target[test_indices]

# Peek at the first four samples (features and labels) of each set.
print(learnset_data[:4], learnset_labels[:4])
print(testset_data[:4], testset_labels[:4])

[[5.7 3.8 1.7 0.3]
 [5.2 3.4 1.4 0.2]
 [6.7 3.1 4.7 1.5]
 [5.  2.3 3.3 1. ]] [0 0 1 1]
[[6.1 3.  4.6 1.4]
 [6.5 3.  5.8 2.2]
 [5.  3.4 1.6 0.4]
 [4.8 3.4 1.9 0.2]] [1 2 0 0]


### Splits with Sklearn

In [6]:
from sklearn.model_selection import train_test_split

# Start again from a fresh copy of the dataset.
iris = load_iris()
data = iris.data
labels = iris.target

# 80 % of the samples go to training, 20 % to testing; the fixed
# random_state keeps the split the same from run to run.
res = train_test_split(
    data,
    labels,
    test_size=0.2,
    train_size=0.8,
    random_state=42,
)
train_data, test_data, train_labels, test_labels = res

# Show the first few test samples together with their labels.
n = 7
print(f'The first {n} data sets:', test_data[:n], sep='\n')
print(f'The Corresponding {n} labels:', test_labels[:n], sep='\n')

The first 7 data sets:
[[6.1 2.8 4.7 1.2]
 [5.7 3.8 1.7 0.3]
 [7.7 2.6 6.9 2.3]
 [6.  2.9 4.5 1.5]
 [6.8 2.8 4.8 1.4]
 [5.4 3.4 1.5 0.4]
 [5.6 2.9 3.6 1.3]]
The Corresponding 7 labels:
[1 0 2 1 1 0 1]


## Stratified Random Sample

In [7]:
# Percentage of samples per class in the full data, the training set and the
# test set, to see how evenly the classes ended up in each part.
for title, subset_labels in (
    ('All:', labels),
    ('Training:', train_labels),
    ('Test:', test_labels),
):
    class_share = np.bincount(subset_labels) / float(len(subset_labels)) * 100.0
    print(title, class_share)

All: [33.33333333 33.33333333 33.33333333]
Training: [33.33333333 34.16666667 32.5       ]
Test: [33.33333333 30.         36.66666667]


In [8]:
# Reload the dataset and repeat the 80/20 split, this time stratified on the
# labels so each part keeps the class proportions of the full data.
iris = load_iris()
data = iris.data
labels = iris.target

res = train_test_split(
    data,
    labels,
    stratify=labels,
    test_size=0.2,
    train_size=0.8,
    random_state=42,
)
train_data, test_data, train_labels, test_labels = res

# Percentage of samples per class in each part.
for title, subset_labels in (
    ('All:', labels),
    ('Training:', train_labels),
    ('Test:', test_labels),
):
    class_share = np.bincount(subset_labels) / float(len(subset_labels)) * 100.0
    print(title, class_share)

All: [33.33333333 33.33333333 33.33333333]
Training: [33.33333333 33.33333333 33.33333333]
Test: [33.33333333 33.33333333 33.33333333]
