In [1]:
# import libraries
import numpy as np
from sklearn.model_selection import train_test_split

# Fake dataset

In [2]:
# Build a 10x4 integer matrix by broadcasting: row r (1..10) holds 10*r + [1, 2, 3, 4],
# so every value encodes its row (tens digit) and column (ones digit).
row_offsets = 10 * np.arange(1, 11)
col_values = np.arange(1, 5)
fakedata = col_values[np.newaxis, :] + row_offsets[:, np.newaxis]

# Boolean labels: first five rows are False, last five are True.
fakelabels = np.arange(10) >= 5

print(fakedata)
print(' ')
print(fakelabels)

[[ 11  12  13  14]
 [ 21  22  23  24]
 [ 31  32  33  34]
 [ 41  42  43  44]
 [ 51  52  53  54]
 [ 61  62  63  64]
 [ 71  72  73  74]
 [ 81  82  83  84]
 [ 91  92  93  94]
 [101 102 103 104]]
 
[False False False False False  True  True  True  True  True]


# Using train_test_split

In [3]:
# Target proportions for the three partitions, in the order train, devset, test.
partitions = [0.8, 0.1, 0.1]

# First pass: carve off the training rows. Everything else lands in the
# temporary "temp_*" variables, which are split again below.
(train_data, temp_data,
 train_labels, temp_label) = train_test_split(
    fakedata, fakelabels, train_size=partitions[0])

# Second pass: divide the temporary rows between devset and test.
# The devset share must be expressed relative to the leftover rows
# (devset / (devset + test)), not relative to the full dataset.
split = partitions[1] / np.sum(partitions[1:])
(dev_data, test_data,
 dev_labels, test_labels) = train_test_split(
    temp_data, temp_label, train_size=split)

# Report the shape of each partition
print(f'Training set size: {train_data.shape}')
print(f'Dev set size: {dev_data.shape}')
print(f'Test set size: {test_data.shape}')
print('-------------------------------------------')
# Show the rows that ended up in each partition (train, devset, test)
for subset in (train_data, dev_data, test_data):
    print(subset, '\n')

Training set size: (8, 4)
Dev set size: (1, 4)
Test set size: (1, 4)
-------------------------------------------
[[ 41  42  43  44]
 [ 71  72  73  74]
 [ 21  22  23  24]
 [ 11  12  13  14]
 [ 51  52  53  54]
 [ 61  62  63  64]
 [ 31  32  33  34]
 [101 102 103 104]] 

[[91 92 93 94]] 

[[81 82 83 84]] 



# Splitting the data manually using numpy

In [4]:
# Partition sizes in proportion (order: train, devset, test)
partitions = np.array([0.8, 0.1, 0.1])

# Convert the proportions into cumulative row boundaries:
# partition k covers shuffled positions [bound[k-1], bound[k]).
n_samples = len(fakelabels)
partition_bound = np.cumsum(partitions * n_samples).astype(int)

# astype(int) truncates toward zero, so floating-point round-off in the running
# sum (e.g. 15.999999999999998 instead of 16) would leave the last boundary one
# short and silently drop a sample. When the proportions are meant to cover the
# whole dataset, pin the final boundary to the true sample count.
if np.isclose(partitions.sum(), 1):
    partition_bound[-1] = n_samples
print(f'Partition boundaries: {partition_bound}')

# Random indices: a shuffled ordering of 0..n_samples-1. Passing the integer
# directly is equivalent to permuting range(n_samples).
# NOTE: no seed is set, so the ordering differs on every run.
rand_indices = np.random.permutation(n_samples)
print(f'Randomized data indices: {rand_indices}')

Partition boundaries: [ 8  9 10]
Randomized data indices: [8 3 1 5 6 9 2 0 4 7]


In [6]:
# Cut the shuffled index vector at the partition boundaries once, then reuse
# each slice for both the data rows and the matching labels so the two can
# never get out of step.
train_idx = rand_indices[:partition_bound[0]]
dev_idx = rand_indices[partition_bound[0]:partition_bound[1]]
test_idx = rand_indices[partition_bound[1]:partition_bound[2]]

# Training partition
train_dataN, train_labelsN = fakedata[train_idx], fakelabels[train_idx]

# Devset partition
dev_dataN, dev_labelsN = fakedata[dev_idx], fakelabels[dev_idx]

# Test partition
test_dataN, test_labelsN = fakedata[test_idx], fakelabels[test_idx]

In [7]:
# Report the shape of each manually built partition, followed by a divider
print(
    f'Training set size: {train_dataN.shape}',
    f'Dev set size: {dev_dataN.shape}',
    f'Test set size: {test_dataN.shape}',
    '-------------------------------------------',
    sep='\n',
)
# Show the rows that ended up in each partition (train, devset, test)
for subset in (train_dataN, dev_dataN, test_dataN):
    print(subset, '\n')

Training set size: (8, 4)
Dev set size: (1, 4)
Test set size: (1, 4)
-------------------------------------------
[[ 91  92  93  94]
 [ 41  42  43  44]
 [ 21  22  23  24]
 [ 61  62  63  64]
 [ 71  72  73  74]
 [101 102 103 104]
 [ 31  32  33  34]
 [ 11  12  13  14]] 

[[51 52 53 54]] 

[[81 82 83 84]] 

