In [5]:
import random
from typing import TypeVar, List, Tuple
X = TypeVar('X')  # generic type of the items being split


def split_data(data: List[X], prob: float) -> Tuple[List[X], List[X]]:
    """Randomly split data into fractions [prob, 1 - prob].

    A shallow copy of ``data`` is shuffled with ``random.shuffle`` and cut at
    index ``int(len(data) * prob)``; the list passed in is not reordered.

    Args:
        data: The items to split.
        prob: Fraction of the items that goes into the first part.

    Returns:
        A ``(first, second)`` tuple: the shuffled items before the cut point
        and the shuffled items from the cut point onwards.
    """
    shuffled = data[:]  # work on a copy so the caller's list stays untouched
    random.shuffle(shuffled)
    boundary = int(len(shuffled) * prob)  # index where the two parts meet
    first_part = shuffled[:boundary]
    second_part = shuffled[boundary:]
    return first_part, second_part

# Sanity check: a 75/25 split of 1000 items yields 750 and 250 items and,
# taken together, the two parts contain exactly the original items.
data = list(range(1000))
train, test = split_data(data, 0.75)
assert (len(train), len(test)) == (750, 250)
assert sorted([*train, *test]) == data

In [6]:
# For the case where there are paired input and output variables
Y = TypeVar('Y')


def train_test_split(xs: List[X],
                     ys: List[Y],
                     test_pct: float) -> Tuple[List[X], List[X], List[Y], List[Y]]:
    """Split paired inputs and outputs into aligned train and test sets.

    The positions ``0 .. len(xs) - 1`` are split with ``split_data`` and the
    same positions are then used to pick from both ``xs`` and ``ys``, so the
    pair ``(xs[i], ys[i])`` always ends up in the same set.

    Args:
        xs: Input values.
        ys: Output values; ``ys[i]`` is paired with ``xs[i]``.
        test_pct: Fraction of the pairs assigned to the test set.

    Returns:
        A tuple ``(x_train, x_test, y_train, y_test)``.

    Raises:
        ValueError: If ``xs`` and ``ys`` have different lengths.
    """
    # Reject misaligned data up front: without this check, surplus ys would
    # be silently ignored and missing ys would surface as an IndexError.
    if len(xs) != len(ys):
        raise ValueError(
            f"xs and ys must have the same length, got {len(xs)} and {len(ys)}"
        )

    idxs = list(range(len(xs)))
    train_idxs, test_idxs = split_data(idxs, 1 - test_pct)
    return ([xs[i] for i in train_idxs],  # x_train
            [xs[i] for i in test_idxs],   # x_test
            [ys[i] for i in train_idxs],  # y_train
            [ys[i] for i in test_idxs])   # y_test

# Sanity check: a 25% test split of 1000 (x, 2x) pairs gives 750 training and
# 250 test pairs, and every x is still matched with its own y in both sets.
xs = list(range(1000))
ys = [2 * x for x in xs]
x_train, x_test, y_train, y_test = train_test_split(xs, ys, 0.25)
assert len(x_train) == 750 and len(y_train) == 750
assert len(x_test) == 250 and len(y_test) == 250
assert all(2 * x == y for x, y in zip(x_train, y_train))
assert all(2 * x == y for x, y in zip(x_test, y_test))

Depois disso, podemos fazer algo como:

``` python
model = SomeKindOfModel()

x_train, x_test, y_train, y_test = train_test_split(xs, ys, 0.33)
model.train(x_train, y_train)

performance = model.test(x_test, y_test)
```

Se a performance nos dados de teste for ruim, apesar de boa nos dados de treino, provavelmente houve sobreajuste (overfitting) aos dados de treino.