## Sformułowanie problemu

Rozważamy problem uczenia klasyfikatora danych sekwencyjnych, który ciągowi 
$ (x_1,\dots x_{T_0}) $ przyporządkowuje ciąg $(y_1,\dots y_{T_0})$, ale bez dostępu do podpisanego zbioru treningowego $\mathcal{D}_{XY} = \lbrace(x_1^n, \dots, x_{T_n}^n), (y_1^n, \dots,y_{T_n}^n) \ : \ n=1,\dots, M \rbrace $ (tutaj $T_n$ oznacza długość $n$-tego ciągu), a jedynie do:
* zbioru niepodpisanych danych: $\mathcal{D}_X = \lbrace(x_1^n, \dots, x_{T_n}^n)\ : \ n=1,\dots, M \rbrace$,
* modelu n-gram: $p_{LM}(i_1,\dots ,i_N) = p_{LM}(y_{t-N+1}^n=i_1, \dots, y_t^n=i^N)$,

gdzie $i_1, \dots, i_N$ są elementami sekwencji (np. słowami/literami), a subskrypt $LM$ oznacza model językowy (*Language Model*).

In [0]:
import os
import torch
import torchvision
import numpy as np

batch_size = 2
data_path = './data'

transform = torchvision.transforms.Compose(
    [torchvision.transforms.ToTensor(),
     torchvision.transforms.Normalize((0.1307,), (0.3081,)),
    ])

_test = torchvision.datasets.MNIST(
    data_path, train=False, download=True, transform=transform)

_train = torchvision.datasets.MNIST(
    data_path, train=True, download=True, transform=transform)
_train.train_data = _train.train_data[:5]
_train.train_labels = _train.train_labels[:5]

_valid = torchvision.datasets.MNIST(
    data_path, train=True, download=True, transform=transform)
_valid.train_data = _valid.train_data[50000:]
_valid.train_labels = _valid.train_labels[50000:]

mnist_loaders = {
    'train': torch.utils.data.DataLoader(
        _train, batch_size=batch_size, shuffle=True,
        pin_memory=True, num_workers=10),
    'valid': torch.utils.data.DataLoader(
        _valid, batch_size=batch_size, shuffle=False),
    'test': torch.utils.data.DataLoader(
        _test, batch_size=batch_size, shuffle=False)}

In [8]:
import torch
from torchvision import datasets, transforms
import random
import numpy as np
from random import randint as rint
from imageio import imwrite
import os
import datetime
import random
from matplotlib import pyplot as plt

class args:
    N = 10
    M = 10
    root_path = './dataset'


N = args.N # number of digits in the contiguous sequence
M = args.M # number of samples

data = datasets.MNIST('./MNIST', train=True, download=True,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,))
            ]))


dataset_data = np.zeros((M, 28, 0))
dataset_labels = np.zeros((M, 28, 0))
s = np.append(data.train_labels.view(-1,1,1).repeat(1,28,1).numpy(), data.train_data.numpy(), axis=2)
print(s.shape)
# print(s[0])
for i in range(N):
    p = np.random.permutation(s)[:M]
    d = p[:,:,1:]
    if i == 0:
        dataset_data = d
    else:
        dataset_data = np.append(dataset_data, d, axis=2)
    dataset_labels = np.append(dataset_labels, p[:,:,0:1], axis=2)

dataset_labels = dataset_labels[:,0,:]
print(dataset_labels.shape)
# Creates a dataset of 60000 (28*N + (N-1)*overlap) * 36 images
# containing N numbers in sequence and their labels
images = []
if not os.path.exists('./images'): os.makedirs('./images')
for i in range(M):
    img = np.zeros((28, 0))
    # probs = torch.Tensor(range(0, N + 1))
    if i == 0: print(img.shape)
    img = dataset_data[i,:,:]
    images.append(img)
    if False:
        name = './images/img_' + ''.join(map(lambda x: str(int(x)), dataset_labels[i])) + '.png'
        imwrite(name, img.clip(0, 255).astype('uint8'))

dataset_data = np.array(images) / 255.0

t = datetime.datetime.now().time()
if not os.path.exists(args.root_path): os.makedirs(args.root_path)
data_path = args.root_path + "data_" + str(N) + "_" + str(M) + ".npy"
np.save(data_path, dataset_data)
print("Saved: ", data_path)
label_path = args.root_path + "labels_" + str(N) + "_" + str(M) + ".npy"
np.save(label_path, dataset_labels)
print("Saved: ", label_path)

(60000, 28, 29)
(10, 10)
(28, 0)
Saved:  ./datasetdata_10_10.npy
Saved:  ./datasetlabels_10_10.npy


In [0]:
import torch
from torchvision import datasets, transforms
import random
import numpy as np
from random import randint as rint
from scipy.misc import imsave
# from PIL import Image
import os
import datetime
import random
from matplotlib import pyplot as plt

class args:
    N = 15
    M = 2
    root_path = './dataset'


N = args.N # number of digits in the contiguous sequence
M = args.M # number of samples

# space = range(200, 10000)
# overlap = range(15, 25) # bigger -> more overlapped
space = range(200, 201)
overlap = range(10, 11) # bigger -> more overlapped

data = datasets.MNIST('./MNIST', train=True, download=True,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,))
            ]))


dataset_data = np.zeros((M, N, 28, 28))
dataset_labels = np.zeros((M, N))

for i in range(M):
    p = np.random.choice(np.arange(60000), size=N)
    print(p)
    dataset_data[i,:,:,:] = data.train_data.numpy()[p] 
    dataset_labels[i,:] = data.train_labels.numpy()[p]

    
images = []
if not os.path.exists('./images'): os.makedirs('./images')
for i in range(M):
    img = np.zeros((28, 0))
    # probs = torch.Tensor(range(0, N + 1))
    for j in range(N):
        img = np.append(img, dataset_data[i,j,:,:], axis=1)
    images.append(img)
    name = './images/img_' + ''.join(map(lambda x: str(int(x)), dataset_labels[i])) + '.png'
    imsave(name, img.clip(0, 255))

dataset_data = np.array(images) / 255.0

if not os.path.exists(args.root_path): os.makedirs(args.root_path)
data_path = args.root_path + "/data_" + str(N) + "_" + str(M) + ".npy"
np.save(data_path, dataset_data)
print("Saved: ", data_path)
label_path = args.root_path + "/labels_" + str(N) + "_" + str(M) + ".npy"
np.save(label_path, dataset_labels)
print("Saved: ", label_path)

[ 4061 36119 20219 14537 21177 18056 42388 50108 45059  6006 14678 56444
 37762 46456 56450]
[16803 15269 42161 35700 47915 52922  9290 23722 54513 32684 34808 55108
  2420 19423 30286]


`imsave` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``imageio.imwrite`` instead.


Saved:  ./dataset/data_15_2.npy
Saved:  ./dataset/labels_15_2.npy
