In [467]:
import torch
import torch.nn as nn

import pprint

pp = pprint.PrettyPrinter()

In [468]:
list_of_lists = [
    [1, 2, 3],
    [4, 5, 6],
]
pp.pprint(list_of_lists)
print(list_of_lists)

[[1, 2, 3], [4, 5, 6]]
[[1, 2, 3], [4, 5, 6]]


In [469]:
data = torch.tensor(list_of_lists)
pp.pprint(data)

data = torch.tensor([[0, 1], [2, 3], [4, 5]])
pp.pprint(data)

tensor([[1, 2, 3],
        [4, 5, 6]])
tensor([[0, 1],
        [2, 3],
        [4, 5]])


In [470]:
data = torch.tensor([[0, 1], [2, 3], [4, 5]], dtype=torch.float32)
pp.pprint(data)

tensor([[0., 1.],
        [2., 3.],
        [4., 5.]])


In [471]:
zeros = torch.zeros(2, 5, dtype=torch.int16)
pp.pprint(zeros)

tensor([[0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0]], dtype=torch.int16)


In [472]:
rr = torch.arange(1, 10)
pp.pprint(rr)

tensor([1, 2, 3, 4, 5, 6, 7, 8, 9])


In [473]:
rr + 2

tensor([ 3,  4,  5,  6,  7,  8,  9, 10, 11])

In [474]:
rr * 2

tensor([ 2,  4,  6,  8, 10, 12, 14, 16, 18])

In [475]:
a = torch.tensor([[1, 2], [2, 3], [4, 5]])  # (3, 2)
b = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 8]])  # (2, 4)

print("A is", a)
print("B is", b)
print("The product is", a.matmul(b))  # (3, 4)
print("The other product is", a @ b)  # +, -, *, @

A is tensor([[1, 2],
        [2, 3],
        [4, 5]])
B is tensor([[1, 2, 3, 4],
        [5, 6, 7, 8]])
The product is tensor([[11, 14, 17, 20],
        [17, 22, 27, 32],
        [29, 38, 47, 56]])
The other product is tensor([[11, 14, 17, 20],
        [17, 22, 27, 32],
        [29, 38, 47, 56]])


In [476]:
matr_3d = torch.tensor(
    [
        [[1, 2, 3, 4], [-2, 5, 6, 9]],
        [[5, 6, 7, 2], [8, 9, 10, 4]],
        [[-3, 2, 2, 1], [4, 6, 5, 9]],
    ]
)
pp.pprint(matr_3d)
pp.pprint(matr_3d.shape)

tensor([[[ 1,  2,  3,  4],
         [-2,  5,  6,  9]],

        [[ 5,  6,  7,  2],
         [ 8,  9, 10,  4]],

        [[-3,  2,  2,  1],
         [ 4,  6,  5,  9]]])
torch.Size([3, 2, 4])


In [477]:
rr = torch.arange(1, 16)
print("The shape is currently", rr.shape)
print("The contents are currently", rr)
print()
rr = rr.view(5, 3)
print("After reshaping, the shape is currently", rr.shape)
print("The contents are currently", rr)

The shape is currently torch.Size([15])
The contents are currently tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15])

After reshaping, the shape is currently torch.Size([5, 3])
The contents are currently tensor([[ 1,  2,  3],
        [ 4,  5,  6],
        [ 7,  8,  9],
        [10, 11, 12],
        [13, 14, 15]])


In [478]:
data = torch.arange(1, 36, dtype=torch.float32).reshape(5, 7)
print("Data is:", data)

# We can perform operations like *sum* over each row...
print("Taking the sum over rows:")
print(data.sum(dim=1))  # (5,)

# or over each column.
print("Taking thep sum over columns:")
print(data.sum(dim=0))  # (7,)

# Other operations are available:
print("Taking the stdev over rows:")
print(data.std(dim=1))

# Other operations are available:
print("Taking the stdev over columns:")
print(data.std(dim=0))

Data is: tensor([[ 1.,  2.,  3.,  4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11., 12., 13., 14.],
        [15., 16., 17., 18., 19., 20., 21.],
        [22., 23., 24., 25., 26., 27., 28.],
        [29., 30., 31., 32., 33., 34., 35.]])
Taking the sum over rows:
tensor([ 28.,  77., 126., 175., 224.])
Taking thep sum over columns:
tensor([ 75.,  80.,  85.,  90.,  95., 100., 105.])
Taking the stdev over rows:
tensor([2.1602, 2.1602, 2.1602, 2.1602, 2.1602])
Taking the stdev over columns:
tensor([11.0680, 11.0680, 11.0680, 11.0680, 11.0680, 11.0680, 11.0680])


In [479]:
data = torch.tensor([[1, 2.2, 9.6], [4, -7.2, 6.3]])
data_row_mean = data.mean(dim=1)
data_col_mean = data.mean(dim=0)
pp.pprint(data_row_mean)
pp.pprint(data_row_mean.shape)
pp.pprint(data_col_mean)
pp.pprint(data_col_mean.shape)

tensor([4.2667, 1.0333])
torch.Size([2])
tensor([ 2.5000, -2.5000,  7.9500])
torch.Size([3])


In [480]:
x = torch.tensor([2.0], requires_grad=True)

pp.pprint(x.grad)

None


In [481]:
y = x * x * 3
y.backward()
pp.pprint(x.grad)

tensor([12.])


In [482]:
z = x * x * 3
z.backward()
pp.pprint(x.grad)

tensor([24.])


In [483]:
import torch.nn as nn

In [484]:
# Create the inputs
input = torch.ones(2, 3, 4)
# N* H_in -> N*H_out


# Make a linear layers transforming N,*,H_in dimensinal inputs to N,*,H_out
# dimensional outputs
linear = nn.Linear(4, 3)
linear_output = linear(input)
linear_output

tensor([[[-0.0392,  0.3787,  0.1040],
         [-0.0392,  0.3787,  0.1040],
         [-0.0392,  0.3787,  0.1040]],

        [[-0.0392,  0.3787,  0.1040],
         [-0.0392,  0.3787,  0.1040],
         [-0.0392,  0.3787,  0.1040]]], grad_fn=<ViewBackward0>)

In [485]:
A = torch.arange(2 * 3 * 4).reshape((2, 3, 4))
B = torch.arange(2 * 5 * 4 * 5).reshape((5, 2, 4, 5))
C = torch.arange(20 * 5 * 5 * 2 * 4 * 6).reshape((20, 5, 5, 2, 4, 6))
D = torch.arange(20 * 5 * 5 * 2 * 6 * 10).reshape((20, 5, 5, 2, 6, 10))
torch.matmul(A, B).shape

torch.Size([5, 2, 3, 5])

In [486]:
list(linear.parameters())

[Parameter containing:
 tensor([[ 0.0800, -0.1673,  0.0630,  0.1961],
         [ 0.4169, -0.3202,  0.2804, -0.1184],
         [ 0.0061,  0.1198,  0.4542, -0.4934]], requires_grad=True),
 Parameter containing:
 tensor([-0.2111,  0.1199,  0.0173], requires_grad=True)]

In [487]:
linear_output

tensor([[[-0.0392,  0.3787,  0.1040],
         [-0.0392,  0.3787,  0.1040],
         [-0.0392,  0.3787,  0.1040]],

        [[-0.0392,  0.3787,  0.1040],
         [-0.0392,  0.3787,  0.1040],
         [-0.0392,  0.3787,  0.1040]]], grad_fn=<ViewBackward0>)

In [488]:
relu = nn.LeakyReLU()
output = relu(linear_output)
output

tensor([[[-0.0004,  0.3787,  0.1040],
         [-0.0004,  0.3787,  0.1040],
         [-0.0004,  0.3787,  0.1040]],

        [[-0.0004,  0.3787,  0.1040],
         [-0.0004,  0.3787,  0.1040],
         [-0.0004,  0.3787,  0.1040]]], grad_fn=<LeakyReluBackward0>)

In [489]:
block = nn.Sequential(nn.Linear(4, 2), nn.Sigmoid())

input = torch.ones(2, 3, 4)
output = block(input)
output

tensor([[[0.4892, 0.4614],
         [0.4892, 0.4614],
         [0.4892, 0.4614]],

        [[0.4892, 0.4614],
         [0.4892, 0.4614],
         [0.4892, 0.4614]]], grad_fn=<SigmoidBackward0>)

In [490]:
class MultilayerPerceptron(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(MultilayerPerceptron, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size

        self.model = nn.Sequential(
            nn.Linear(self.input_size, self.hidden_size),
            nn.ReLU(),
            nn.Linear(self.hidden_size, self.input_size),
            nn.Sigmoid(),
        )

    def forward(self, x):
        output = self.model(x)
        return output


input = torch.randn(2, 5)
model = MultilayerPerceptron(5, 3)

model(input)

tensor([[0.4599, 0.4472, 0.3817, 0.5377, 0.5676],
        [0.4556, 0.4115, 0.3895, 0.5498, 0.6125]], grad_fn=<SigmoidBackward0>)

In [491]:
list(model.named_parameters())

[('model.0.weight',
  Parameter containing:
  tensor([[-0.0322,  0.3580,  0.2086, -0.2141,  0.3808],
          [ 0.1602,  0.3442, -0.3442,  0.1515,  0.3330],
          [-0.3538, -0.0952,  0.0685,  0.3144,  0.3318]], requires_grad=True)),
 ('model.0.bias',
  Parameter containing:
  tensor([ 0.3614,  0.3961, -0.3199], requires_grad=True)),
 ('model.2.weight',
  Parameter containing:
  tensor([[ 0.0139, -0.1531,  0.0183],
          [-0.3909,  0.1833, -0.3987],
          [ 0.1113, -0.1064, -0.4570],
          [-0.0267,  0.3975,  0.1471],
          [ 0.4852, -0.1944,  0.5328]], requires_grad=True)),
 ('model.2.bias',
  Parameter containing:
  tensor([-0.0956, -0.1618, -0.4720, -0.0214,  0.1946], requires_grad=True))]

In [492]:
import torch.optim as optim

In [493]:
y = torch.ones(10, 5)
z = torch.randn_like(y)
x = y + z
pp.pprint(x)

tensor([[ 1.0328,  2.6049, -1.6300,  2.1344,  0.2627],
        [ 0.7165,  2.1657,  1.8893,  0.4129,  0.9372],
        [ 1.1807,  0.6802, -1.8700,  0.8503,  0.2655],
        [ 0.9264,  0.8395,  2.2977,  0.2667,  1.3267],
        [ 1.1307,  0.8110,  1.5679,  1.0221,  0.0785],
        [ 0.5014,  0.5947,  2.8493,  1.5814,  2.3587],
        [ 0.7254, -0.5478, -0.6408,  0.1701,  0.1744],
        [ 0.4876,  2.1962,  0.7475,  1.7778,  3.5014],
        [-0.6083,  1.8779,  2.0250,  2.0709, -0.4616],
        [ 1.0606,  3.5305,  2.7873,  0.3350,  3.0163]])


In [494]:
model = MultilayerPerceptron(5, 3)

adam = optim.Adam(model.parameters(), lr=1e-1)

loss_function = nn.MSELoss()

y_pred = model(x)
loss_function(y_pred, y).item()

0.24508769810199738

In [495]:
n_epoch = 100
for epoch in range(n_epoch):
    adam.zero_grad()
    y_pred = model(x)
    loss = loss_function(y_pred, y)
    print(f"Epoch {epoch}: traing loss: {loss}")
    loss.backward()
    adam.step()

Epoch 0: traing loss: 0.24508769810199738
Epoch 1: traing loss: 0.18516084551811218
Epoch 2: traing loss: 0.1216031089425087
Epoch 3: traing loss: 0.06360343098640442
Epoch 4: traing loss: 0.03253542631864548
Epoch 5: traing loss: 0.01589490845799446
Epoch 6: traing loss: 0.007973880507051945
Epoch 7: traing loss: 0.003646576078608632
Epoch 8: traing loss: 0.0016105301911011338
Epoch 9: traing loss: 0.0007155617931857705
Epoch 10: traing loss: 0.0003285373386461288
Epoch 11: traing loss: 0.0001579771633259952
Epoch 12: traing loss: 7.97101019998081e-05
Epoch 13: traing loss: 4.20837095589377e-05
Epoch 14: traing loss: 2.31685335165821e-05
Epoch 15: traing loss: 1.3263455912237987e-05
Epoch 16: traing loss: 7.878484211687464e-06
Epoch 17: traing loss: 4.847102900384925e-06
Epoch 18: traing loss: 3.0836142741463846e-06
Epoch 19: traing loss: 2.025299863817054e-06
Epoch 20: traing loss: 1.3710840676139924e-06
Epoch 21: traing loss: 9.552164783599437e-07
Epoch 22: traing loss: 6.8372025907

In [496]:
list(model.parameters())

[Parameter containing:
 tensor([[ 1.4480,  0.8429,  1.4542,  1.3326,  0.9692],
         [-0.7744, -0.2640, -0.6238, -0.7726, -0.6092],
         [ 1.5298,  0.8408, -0.4859,  0.8626,  0.8568]], requires_grad=True),
 Parameter containing:
 tensor([ 1.4120, -0.7140,  1.8116], requires_grad=True),
 Parameter containing:
 tensor([[1.4236, 0.2765, 1.8516],
         [1.5264, 1.1963, 1.1501],
         [1.6054, 0.3067, 1.1381],
         [1.4456, 0.6615, 1.5413],
         [1.5881, 0.3240, 1.9555]], requires_grad=True),
 Parameter containing:
 tensor([1.1975, 1.7264, 1.5122, 1.1803, 0.9093], requires_grad=True)]

In [497]:
y_pred = model(x)
y_pred

tensor([[1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [0.9999, 0.9994, 0.9994, 0.9997, 0.9999],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000]], grad_fn=<SigmoidBackward0>)

In [498]:
x2 = y + torch.randn_like(y)
y_pred = model(x2)
y_pred

tensor([[1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [0.9989, 0.9970, 0.9964, 0.9977, 0.9990],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000]], grad_fn=<SigmoidBackward0>)

In [499]:
# Our raw data, which consists of sentences
corpus = [
    "We always come to Paris",
    "The professor is from Australia",
    "I live in Stanford",
    "He comes from Taiwan",
    "The capital of Turkey is Ankara",
]

In [500]:
def preprocess_sentence(sentence):
    return sentence.lower().split()


train_sentences = [preprocess_sentence(sent) for sent in corpus]
train_sentences

[['we', 'always', 'come', 'to', 'paris'],
 ['the', 'professor', 'is', 'from', 'australia'],
 ['i', 'live', 'in', 'stanford'],
 ['he', 'comes', 'from', 'taiwan'],
 ['the', 'capital', 'of', 'turkey', 'is', 'ankara']]

In [501]:
# Set of locations that appear in our corpus
locations = set(["australia", "ankara", "paris", "stanford", "taiwan", "turkey"])

# Our train labels
train_labels = [
    [1 if word in locations else 0 for word in sent] for sent in train_sentences
]
train_labels

[[0, 0, 0, 0, 1],
 [0, 0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1, 0, 1]]

In [502]:
voca = set(w for s in train_sentences for w in s)
voca

{'always',
 'ankara',
 'australia',
 'capital',
 'come',
 'comes',
 'from',
 'he',
 'i',
 'in',
 'is',
 'live',
 'of',
 'paris',
 'professor',
 'stanford',
 'taiwan',
 'the',
 'to',
 'turkey',
 'we'}

In [503]:
voca.add("<unk>")

In [504]:
voca.add("<pad>")


def pad_window(sentence, window_size, pad_token="<pad>"):
    window = [pad_token] * window_size
    return window + sentence + window


window_size = 2
pad_window(train_sentences[0], window_size=window_size)

['<pad>', '<pad>', 'we', 'always', 'come', 'to', 'paris', '<pad>', '<pad>']

In [505]:
ix_to_word = sorted(list(voca))

# Creating a dictionary to find the index of a given word
word_to_ix = {word: ind for ind, word in enumerate(ix_to_word)}
word_to_ix

{'<pad>': 0,
 '<unk>': 1,
 'always': 2,
 'ankara': 3,
 'australia': 4,
 'capital': 5,
 'come': 6,
 'comes': 7,
 'from': 8,
 'he': 9,
 'i': 10,
 'in': 11,
 'is': 12,
 'live': 13,
 'of': 14,
 'paris': 15,
 'professor': 16,
 'stanford': 17,
 'taiwan': 18,
 'the': 19,
 'to': 20,
 'turkey': 21,
 'we': 22}

In [506]:
# Given a sentence of tokens, return the corresponding indices
def convert_token_to_indices(sentence, word_to_ix):
    indices = []
    for token in sentence:
        # Check if the token is in our vocabularly. If it is, get it's index.
        # If not, get the index for the unknown token.
        if token in word_to_ix:
            index = word_to_ix[token]
        else:
            index = word_to_ix["<unk>"]
        indices.append(index)
    return indices


# More compact version of the same function
def _convert_token_to_indices(sentence, word_to_ix):
    return [word_to_ind.get(token, word_to_ix["<unk>"]) for token in sentence]


# Show an example
example_sentence = ["we", "always", "come", "to", "kuwait"]
example_indices = convert_token_to_indices(example_sentence, word_to_ix)
restored_example = [ix_to_word[ind] for ind in example_indices]

print(f"Original sentence is: {example_sentence}")
print(f"Going from words to indices: {example_indices}")
print(f"Going from indices to words: {restored_example}")

Original sentence is: ['we', 'always', 'come', 'to', 'kuwait']
Going from words to indices: [22, 2, 6, 20, 1]
Going from indices to words: ['we', 'always', 'come', 'to', '<unk>']


In [507]:
# Converting our sentences to indices
example_padded_indices = [
    convert_token_to_indices(s, word_to_ix) for s in train_sentences
]
example_padded_indices

[[22, 2, 6, 20, 15],
 [19, 16, 12, 8, 4],
 [10, 13, 11, 17],
 [9, 7, 8, 18],
 [19, 5, 14, 21, 12, 3]]

In [508]:
embedding_dim = 5
embeds = nn.Embedding(len(voca), embedding_dim)

list(embeds.parameters())

[Parameter containing:
 tensor([[ 9.8413e-01,  1.5484e+00, -1.2672e+00, -1.1082e+00,  8.3360e-01],
         [ 6.9379e-01,  2.0860e+00,  9.0332e-01, -2.3859e-01,  2.1915e-01],
         [ 1.8352e-01, -1.1562e-01, -2.1893e-01,  8.8643e-02,  9.9763e-01],
         [-1.1169e-01, -4.4002e-01, -5.7400e-01, -3.1012e-01, -1.2741e+00],
         [-2.9010e-01, -8.3380e-01, -1.3564e+00,  8.2453e-01, -5.9400e-01],
         [-1.1549e+00,  7.9573e-01, -7.9232e-01,  8.3687e-04,  6.4324e-01],
         [ 6.8193e-01,  1.5795e+00,  5.9015e-01, -9.4179e-01,  6.8130e-01],
         [-1.2376e+00,  1.6334e+00,  9.5616e-01, -5.5822e-01, -9.8740e-02],
         [-7.2223e-01,  1.2875e+00, -1.1724e+00, -2.0820e+00, -8.2918e-02],
         [ 5.3661e-01, -1.1516e+00,  6.9675e-01, -2.2440e+00,  1.4616e+00],
         [-2.7778e-01,  1.4968e+00, -5.9826e-01, -8.7479e-02, -6.6500e-02],
         [-9.6939e-01,  9.7363e-01,  5.6244e-01, -8.8421e-01, -3.5952e-02],
         [-1.2687e+00,  4.9372e-01,  2.2126e+00, -5.9502e-01, -2.

In [509]:
index = word_to_ix["paris"]
index_tensor = torch.tensor(index, dtype=torch.long)
print(index_tensor.data)
paris_embed = embeds(index_tensor)
paris_embed

tensor(15)


tensor([ 1.1325,  1.7271, -1.4420,  0.1453, -1.1364],
       grad_fn=<EmbeddingBackward0>)

In [510]:
def _custom_collate_fn(batch, window_size, word_to_ix):
    # Prepare the datapoints
    x, y = zip(*batch)
    x = [pad_window(s, window_size=window_size) for s in x]
    x = [convert_token_to_indices(s, word_to_ix) for s in x]

    # Pad x so that all the examples in the batch have the same size
    pad_token_ix = word_to_ix["<pad>"]
    x = [torch.LongTensor(x_i).cuda() for x_i in x]
    x_padded = nn.utils.rnn.pad_sequence(
        x, batch_first=True, padding_value=pad_token_ix
    )

    # Pad y and record the length
    lengths = [len(label) for label in y]
    lenghts = torch.LongTensor(lengths)
    y = [torch.LongTensor(y_i).cuda() for y_i in y]
    y_padded = nn.utils.rnn.pad_sequence(y, batch_first=True, padding_value=0)

    return x_padded, y_padded, lenghts

In [511]:
from torch.utils.data import DataLoader
from functools import partial


data = list(zip(train_sentences, train_labels))
batch_size = 2
shuffle = True
window_size = 2
collate_fn = partial(_custom_collate_fn, window_size=window_size, word_to_ix=word_to_ix)

# Instantiate the DataLoader
loader = DataLoader(data, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)
# Go through one loop
counter = 0
for batched_x, batched_y, batched_lengths in loader:
    print(f"Iteration {counter}")
    print("Batched Input:")
    print(batched_x)
    print("Batched Labels:")
    print(batched_y)
    print("Batched Lengths:")
    print(batched_lengths)
    print("")
    counter += 1

Iteration 0
Batched Input:
tensor([[ 0,  0,  9,  7,  8, 18,  0,  0],
        [ 0,  0, 10, 13, 11, 17,  0,  0]], device='cuda:0')
Batched Labels:
tensor([[0, 0, 0, 1],
        [0, 0, 0, 1]], device='cuda:0')
Batched Lengths:
tensor([4, 4])

Iteration 1
Batched Input:
tensor([[ 0,  0, 19, 16, 12,  8,  4,  0,  0,  0],
        [ 0,  0, 19,  5, 14, 21, 12,  3,  0,  0]], device='cuda:0')
Batched Labels:
tensor([[0, 0, 0, 0, 1, 0],
        [0, 0, 0, 1, 0, 1]], device='cuda:0')
Batched Lengths:
tensor([5, 6])

Iteration 2
Batched Input:
tensor([[ 0,  0, 22,  2,  6, 20, 15,  0,  0]], device='cuda:0')
Batched Labels:
tensor([[0, 0, 0, 0, 1]], device='cuda:0')
Batched Lengths:
tensor([5])



In [512]:
# Print the original tensor
print(f"Original Tensor: ")
print(batched_x)
print("")

# Create the 2 * 2 + 1 chunks
chunk = batched_x.unfold(1, window_size * 2 + 1, 1)
print(f"Windows: ")
print(chunk)

Original Tensor: 
tensor([[ 0,  0, 22,  2,  6, 20, 15,  0,  0]], device='cuda:0')

Windows: 
tensor([[[ 0,  0, 22,  2,  6],
         [ 0, 22,  2,  6, 20],
         [22,  2,  6, 20, 15],
         [ 2,  6, 20, 15,  0],
         [ 6, 20, 15,  0,  0]]], device='cuda:0')


In [513]:
import torch

print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.current_device())
torch.cuda.get_device_name(0)

True
1
0


'NVIDIA GeForce RTX 3090'

In [514]:
device = torch.device("cuda:0")


class WordWindowClassifier(nn.Module):

    def __init__(self, hyperparameters, vocab_size, pad_ix=0):
        super(WordWindowClassifier, self).__init__()

        """ Instance variables """
        self.window_size = hyperparameters["window_size"]
        self.embed_dim = hyperparameters["embed_dim"]
        self.hidden_dim = hyperparameters["hidden_dim"]
        self.freeze_embeddings = hyperparameters["freeze_embeddings"]

        """ Embedding Layer
    Takes in a tensor containing embedding indices, and returns the
    corresponding embeddings. The output is of dim
    (number_of_indices * embedding_dim).

    If freeze_embeddings is True, set the embedding layer parameters to be
    non-trainable. This is useful if we only want the parameters other than the
    embeddings parameters to change.

    """
        self.embeds = nn.Embedding(vocab_size, self.embed_dim, padding_idx=pad_ix)
        if self.freeze_embeddings:
            self.embed_layer.weight.requires_grad = False

        """ Hidden Layer
    """
        full_window_size = 2 * window_size + 1
        self.hidden_layer = nn.Sequential(
            nn.Linear(full_window_size * self.embed_dim, self.hidden_dim), nn.Tanh()
        )

        """ Output Layer
    """
        self.output_layer = nn.Linear(self.hidden_dim, 1)

        """ Probabilities
    """
        self.probabilities = nn.Sigmoid()

    def forward(self, inputs):
        """
        Let B:= batch_size
            L:= window-padded sentence length
            D:= self.embed_dim
            S:= self.window_size
            H:= self.hidden_dim

        inputs: a (B, L) tensor of token indices
        """
        B, L = inputs.size()

        """
    Reshaping.
    Takes in a (B, L) LongTensor
    Outputs a (B, L~, S) LongTensor
    """
        # Fist, get our word windows for each word in our input.
        token_windows = inputs.unfold(1, 2 * self.window_size + 1, 1)
        _, adjusted_length, _ = token_windows.size()

        # Good idea to do internal tensor-size sanity checks, at the least in comments!
        # assert token_windows.size() == (B, adjusted_length, 2 * self.window_size + 1)

        """
    Embedding.
    Takes in a torch.LongTensor of size (B, L~, S)
    Outputs a (B, L~, S, D) FloatTensor.
    """
        embedded_windows = self.embeds(token_windows)

        """
    Reshaping.
    Takes in a (B, L~, S, D) FloatTensor.
    Resizes it into a (B, L~, S*D) FloatTensor.
    -1 argument "infers" what the last dimension should be based on leftover axes.
    """
        embedded_windows = embedded_windows.view(B, adjusted_length, -1)

        """
    Layer 1.
    Takes in a (B, L~, S*D) FloatTensor.
    Resizes it into a (B, L~, H) FloatTensor
    """
        layer_1 = self.hidden_layer(embedded_windows)

        """
    Layer 2
    Takes in a (B, L~, H) FloatTensor.
    Resizes it into a (B, L~, 1) FloatTensor.
    """
        output = self.output_layer(layer_1)

        """
    Softmax.
    Takes in a (B, L~, 1) FloatTensor of unnormalized class scores.
    Outputs a (B, L~, 1) FloatTensor of (log-)normalized class scores.
    """
        output = self.probabilities(output)
        output = output.view(B, -1)

        return output

In [515]:
# Prepare the data
data = list(zip(train_sentences, train_labels))
batch_size = 2
shuffle = True
window_size = 2
collate_fn = partial(_custom_collate_fn, window_size=window_size, word_to_ix=word_to_ix)

# Instantiate a DataLoader
loader = DataLoader(data, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)

# Initialize a model
# It is useful to put all the model hyperparameters in a dictionary
model_hyperparameters = {
    "batch_size": 4,
    "window_size": 2,
    "embed_dim": 25,
    "hidden_dim": 25,
    "freeze_embeddings": False,
}

vocab_size = len(word_to_ix)

model = WordWindowClassifier(model_hyperparameters, vocab_size)
model.to(device)

# Define an optimizer
learning_rate = 0.01
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)


# Define a loss function, which computes to binary cross entropy loss
def loss_function(batch_outputs, batch_labels, batch_lengths):
    # Calculate the loss for the whole batch
    bceloss = nn.BCELoss()
    loss = bceloss(batch_outputs, batch_labels.float())

    # Rescale the loss. Remember that we have used lengths to store the
    # number of words in each training example
    loss = loss / batch_lengths.sum().float()

    return loss

In [516]:
# Function that will be called in every epoch
def train_epoch(loss_function, optimizer, model, loader):

    # Keep track of the total loss for the batch
    total_loss = 0
    for batch_inputs, batch_labels, batch_lengths in loader:
        # Clear the gradients
        optimizer.zero_grad()
        # Run a forward pass
        outputs = model.forward(batch_inputs)
        # Compute the batch loss
        loss = loss_function(outputs, batch_labels, batch_lengths)
        # Calculate the gradients
        loss.backward()
        # Update the parameteres
        optimizer.step()
        total_loss += loss.item()

    return total_loss


# Function containing our main training loop
def train(loss_function, optimizer, model, loader, num_epochs=10000):

    # Iterate through each epoch and call our train_epoch function
    for epoch in range(num_epochs):
        epoch_loss = train_epoch(loss_function, optimizer, model, loader)
        if epoch % 100 == 0:
            print(epoch_loss)

In [421]:
num_epochs = 1000
train(loss_function, optimizer, model, loader, num_epochs=num_epochs)

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [None]:
# Create test sentences
test_corpus = ["She comes from Paris"]
test_sentences = [s.lower().split() for s in test_corpus]
test_labels = [[0, 0, 0, 1]]

# Create a test loader
test_data = list(zip(test_sentences, test_labels))
batch_size = 1
shuffle = False
window_size = 2
collate_fn = partial(_custom_collate_fn, window_size=2, word_to_ix=word_to_ix)
test_loader = torch.utils.data.DataLoader(
    test_data, batch_size=1, shuffle=False, collate_fn=collate_fn
)

In [None]:
for test_instance, labels, _ in test_loader:
    outputs = model.forward(test_instance)
    print(labels)
    print(outputs)

tensor([[0, 0, 0, 1]])
tensor([[0.1176, 0.0254, 0.0952, 0.8463]], grad_fn=<ViewBackward0>)
