# Load Images

In [1]:
import numpy as np
import gzip
import struct

def load_images(filename):
    """Load a gzip-compressed IDX3 (MNIST-style) image file as a matrix.

    Args:
        filename: Path to the gzip-compressed IDX3 ubyte image file.

    Returns:
        A uint8 NumPy array of shape (n_images, 1 + rows * columns). Column 0
        is a bias column filled with 1s; the remaining columns hold the pixels
        of one image, flattened in the order they are stored in the file.

    Raises:
        ValueError: If the magic number in the header is not the one used by
            IDX3 ubyte image files, or if the number of pixel bytes does not
            match the dimensions declared in the header.
    """
    # IDX magic for "unsigned byte data, 3 dimensions" (0x00000803).
    IMAGES_MAGIC_NUMBER = 2051

    # Open and unzip the file of images:
    with gzip.open(filename, 'rb') as f:
        # The header is four big-endian uint32 values. The IDX format stores
        # the number of rows BEFORE the number of columns.
        magic, n_images, image_rows, image_columns = struct.unpack('>IIII', f.read(16))
        if magic != IMAGES_MAGIC_NUMBER:
            # Fail early instead of silently reshaping a label file (or any
            # other gzip) into meaningless "images".
            raise ValueError(
                "%s is not an IDX3 image file (magic number %d, expected %d)"
                % (filename, magic, IMAGES_MAGIC_NUMBER)
            )
        # Read all the pixels into a long NumPy array:
        all_pixels = np.frombuffer(f.read(), dtype=np.uint8)

    # Reshape the array into a matrix where each line is an image
    # (raises ValueError if the byte count does not match the header):
    images_matrix = all_pixels.reshape(n_images, image_rows * image_columns)
    # Add a bias column full of 1s as the first column in the matrix:
    return np.insert(images_matrix, 0, 1, axis=1)

In [3]:
# Training set: 60000 rows of 785 values each
# (1 bias column followed by 28 * 28 pixels).
X_train = load_images(
    filename="./../data/mnist/train-images-idx3-ubyte.gz",
)

# Test set: 10000 rows laid out exactly like the rows of X_train.
X_test = load_images(
    filename="./../data/mnist/t10k-images-idx3-ubyte.gz",
)

# Load Labels

In [6]:
def load_labels(filename):
    """Read a gzip-compressed label file into a one-column uint8 matrix.

    Args:
        filename: Path to the gzip-compressed label file.

    Returns:
        A uint8 NumPy array of shape (n_labels, 1) built from every byte that
        follows the first 8 bytes of the decompressed file.
    """
    header_size = 8  # bytes at the start of the file that are not labels

    # Open and unzip the file of labels:
    with gzip.open(filename, 'rb') as label_file:
        # Discard the header, then take everything that is left:
        label_file.read(header_size)
        raw_labels = label_file.read()

    # One byte per label, arranged as a single column:
    label_vector = np.frombuffer(raw_labels, dtype=np.uint8)
    return label_vector.reshape(-1, 1)

In [7]:
def one_hot_encode(Y, number_of_classes=10):
    """One-hot encode an array of integer class labels.

    Args:
        Y: Integer labels, either a one-column matrix of shape (n, 1) (as
            returned by load_labels) or a flat array of shape (n,). Every
            label must be a valid column index, i.e. smaller than
            number_of_classes.
        number_of_classes: Number of columns in the result. Defaults to 10,
            one class per digit.

    Returns:
        A float array of shape (n, number_of_classes) filled with zeros,
        except that in each row the column(s) named by that row of Y are 1.

    Raises:
        IndexError: If a label is not a valid column index.
    """
    labels = np.asarray(Y)
    number_of_labels = labels.shape[0]  # One label for each row in Y

    # Prepare a matrix of zeros with as many rows as the rows in Y,
    # and as many columns as the number of classes:
    encoded_labels = np.zeros((number_of_labels, number_of_classes))

    # Row indices shaped to broadcast against the labels, so that a single
    # fancy-index assignment flips the right column in every row at once
    # (replaces a Python-level loop over all the labels).
    row_index_shape = (number_of_labels,) + (1,) * (labels.ndim - 1)
    row_indices = np.arange(number_of_labels).reshape(row_index_shape)
    encoded_labels[row_indices, labels] = 1

    return encoded_labels

In [9]:
# Training labels: read the raw digit labels as a one-column matrix, then
# one-hot encode them (one row per label, one column per digit class).
original_labels = load_labels(
    filename="./../data/mnist/train-labels-idx1-ubyte.gz",
)
Y_train = one_hot_encode(Y=original_labels)

# Test labels: kept as the raw one-column matrix of digit values. They are
# NOT one-hot encoded, unlike Y_train.
Y_test = load_labels(
    filename="./../data/mnist/t10k-labels-idx1-ubyte.gz",
)

# Create the model

In [27]:
def sigmoid(z):
    """Apply the logistic function 1 / (1 + e^(-z)) element-wise.

    Args:
        z: A number or NumPy array.

    Returns:
        The logistic function of z, with the same shape as z.
    """
    denominator = 1 + np.exp(-z)
    return 1 / denominator

def train(X, Y, iterations, lr):
    """Fit a weight matrix with a fixed number of gradient-descent steps.

    The weights start at zero. Every iteration prints its index and then
    subtracts lr * X^T (sigmoid(X w) - Y) / n_examples from the weights.

    Args:
        X: Input matrix, one example per row.
        Y: Target matrix with one row per example in X.
        iterations: Number of update steps to run.
        lr: Learning rate that scales every update.

    Returns:
        The weight matrix of shape (X.shape[1], Y.shape[1]).
    """
    n_examples = X.shape[0]
    n_inputs = X.shape[1]
    n_outputs = Y.shape[1]

    w = np.zeros((n_inputs, n_outputs))
    for step in range(iterations):
        print("Iteration %4d " % (step))
        # Difference between the current predictions and the targets:
        prediction_error = sigmoid(np.matmul(X, w)) - Y
        # Averaged over the examples, then scaled by the learning rate:
        gradient = np.matmul(X.T, prediction_error) / n_examples
        w -= gradient * lr
    return w

In [36]:
# Train on the full training set (prints one line per iteration).
w = train(
    X_train,
    Y_train,
    iterations=200,
    lr=1e-5,
)

Iteration    0 
Iteration    1 
Iteration    2 
Iteration    3 
Iteration    4 
Iteration    5 
Iteration    6 
Iteration    7 
Iteration    8 
Iteration    9 
Iteration   10 
Iteration   11 
Iteration   12 
Iteration   13 
Iteration   14 
Iteration   15 
Iteration   16 
Iteration   17 
Iteration   18 
Iteration   19 
Iteration   20 
Iteration   21 
Iteration   22 
Iteration   23 
Iteration   24 
Iteration   25 
Iteration   26 
Iteration   27 
Iteration   28 
Iteration   29 
Iteration   30 
Iteration   31 
Iteration   32 
Iteration   33 
Iteration   34 
Iteration   35 
Iteration   36 
Iteration   37 
Iteration   38 
Iteration   39 
Iteration   40 
Iteration   41 
Iteration   42 
Iteration   43 
Iteration   44 
Iteration   45 
Iteration   46 
Iteration   47 
Iteration   48 
Iteration   49 
Iteration   50 
Iteration   51 
Iteration   52 
Iteration   53 
Iteration   54 
Iteration   55 
Iteration   56 
Iteration   57 
Iteration   58 
Iteration   59 
Iteration   60 
Iteration   61 
Iteratio

In [32]:
def predict(X, w):
    """Return, for each row of X, the index of the highest-scoring output.

    Args:
        X: Input matrix, one example per row.
        w: Weight matrix with one column per output.

    Returns:
        A one-column matrix holding, for every row of X, the column index
        where sigmoid(X w) is largest.
    """
    scores = sigmoid(np.matmul(X, w))
    best_columns = np.argmax(scores, axis=1)
    return best_columns.reshape(-1, 1)

In [37]:
# Percentage of test rows whose predicted value equals the label in Y_test.
correct_predictions = np.count_nonzero(predict(X_test, w) == Y_test)
100 * correct_predictions / Y_test.shape[0]

90.32

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
# Load the tokenizer and the sequence-classification model from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")

# Class names, in the order of the model's output logits.
classes = ["not paraphrase", "is paraphrase"]

sequence_0 = "The company HuggingFace is based in New York City"
sequence_1 = "Apples are especially bad for your health"
sequence_2 = "HuggingFace's headquarters are situated in Manhattan"

# The tokenizer automatically adds the model-specific special tokens
# (i.e. [CLS] and [SEP]) to the sequence pair and computes the attention masks.
paraphrase = tokenizer(sequence_0, sequence_2, return_tensors="pt")
not_paraphrase = tokenizer(sequence_0, sequence_1, return_tensors="pt")

# Inference only: disable autograd so no computation graph is built for the
# forward passes.
with torch.no_grad():
    paraphrase_classification_logits = model(**paraphrase).logits
    not_paraphrase_classification_logits = model(**not_paraphrase).logits

# Turn the logits into per-class probabilities for the single pair in each batch.
paraphrase_results = torch.softmax(paraphrase_classification_logits, dim=1).tolist()[0]
not_paraphrase_results = torch.softmax(not_paraphrase_classification_logits, dim=1).tolist()[0]

# Should be paraphrase
for class_name, probability in zip(classes, paraphrase_results):
    print(f"{class_name}: {int(round(probability * 100))}%")

# Should not be paraphrase
for class_name, probability in zip(classes, not_paraphrase_results):
    print(f"{class_name}: {int(round(probability * 100))}%")

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/433M [00:00<?, ?B/s]

not paraphrase: 10%
is paraphrase: 90%
not paraphrase: 94%
is paraphrase: 6%


In [3]:
from transformers import LayoutLMTokenizer, LayoutLMModel
import torch
# Load the LayoutLM tokenizer and base model from the same checkpoint.
tokenizer = LayoutLMTokenizer.from_pretrained('microsoft/layoutlm-base-uncased')
model = LayoutLMModel.from_pretrained('microsoft/layoutlm-base-uncased')

# Two words and one bounding box per word.
words = ["Hello", "world"]
normalized_word_boxes = ([637, 773, 693, 782], [698, 773, 733, 782])

# A word may be split into several tokens; repeat the word's box once for
# each of its tokens so that boxes and tokens stay aligned.
token_boxes = []
for word, box in zip(words, normalized_word_boxes):
    word_tokens = tokenizer.tokenize(word)
    token_boxes += [box] * len(word_tokens)

# add bounding boxes of cls + sep tokens
token_boxes = [[0, 0, 0, 0], *token_boxes, [1000, 1000, 1000, 1000]]

# Encode the words as one string and pull out the tensors the model needs.
encoding = tokenizer(' '.join(words), return_tensors="pt")
input_ids, attention_mask, token_type_ids = (
    encoding["input_ids"],
    encoding["attention_mask"],
    encoding["token_type_ids"],
)

# Extra leading dimension: a batch containing this single sequence.
bbox = torch.tensor([token_boxes])

outputs = model(
    input_ids=input_ids,
    bbox=bbox,
    attention_mask=attention_mask,
    token_type_ids=token_type_ids,
)
last_hidden_states = outputs.last_hidden_state

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/170 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/606 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/453M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/layoutlm-base-uncased were not used when initializing LayoutLMModel: ['cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing LayoutLMModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LayoutLMModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# Display the full model output object (its repr includes last_hidden_state and pooler_output).
outputs

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.1377, -0.0217,  0.5168,  ..., -0.3557,  0.0057, -0.0747],
         [ 0.2349, -0.0126,  0.2175,  ...,  0.3546,  0.4783, -0.5063],
         [ 0.1883,  0.0240,  0.5338,  ..., -0.0384,  0.2696, -0.7188],
         [ 0.1364, -0.0190,  0.5194,  ..., -0.3599,  0.0044, -0.0714]]],
       grad_fn=<NativeLayerNormBackward>), pooler_output=tensor([[-0.1216,  0.2186, -0.0275, -0.0761,  0.2830,  0.0359, -0.4294, -0.1931,
          0.1522,  0.0605,  0.0462,  0.2257,  0.2945, -0.3746,  0.2547,  0.1569,
          0.4103,  0.1967, -0.0695,  0.4194, -0.3590,  0.0327,  0.0486, -0.2101,
         -0.1370,  0.1006,  0.2010, -0.0674,  0.3664, -0.0214,  0.3499, -0.3179,
         -0.0975,  0.2563, -0.0059, -0.4568, -0.1184, -0.2711,  0.1816,  0.0836,
         -0.1046, -0.2034,  0.3083,  0.3592,  0.0949,  0.2845,  0.2826, -0.1538,
         -0.3005, -0.0784, -0.0088,  0.1496, -0.3902, -0.0309, -0.4107, -0.1594,
         -0.0673, -0.1504, 