<a href="https://colab.research.google.com/github/cyberone1812/CMUDeepLearning/blob/main/clean_dele_week9_cmu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Week 9 CMU Deep Learning

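- [Example 1: Understanding and Coding the Self-Attention Mechanism of Large Language Models From Scratch (Sebastian Raschka)](https://sebastianraschka.com/blog/2023/self-attention-from-scratch.html)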

In [None]:
pip install datasets

Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.16.1 dill-0.3.7 multiprocess-0.70.15


In [None]:
import torch
import torch.nn as nn

In [None]:
# Example sentence used for the self-attention walkthrough below.
sentence = "Alice went down the rabbit hole."

In [None]:
# Whitespace tokenization; punctuation stays attached to its word ('hole.').
sentence.split()

['Alice', 'went', 'down', 'the', 'rabbit', 'hole.']

In [None]:
# Map each distinct token to an integer id, assigned in sorted order.
# Deduplicating with set() keeps the ids contiguous (0 .. vocab_size - 1) even
# when a word occurs more than once; enumerating the raw token list would leave
# gaps, because a repeated word overwrites its own earlier, smaller id.
dc = {w: i for i, w in enumerate(sorted(set(sentence.split())))}

In [None]:
# Inspect the token -> id mapping.
dc

{'Alice': 0, 'down': 1, 'hole.': 2, 'rabbit': 3, 'the': 4, 'went': 5}

In [None]:
# Look up the id of a single token.
dc['Alice']

0

In [None]:
# Encode the sentence as a list of ids, preserving word order.
[dc[token] for token in sentence.split()]

[0, 5, 1, 4, 3, 2]

In [None]:
# The same encoding as a 1-D integer tensor, used to index the embedding table.
num_sentence = torch.tensor([dc[token] for token in sentence.split()])

In [None]:
# Inspect the encoded sentence.
num_sentence

tensor([0, 5, 1, 4, 3, 2])

In [None]:
# Seed the global RNG so the randomly initialised embedding table (and any later
# torch.rand calls) come out the same on every Restart & Run All.
torch.manual_seed(123)

# One 5-dimensional vector per token id. The table needs (largest id + 1) rows,
# i.e. it is sized by the vocabulary rather than by the sentence length.
embedding_layer = nn.Embedding(max(dc.values()) + 1, 5)

In [None]:
# Look up the embedding of every token: one row per word, 5 columns.
embedding_layer(num_sentence)

tensor([[ 0.6941,  0.2592,  1.4015, -0.8204, -0.7528],
        [-0.2364,  0.6361, -1.5011, -0.6235, -0.4114],
        [-0.3128,  0.4857,  0.3493, -0.7419, -1.3527],
        [ 0.4443, -0.4584,  1.7657,  0.7634,  0.4649],
        [-0.6630, -0.3596, -1.5551, -0.6986,  1.7498],
        [-0.6778, -0.1350, -2.2429,  1.2849,  0.2349]],
       grad_fn=<EmbeddingBackward0>)

In [None]:
# Keep the embedded sentence; it is the input to the attention steps below.
embedded_sentence = embedding_layer(num_sentence)

In [None]:
# Projection sizes. Queries and keys share a size because their dot product is
# taken further down; the value size is chosen independently.
d_q = d_k = 4
d_v = 6

In [None]:
# Embedding dimension = number of columns of the embedded sentence. Read it from
# the tensor instead of hard-coding 5 so it stays in sync with the embedding layer.
d = embedded_sentence.shape[1]

In [None]:
# Randomly initialised projection matrices for queries, keys and values, each of
# shape (projection size, d). They are created in the order Wq, Wk, Wv.
Wq, Wk, Wv = (nn.Parameter(torch.rand(rows, d)) for rows in (d_q, d_k, d_v))

In [None]:
# Embedding of the first token in the sentence.
x_1 = embedded_sentence[0]

In [None]:
# Inspect the first token's embedding vector.
x_1

tensor([ 0.6941,  0.2592,  1.4015, -0.8204, -0.7528],
       grad_fn=<SelectBackward0>)

In [None]:
# Query vector of the first token: project x_1 with Wq.
Wq @ x_1

tensor([0.3621, 0.6564, 0.5727, 0.7489], grad_fn=<MvBackward0>)

In [None]:
# Project every token at once; transposing back gives one row per token.
keys = (Wk @ embedded_sentence.T).T
values = (Wv @ embedded_sentence.T).T

In [None]:
# Expect (number of tokens, d_k).
keys.shape

torch.Size([6, 4])

In [None]:
# Expect (number of tokens, d_v).
values.shape

torch.Size([6, 6])

In [None]:
# Query, key and value vectors for the first token only.
q1, k1, v1 = (weight @ x_1 for weight in (Wq, Wk, Wv))

In [None]:
# Unnormalised attention scores: dot product of the first query with every key.
omega_1 = q1 @ keys.T

In [None]:
# Inspect the raw scores, one per token in the sentence.
omega_1

tensor([ 1.0153, -2.4124, -1.9498,  3.4301, -2.0831, -1.6585],
       grad_fn=<SqueezeBackward4>)

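Attention weights (the unnormalized scores $\omega_{1, i}$, scaled by $\sqrt{d_k}$ and normalized with softmax):

$$\alpha_{1, i} = \text{softmax}\Big(\frac{\omega_{1, i}}{\sqrt{d_k}}\Big)$$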

In [None]:
# Softmax along dim 0; it is applied below to the 1-D score vector.
softmax = nn.Softmax(dim = 0)

In [None]:
# Scale the scores by sqrt(d_k) and normalise them into attention weights.
# A plain Python power avoids building an integer tensor just to take its square
# root (torch.sqrt on an integer tensor relies on int -> float type promotion).
attention_weights = softmax(omega_1 / d_k ** 0.5)

In [None]:
# One weight per token in the sentence.
attention_weights.shape

torch.Size([6])

In [None]:
# Context vector for the first token: attention-weighted sum of the value rows.
context_vector = attention_weights @ values

In [None]:
# Length equals d_v, the value projection size.
context_vector.shape

torch.Size([6])

#### Hugging Face

In [None]:
from transformers import pipeline

In [None]:
# Speech-recognition pipeline with the checkpoint named explicitly. Without a
# `model` argument the library picks its own default for the task and warns about
# it; naming the checkpoint keeps the notebook independent of that default.
# NOTE(review): facebook/wav2vec2-base-960h is believed to be that default — confirm.
speech_to_text = pipeline(
    task="automatic-speech-recognition",
    model="facebook/wav2vec2-base-960h",
)

In [None]:
# URL of a sample FLAC clip hosted on the Hugging Face Hub.
data = 'https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac'

In [None]:
# Transcribe the clip with the pretrained pipeline; the URL is passed as-is.
speech_to_text(data)

In [None]:
# Point `data` at a second sample clip from the same dummy ASR dataset.
# Note: this rebinds `data`, so later cells see this URL, not the first one.
data = 'https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac'

In [None]:
# Transcribe the second clip.
speech_to_text(data)

In [None]:
from IPython.display import Audio

In [None]:
# Render an audio player for the clip `data` currently refers to.
Audio(data)

In [None]:
# Imported here because nothing earlier in the notebook brings `load_dataset`
# into scope; without it this cell raises NameError on a fresh kernel.
from datasets import load_dataset

# Small LibriSpeech sample ("clean" config, validation split) to use as audio input.
dataset = load_dataset(
    "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation"
)
dataset

In [None]:
# Inspect the audio entry of the first sample; the next cell reads its
# 'array' and 'sampling_rate' fields.
dataset[0]['audio']

In [None]:
# Play the first sample's waveform at the sampling rate stored alongside it.
Audio(
    data=dataset[0]['audio']['array'],
    rate=dataset[0]['audio']['sampling_rate'],
)

In [None]:
# Second speech-recognition pipeline, this time with an explicitly chosen checkpoint.
pipe2 = pipeline(
    task="automatic-speech-recognition",
    model="facebook/wav2vec2-base-100h",
)

In [None]:
# Transcribe the second dataset sample with the first pipeline (`speech_to_text`).
speech_to_text(dataset[1]['audio'])

In [None]:
# Same sample through the wav2vec2-base-100h pipeline, for comparison.
pipe2(dataset[1]['audio'])