# Ch01. Context Understanding

### 단순한 이미지 object detection이 아니라, 사진 전체의 정보를 바탕으로 Scene Understanding을 하는 방법. 즉, local feature가 같더라도 global context에 따라 scene의 의미가 달라질 수 있다.

<img src= "https://cobslab.com/wp-content/uploads/2022/06/KakaoTalk_20220610_192243837_11-980x463.png">

# Transformer architecture

<img src ="https://anyline.com/app/uploads/2022/01/figure_1.jpg.webp">

<img src = "https://i.imgur.com/Rk5wkBQ.png">

# Transformer 실습

## SimpleAttention

### - Attention Block
### - MultiHeadAttention
### - Encoder Layer
### - Pytorch Official Implementation

# 1. Attention Block

In [2]:
import torch

In [3]:
# Notebook-wide configuration.
# Fix the RNG seed so that the random tensors sampled in later cells are
# reproducible under "Restart Kernel -> Run All".
SEED = 42
torch.manual_seed(SEED)

# Device every tensor in this notebook is created on.
# Use torch.device('cuda') instead to run on a GPU.
device = torch.device('cpu')

<img src = "https://github.com/dukong1/ComputerVision_latest_paper/blob/main/Ch01.%20Context_Understanding/20230412_112319.png?raw=true">

In [4]:
# Toy input: 8 vectors of dimension 3, sampled from a standard normal.
# torch.randn already returns a float tensor, so the legacy torch.Tensor(...)
# wrapper is unnecessary; device= allocates directly on the target device
# instead of creating the tensor on the CPU and copying it afterwards.
X = torch.randn(8, 3, device=device)

In [5]:
# Show the sampled input matrix (8 rows, 3 columns).
print(X)

tensor([[-0.5320,  0.4759, -0.2298],
        [-0.5578, -0.1919, -1.0992],
        [ 0.0023,  0.8613, -1.6658],
        [-0.4668, -0.6010,  0.3714],
        [-0.0131,  0.4123, -1.3400],
        [-0.4208,  0.6782, -1.5569],
        [-0.1636, -1.5453, -1.5914],
        [ 0.2377, -0.6397,  1.0430]])


In [6]:
# Learnable projection matrices mapping each 3-dim input row to a 2-dim
# query / key / value vector.
# The tensor is created on `device` first and wrapped in nn.Parameter last:
# calling .to(device) on a Parameter returns a plain, non-leaf tensor whenever
# a copy is required (e.g. CPU -> CUDA), so the result would no longer be a
# Parameter and gradients would not accumulate on it.
W_Q = torch.nn.Parameter(torch.randn(3, 2, device=device))
W_K = torch.nn.Parameter(torch.randn(3, 2, device=device))
W_V = torch.nn.Parameter(torch.randn(3, 2, device=device))

In [7]:
# Inspect the three (3, 2) projection matrices, one after another.
print(W_Q, W_K, W_V, sep="\n")

Parameter containing:
tensor([[-0.9201,  0.2973],
        [ 0.9331, -1.8127],
        [-1.3428,  0.4633]], requires_grad=True)
Parameter containing:
tensor([[ 0.9912, -0.8361],
        [-0.3777,  0.3517],
        [-0.9956, -0.4792]], requires_grad=True)
Parameter containing:
tensor([[-0.1911, -1.3999],
        [ 0.8213,  0.7084],
        [ 1.7573,  0.6895]], requires_grad=True)


In [12]:
# Project the inputs into query / key / value space: (8, 3) @ (3, 2) -> (8, 2).
# The `@` operator, torch.matmul and torch.mm give the same result for 2-D
# operands; each spelling is used once below.
Q = X @ W_Q
K = torch.matmul(X, W_K)
V = torch.mm(X, W_V)

In [13]:
# Show the projected queries, keys and values; each has shape (8, 2).
print(Q, K, V, sep="\n")

tensor([[ 1.2422, -1.1273],
        [ 1.8102, -0.3273],
        [ 3.0384, -2.3324],
        [-0.6300,  1.1227],
        [ 2.1961, -1.3721],
        [ 3.1105, -2.0757],
        [ 0.8456,  2.0153],
        [-2.2161,  1.7135]], grad_fn=<MmBackward0>)
tensor([[-4.7822e-01,  7.2236e-01],
        [ 6.1387e-01,  9.2565e-01],
        [ 1.3355e+00,  1.0992e+00],
        [-6.0549e-01,  9.4737e-04],
        [ 1.1654e+00,  7.9807e-01],
        [ 8.7680e-01,  1.3364e+00],
        [ 2.0058e+00,  3.5585e-01],
        [-5.6115e-01, -9.2353e-01]], grad_fn=<MmBackward0>)
tensor([[ 8.8615e-02,  9.2343e-01],
        [-1.9825e+00, -1.1281e-01],
        [-2.2203e+00, -5.4159e-01],
        [ 2.4837e-01,  4.8382e-01],
        [-2.0137e+00, -6.1348e-01],
        [-2.0985e+00, -3.9164e-03],
        [-4.0344e+00, -1.9628e+00],
        [ 1.2620e+00, -6.6873e-02]], grad_fn=<MmBackward0>)


### Query, Key 매칭

<img src = "https://github.com/dukong1/ComputerVision_latest_paper/blob/main/Ch01.%20Context_Understanding/20230412_113633.png?raw=true">

In [32]:
# Match every query against every key with a dot product:
# (8, 2) @ (2, 8) -> (8, 8) raw scores.
attention_score = torch.matmul(Q, K.T)

# Transposing K swaps its two axes so the inner dimensions line up.
print("K.shape", K.shape)
print("K.T.shape", K.T.shape)

K.shape torch.Size([8, 2])
K.T.shape torch.Size([2, 8])


In [17]:
# Display the query-key score matrix computed in the previous cell.
attention_score  # shape (8, 8)

tensor([[-1.4084, -0.2809,  0.4198, -0.7532,  0.5480, -0.4173,  2.0905,  0.3440],
        [-1.1021,  0.8083,  2.0577, -1.0964,  1.8485,  1.1498,  3.5145, -0.7135],
        [-3.1379, -0.2939,  1.4938, -1.8419,  1.6795, -0.4529,  5.2644,  0.4491],
        [ 1.1123,  0.6525,  0.3927,  0.3825,  0.1618,  0.9480, -0.8642, -0.6833],
        [-2.0414,  0.0781,  1.4247, -1.3310,  1.4644,  0.0920,  3.9168,  0.0348],
        [-2.9870, -0.0120,  1.8724, -1.8854,  1.9685, -0.0466,  5.5004,  0.1715],
        [ 1.0514,  2.3845,  3.3444, -0.5101,  2.5938,  3.4345,  2.4132, -2.3356],
        [ 2.2976,  0.2257, -1.0761,  1.3435, -1.2153,  0.3467, -3.8354, -0.3389]],
       grad_fn=<MmBackward0>)

In [18]:
# Width of each key vector; used for the 1/sqrt(d_k) scaling of the scores.
d_k = K.size(1)
print(d_k)

2


In [19]:
# Scale the raw query-key scores by 1/sqrt(d_k).
# The scores are recomputed from Q and K rather than dividing the existing
# attention_score in place, so this cell is idempotent: running it again
# (or after a later cell has overwritten attention_score) still produces the
# scaled scores instead of scaling an already-processed matrix a second time.
attention_score = Q.matmul(K.T) / (d_k ** 0.5)
# Equivalent: Q.matmul(K.T) / math.sqrt(d_k)   (requires `import math`)

In [20]:
# Normalise each row (one row per query) into weights that sum to 1.
# NOTE: this overwrites attention_score, so running the cell twice applies
# softmax twice; recompute the scaled scores before re-running it.
attention_score = attention_score.softmax(dim=1)

In [21]:
# Attention weights that the first query (row 0) assigns to each of the 8 keys.
attention_score[0,:]

tensor([0.0336, 0.0745, 0.1223, 0.0534, 0.1339, 0.0677, 0.3986, 0.1159],
       grad_fn=<SliceBackward0>)

In [25]:
# Sanity check: after softmax the weights of the first row should sum to 1.
torch.sum(attention_score[0, :])

tensor(1., grad_fn=<SumBackward0>)

In [26]:
# Same check for every row at once: summing along dim=1 should give all ones.
torch.sum(attention_score, dim=1)

tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
       grad_fn=<SumBackward1>)

#### 연산을 줄이기 위해 Q를 먼저 sqrt(d_k)로 나눈 뒤 K.T와 곱할 수도 있음

Q(8×2)를 먼저 나누면 (8×8) 크기의 attention score 전체를 나누는 것보다 나눗셈 횟수가 적다.

```python
d_k = K.shape[1]
print(d_k)

attention_score = (Q / (d_k**0.5)).matmul(K.T)
```

In [28]:
# Weighted sum of the value vectors for every query:
# (8, 8) @ (8, 2) -> (8, 2).
Z = torch.matmul(attention_score, V)

In [30]:
# Confirm the attention output has one 2-dim vector per input row.
Z.size()

torch.Size([8, 2])

# 2. MultiHeadAttention

<img src = "https://github.com/dukong1/ComputerVision_latest_paper/blob/main/Ch01.%20Context_Understanding/20230412_122155.png?raw=true">