In [None]:
import torch

In [None]:
### a tensor is described by three attributes: its dimensions (shape), its order and its dtype

ex = torch.randn((3, 2))
ex.shape, ex.dim(), ex.dtype

In [None]:
### the order of a tensor is its number of axes

# order 0: scalar
x_scalar = torch.tensor(3.0)

# order 1: vector
x_vector = torch.tensor([3.0, 4.0])

# order 2: matrix
x_matrix = torch.tensor([[1.0, 2.0], [3.0, 4.0]])

# order n: general tensor (five axes in this example)
x_tensor = torch.rand((2, 3, 1, 2, 3))

print(*(t.dim() for t in (x_scalar, x_vector, x_matrix, x_tensor)))

In [None]:
### properties - element-wise operations return the same shape as their operands

x = torch.randn((3, 4))
y = torch.randn((3, 4))

torch.add(x, y).shape, torch.div(x, y).shape

In [None]:
# multiplying a tensor by a scalar keeps the tensor's shape

torch.mul(x, 4).shape

In [None]:
### reductions vs non-reductions
x = torch.arange(12, dtype=torch.float32).view(3, 4)
print(x)

# reducing along a given axis removes that axis from the result
torch.sum(x), torch.mean(x), torch.sum(x, dim=1)

In [None]:
# keepdim keeps the reduced axis (with size 1) so the row sums broadcast; now each row sums to 1

x / x.sum(axis=1, keepdim=True)

In [None]:
# element-wise (Hadamard) product vs. dot product vs. matrix multiplication


# --- element-wise product ---
print('element wise product')
a = torch.tensor([1, 2, 3])
b = torch.tensor([2, 2, 2])

print(torch.mul(a, b))
print('\n----------------')

print('dot product\n')

# --- dot product: the sum of the element-wise products ---
print(a.dot(b) == torch.sum(a * b))

print('\n----------------')
print('mat mul')

# --- matmul: entry (i, j) is the dot product of row i of X and column j of Y ---
X = torch.randn((3, 4))
Y = torch.randn((4, 5))

x_rows, x_cols = X.shape
y_rows, y_cols = Y.shape

Z = torch.zeros((x_rows, y_cols))
for i in range(Z.shape[0]):
    for j in range(Z.shape[1]):
        Z[i, j] = X[i].dot(Y[:, j])

Z2 = torch.matmul(X, Y)

print(torch.allclose(Z, Z2, atol=1e-6))

Z, Z2

In [None]:
# element-wise (Hadamard) product
a = torch.tensor([1, 2, 3])
b = torch.tensor([2, 2, 2])
torch.mul(a, b)

In [None]:
# dot product: equal to the sum of the element-wise products
a.dot(b) == torch.sum(a * b)

In [None]:
# matmul: entry (i, j) of the product is the dot product of row i of X and column j of Y
X = torch.randn(3, 4)
Y = torch.randn(4, 5)

x_rows, x_cols = X.shape
y_rows, y_cols = Y.shape

Z = torch.zeros(x_rows, y_cols)
for i in range(X.size(0)):
    for j in range(Y.size(1)):
        # Y.T[j] is column j of Y
        Z[i, j] = torch.dot(X[i], Y.T[j])

Z2 = torch.matmul(X, Y)

print(torch.allclose(Z, Z2, atol=1e-6))

Z, Z2

In [None]:
# matrix-vector product (one dot product per row) as a linear transformation

A = torch.ones((2, 3))  # shape (2, 3)
v = torch.tensor([1.0, 1.0, 1.0], dtype=torch.float32)  # shape (3,)

print(A.shape, v.shape)

torch.mv(A, v)  # (2, 3) times (3,) gives (2,)

In [None]:
# norms
dummy = torch.tensor([3, 4], dtype=torch.float32)

# ord = 1 is the Manhattan (L1) norm, ord = 2 is the Euclidean (L2) norm.
# torch.linalg.vector_norm replaces the deprecated torch.norm.
torch.linalg.vector_norm(dummy, ord=1), torch.linalg.vector_norm(dummy, ord=2)

In [None]:
### l1 and l2 written out by hand: sum of absolute values, and root of the sum of squares
dummy.abs().sum(), torch.sum(torch.square(dummy)) ** 0.5

In [None]:
# Frobenius norm: square root of the sum of all squared entries
dummy_matrix = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=torch.float32)
print(dummy_matrix.shape)

# ord="fro" names the Frobenius norm explicitly; torch.norm is deprecated.
# The second value is the same norm computed by hand.
torch.linalg.matrix_norm(dummy_matrix, ord="fro"), (dummy_matrix ** 2).sum() ** 0.5

### 2.3.13 exercises

In [None]:
# transpose of transpose
a = torch.randn(4, 5)

# transposing twice gives back the original matrix (element-wise comparison)
torch.eq(a.t().t(), a)

In [None]:
# transpose properties: the sum of the transposes equals the transpose of the sum
a = torch.randn(4, 5)
b = torch.randn(4, 5)

torch.eq(a.T + b.T, (a + b).T)

In [None]:
# symmetric: a matrix plus its transpose equals its own transpose
A = torch.randn(2, 2)

torch.eq(A + A.T, (A + A.T).T)

In [None]:
# show the sum of A and its transpose
torch.add(A, A.T)

In [None]:
# a symmetric matrix is equal to its own transpose
A = torch.tensor([[1, 2, 3], [2, 0, 4], [3, 4, 5]])

# print the comparison: only the last expression of a cell is displayed automatically
print(A == A.T)

A, A.T

In [None]:
X = torch.randn((2, 3, 4))
Y = torch.randn((3, 9))

# len() of a tensor is the size of its leading (first) axis
(len(X), len(Y))

In [None]:
# reduction division - fails because broadcasting is wrong:
# A has shape (2, 3) but A.sum(axis=1) has shape (2,), which cannot broadcast against it

A = torch.randn(2, 3)

# The error is the point of this cell, so catch it and show the message
# instead of letting the exception stop a full notebook run.
try:
    A / A.sum(axis=1)
except RuntimeError as err:
    print(f"RuntimeError: {err}")

In [None]:
# reduction division: keepdim leaves the sums with shape (2, 1) so they broadcast across each row

A = torch.randn((2, 3))

torch.div(A, torch.sum(A, dim=1, keepdim=True))

In [None]:
# values -3, -1, 1, ..., 43 arranged as a (2, 3, 4) tensor
X = (torch.arange(24, dtype=torch.float32) + 1) * 2 - 5
X = X.reshape((2, 3, 4))
# print explicitly: a bare expression in the middle of a cell is not displayed
print(X)

# summing over axis 0, 1, 2 leaves shapes 3x4, 2x4, 2x3
X.sum(axis=0).shape, X.sum(axis=1).shape, X.sum(axis=2).shape

In [None]:
# frobenius norm: the 2-norm taken over all elements of X.
# torch.linalg.vector_norm flattens its input when no dim is given; it replaces the deprecated torch.norm.
torch.linalg.vector_norm(X)

In [None]:
A = torch.randn((2**10, 2**16))
B = torch.randn((2**16, 2**5))
C = torch.randn((2**5, 2**14))

# is there a difference between computing (AB)C and A(BC)?

In [None]:
%timeit 100

res = (A@B) @ C

In [None]:
%timeit 100

res = A @ (B@C)

In [None]:
# Pick the MPS device when it is available and fall back to the CPU otherwise,
# so that `mps_device` is always defined for the cells that use it.
mps_device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

In [None]:
import torch
import torch.mps
import time

# Benchmark of the two evaluation orders of the product A @ B @ C.
# This cell needs a working MPS backend: the matrices are created on the MPS
# device and memory usage is read through torch.mps.
mps_device = torch.device('mps')

# Initialize large matrices with Gaussian random variables
A = torch.randn(2**10, 2**16, device=mps_device)
B = torch.randn(2**16, 2**5, device=mps_device)
C = torch.randn(2**5, 2**14, device=mps_device)


def current_memory():
    """Return the value reported by torch.mps.current_allocated_memory()."""
    return torch.mps.current_allocated_memory()


def multiply_and_measure(A, B, C, operation_order="AB_C"):
    """Time one evaluation of the triple product and report its memory delta.

    Args:
        A, B, C: matrices whose shapes are compatible for A @ B @ C.
        operation_order: "AB_C" evaluates (A @ B) @ C (the default);
            "A_BC" evaluates A @ (B @ C).

    Returns:
        (time_taken, memory_used): elapsed seconds and the change in
        current_memory() across the computation.

    Raises:
        ValueError: if operation_order is not "AB_C" or "A_BC".
    """
    if operation_order not in ("AB_C", "A_BC"):
        raise ValueError(f"unknown operation_order: {operation_order!r}")

    # Wait for previously queued MPS work so it is not billed to this run
    torch.mps.synchronize()
    memory_before = current_memory()
    start_time = time.perf_counter()

    if operation_order == "AB_C":
        AB = torch.matmul(A, B)
        result = torch.matmul(AB, C)
    else:
        BC = torch.matmul(B, C)
        result = torch.matmul(A, BC)

    # Wait for the matmul kernels to complete before stopping the clock
    torch.mps.synchronize()
    end_time = time.perf_counter()
    # Read memory while the intermediate and the result are still referenced
    memory_after = current_memory()

    time_taken = end_time - start_time
    memory_used = memory_after - memory_before
    return time_taken, memory_used


# Number of iterations for averaging
num_iterations = 100

# Start MPS profiler for performance tracing
torch.mps.profiler.start()
try:
    # Compute (A B) C multiple times
    total_time_AB_C = 0
    total_memory_AB_C = 0
    for _ in range(num_iterations):
        time_taken, memory_used = multiply_and_measure(A, B, C, "AB_C")
        total_time_AB_C += time_taken
        total_memory_AB_C += memory_used

    # Compute A (B C) multiple times
    total_time_A_BC = 0
    total_memory_A_BC = 0
    for _ in range(num_iterations):
        time_taken, memory_used = multiply_and_measure(A, B, C, "A_BC")
        total_time_A_BC += time_taken
        total_memory_A_BC += memory_used
finally:
    # Stop MPS profiler after performance tracing, even if a run failed
    torch.mps.profiler.stop()

# Calculate and print the average timing and memory results
average_time_AB_C = total_time_AB_C / num_iterations
average_memory_AB_C = total_memory_AB_C / num_iterations
average_time_A_BC = total_time_A_BC / num_iterations
average_memory_A_BC = total_memory_A_BC / num_iterations

print(f"Average time for (AB)C: {average_time_AB_C} seconds")
print(f"Average memory for (AB)C: {average_memory_AB_C} bytes")
print(f"Average time for A(BC): {average_time_A_BC} seconds")
print(f"Average memory for A(BC): {average_memory_A_BC} bytes")

# # Empty cache if needed
# torch.mps.empty_cache()


In [None]:
# Number of scalar multiplications for each matrix multiplication:
# an (m x k) @ (k x n) product needs m * k * n of them.

# Calculations for BC: (2**16 x 2**5) @ (2**5 x 2**14)
calculations_BC = 2**16 * 2**5 * 2**14

# Calculations for A(BC): (2**10 x 2**16) @ (2**16 x 2**14).
# The shared dimension here is 2**16 (the rows of BC), not 2**5.
calculations_A_BC = 2**10 * 2**16 * 2**14

# Total calculations for A(BC)
total_calculations_A_BC = calculations_BC + calculations_A_BC
total_calculations_A_BC


In [None]:
import torch
import torch.mps
import time

# Benchmark of the two evaluation orders of the product A @ B @ C.
# This cell needs a working MPS backend: the matrices are created on the MPS
# device and memory usage is read through torch.mps.
mps_device = torch.device('mps')

# Initialize large matrices with Gaussian random variables
A = torch.randn(2**10, 2**16, device=mps_device)
B = torch.randn(2**16, 2**5, device=mps_device)
C = torch.randn(2**5, 2**14, device=mps_device)


def current_memory():
    """Return the value reported by torch.mps.current_allocated_memory()."""
    return torch.mps.current_allocated_memory()


def multiply_and_measure(A, B, C, operation_order):
    """Time one evaluation of the triple product and report its memory delta.

    Args:
        A, B, C: matrices whose shapes are compatible for A @ B @ C.
        operation_order: "A_BC" evaluates A @ (B @ C);
            "AB_C" evaluates (A @ B) @ C.

    Returns:
        (time_taken, memory_used): elapsed seconds and the change in
        current_memory() across the computation.

    Raises:
        ValueError: if operation_order is not "A_BC" or "AB_C"; without this
            check an unknown value would time an empty block.
    """
    if operation_order not in ("A_BC", "AB_C"):
        raise ValueError(f"unknown operation_order: {operation_order!r}")

    # Wait for previously queued MPS work so it is not billed to this run
    torch.mps.synchronize()
    memory_before = current_memory()
    start_time = time.perf_counter()

    if operation_order == "A_BC":
        BC = torch.matmul(B, C)
        result = torch.matmul(A, BC)
    else:
        AB = torch.matmul(A, B)
        result = torch.matmul(AB, C)

    # Wait for the matmul kernels to complete before stopping the clock
    torch.mps.synchronize()
    end_time = time.perf_counter()
    # Read memory while the intermediate and the result are still referenced
    memory_after = current_memory()

    time_taken = end_time - start_time
    memory_used = memory_after - memory_before

    return time_taken, memory_used


# Number of iterations for averaging
num_iterations = 100

# Start MPS profiler for performance tracing
torch.mps.profiler.start()
try:
    # Compute A (B C) multiple times
    total_time_A_BC = 0
    total_memory_A_BC = 0
    for _ in range(num_iterations):
        time_taken, memory_used = multiply_and_measure(A, B, C, "A_BC")
        total_time_A_BC += time_taken
        total_memory_A_BC += memory_used

    # Compute (A B) C multiple times
    total_time_AB_C = 0
    total_memory_AB_C = 0
    for _ in range(num_iterations):
        time_taken, memory_used = multiply_and_measure(A, B, C, "AB_C")
        total_time_AB_C += time_taken
        total_memory_AB_C += memory_used
finally:
    # Stop MPS profiler after performance tracing, even if a run failed
    torch.mps.profiler.stop()

# Calculate and print the average timing and memory results
average_time_A_BC = total_time_A_BC / num_iterations
average_memory_A_BC = total_memory_A_BC / num_iterations
average_time_AB_C = total_time_AB_C / num_iterations
average_memory_AB_C = total_memory_AB_C / num_iterations

print(f"Average time for A(BC): {average_time_A_BC} seconds")
print(f"Average memory for A(BC): {average_memory_A_BC} bytes")
print(f"Average time for (AB)C: {average_time_AB_C} seconds")
print(f"Average memory for (AB)C: {average_memory_AB_C} bytes")


In [None]:
# Re-create the three large matrices with standard-normal entries on mps_device
A = torch.randn((2**10, 2**16), device=mps_device)
B = torch.randn((2**16, 2**5), device=mps_device)
C = torch.randn((2**5, 2**14), device=mps_device)

# Scalar-multiplication counts: an (m x k) @ (k x n) product needs m * k * n.

### (AB)C
# Calculations for AB: (2**10 x 2**16) @ (2**16 x 2**5)
calculations_AB = 2**10 * 2**16 * 2**5

# Calculations for (AB)C: (2**10 x 2**5) @ (2**5 x 2**14)
calculations_AB_C = 2**10 * 2**5 * 2**14

# Total calculations for (AB)C
total_calculations_AB_C = calculations_AB + calculations_AB_C

### A(BC)
# Calculations for BC: (2**16 x 2**5) @ (2**5 x 2**14)
calculations_BC = 2**16 * 2**5 * 2**14

# Calculations for A(BC): (2**10 x 2**16) @ (2**16 x 2**14).
# The shared dimension here is 2**16 (the rows of BC), not 2**5.
calculations_A_BC = 2**10 * 2**16 * 2**14

# Total calculations for A(BC)
total_calculations_A_BC = calculations_BC + calculations_A_BC

# Both totals in billions of multiplications
print('AB_C vs. A_BC')
total_calculations_AB_C / 1000000000, total_calculations_A_BC / 1000000000

In [None]:
# Release cached MPS memory; skipped when no MPS backend is available
if torch.backends.mps.is_available():
    torch.mps.empty_cache()

In [None]:
# three independent random matrices, each of shape (100, 200)
A, B, C = (torch.randn(100, 200) for _ in range(3))

(A.shape, B.shape, C.shape)

In [None]:
# stacked along a new leading axis
stacked_abc = torch.stack((A, B, C), dim=0)
print(stacked_abc.shape)

# or stacked along a new third (last) axis
torch.stack((A, B, C), dim=-1).shape

In [None]:
# pull out B: index 1 along the stacking axis recovers the second stacked matrix
torch.stack((A, B, C), dim=2)[..., 1] == B