In [1]:
import torch

In [2]:
x = torch.arange(12)
x

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [3]:
x.shape

torch.Size([12])

In [4]:
x.numel()

12

In [6]:
X = x.reshape(3, 4)
X

tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]])

In [8]:
torch.zeros((2, 3, 4))

tensor([[[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]],

        [[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]]])

In [9]:
torch.ones((2, 3, 4))

tensor([[[1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.]],

        [[1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.]]])

In [12]:
torch.tensor([[[2, 1], [3, 4], [8, 3]]]).shape

torch.Size([1, 3, 2])

In [13]:
torch.rand(3, 4)

tensor([[0.2230, 0.2819, 0.2863, 0.5584],
        [0.9694, 0.0679, 0.6514, 0.6581],
        [0.9321, 0.7068, 0.3062, 0.6984]])

In [14]:
x = torch.tensor([1.0, 2, 4, 8])
y = torch.tensor([2, 2, 2, 2])

x + y, x - y, x * y, x**y

(tensor([ 3.,  4.,  6., 10.]),
 tensor([-1.,  0.,  2.,  6.]),
 tensor([ 2.,  4.,  8., 16.]),
 tensor([ 1.,  4., 16., 64.]))

In [15]:
torch.exp(x)

tensor([2.7183e+00, 7.3891e+00, 5.4598e+01, 2.9810e+03])

In [42]:
x = torch.arange(12, dtype=torch.float32).reshape(3, 4)
y = torch.tensor([[2.0, 1, 4, 3], [1, 2, 3, 4], [4, 3, 2, 1]])

torch.cat((x, y), dim=0), torch.cat((x, y), dim=1)

(tensor([[ 0.,  1.,  2.,  3.],
         [ 4.,  5.,  6.,  7.],
         [ 8.,  9., 10., 11.],
         [ 2.,  1.,  4.,  3.],
         [ 1.,  2.,  3.,  4.],
         [ 4.,  3.,  2.,  1.]]),
 tensor([[ 0.,  1.,  2.,  3.,  2.,  1.,  4.,  3.],
         [ 4.,  5.,  6.,  7.,  1.,  2.,  3.,  4.],
         [ 8.,  9., 10., 11.,  4.,  3.,  2.,  1.]]))

In [43]:
x > y

tensor([[False, False, False, False],
        [ True,  True,  True,  True],
        [ True,  True,  True,  True]])

In [21]:
x.sum()

tensor(66.)

In [44]:
a = torch.arange(3).reshape((1, 3, 1))
b = torch.arange(2).reshape((1, 1, 2))
a + b

tensor([[[0, 1],
         [1, 2],
         [2, 3]]])

In [25]:
x[-1], x[1:3]

(tensor([ 8.,  9., 10., 11.]),
 tensor([[ 4.,  5.,  6.,  7.],
         [ 8.,  9., 10., 11.]]))

In [27]:
x[1, 2] = 9
x

tensor([[ 0.,  1.,  2.,  3.],
        [ 4.,  5.,  9.,  7.],
        [ 8.,  9., 10., 11.]])

In [29]:
x[0:2, :]=12
x

tensor([[12., 12., 12., 12.],
        [12., 12., 12., 12.],
        [ 8.,  9., 10., 11.]])

#### 减少内存

In [34]:
before = id(y)
y[:] = x + y
id(y) == before

True

In [36]:
z = torch.zeros_like(y)
print(id(z))
z[:] = x + y
print(id(z))

2531343019040
2531343019040


In [32]:
before = id(x)
x += y
id(x) == before

True

#### 类型转化

In [41]:
a = x.numpy()
b = torch.tensor(a)
type(a), type(b)

(numpy.ndarray, torch.Tensor)

In [39]:
a = torch.tensor([3.5])
a, a.item(), float(a), int(a)

(tensor([3.5000]), 3.5, 3.5, 3)

## Pandas

In [45]:
import pandas as pd
import os

In [71]:
os.makedirs(os.path.join('.', 'data'), exist_ok=True)
data_file = os.path.join('.', 'data', 'house_tiny.csv')

with open(data_file, 'w') as f:
    f.write('NumRooms,Alley,Price\n')
    f.write('NA,Pave,127500\n')  # 每行表示一个数据样本
    f.write('2,NA,106000\n')
    f.write('4,NA,178100\n')
    f.write('NA,NA,140000\n')

In [72]:
data = pd.read_csv(data_file)
print(data, type(data))
data

   NumRooms Alley   Price
0       NaN  Pave  127500
1       2.0   NaN  106000
2       4.0   NaN  178100
3       NaN   NaN  140000 <class 'pandas.core.frame.DataFrame'>


Unnamed: 0,NumRooms,Alley,Price
0,,Pave,127500
1,2.0,,106000
2,4.0,,178100
3,,,140000


### 修改缺失值

In [73]:
inputs, outputs = data.iloc[:, 0:2], data.iloc[:,2]
# inputs = inputs.fillna(inputs.mean()) 错误
inputs['NumRooms'] = inputs['NumRooms'].fillna(inputs['NumRooms'].mean())
inputs

Unnamed: 0,NumRooms,Alley
0,3.0,Pave
1,2.0,
2,4.0,
3,3.0,


In [77]:
inputs = pd.get_dummies(inputs, dummy_na=True)
inputs['Alley_Pave'] = inputs['Alley_Pave'].astype(int)
inputs['Alley_nan'] = inputs['Alley_nan'].astype(int)
inputs

Unnamed: 0,NumRooms,Alley_Pave,Alley_nan
0,3.0,1,0
1,2.0,0,1
2,4.0,0,1
3,3.0,0,1


### 转换为张量格式

In [78]:
type(inputs)

pandas.core.frame.DataFrame

In [80]:
x, y = torch.tensor(inputs.values), torch.tensor(outputs.values)
x, y

(tensor([[3., 1., 0.],
         [2., 0., 1.],
         [4., 0., 1.],
         [3., 0., 1.]], dtype=torch.float64),
 tensor([127500, 106000, 178100, 140000]))

### 小结

* `pandas`软件包是Python中常用的数据分析工具中，`pandas`可以与张量兼容。
* 用`pandas`处理缺失的数据时，我们可根据情况选择用插值法和删除法。

### 练习

创建包含更多行和列的原始数据集。

1. 删除缺失值最多的列。
2. 将预处理后的数据集转换为张量格式。


In [83]:
data_file_1 = os.path.join('.', 'data', 'test_tiny.csv')
with open(data_file_1, 'w') as f:
    f.write('A,B,C,D\n')
    f.write('1,2,NA,4\n')
    f.write('NA,NA,NA,ran\n')
    f.write('1,2,NA,zhao\n')
    f.write('NA,2,3,NA\n')

data = pd.read_csv(data_file_1)
data

Unnamed: 0,A,B,C,D
0,1.0,2.0,,4
1,,,,ran
2,1.0,2.0,,zhao
3,,2.0,3.0,


- `isna()` 和 `isnull()`相同
- `sum(axis=)`，使用时`True`被视为1
- `drop`
- `idxmax()`，返回最大值所在的索引

In [84]:
missing_counts = data.isna().sum()
data_cleaned = data.drop(columns = missing_counts.idxmax())
data_cleaned

Unnamed: 0,A,B,D
0,1.0,2.0,4
1,,,ran
2,1.0,2.0,zhao
3,,2.0,


In [91]:
inputs = data_cleaned.iloc[:, 0:2]
outputs = data_cleaned.iloc[:, 2]

inputs[['A','B']] = inputs[['A','B']].fillna(inputs[['A','B']].mean())
inputs

Unnamed: 0,A,B
0,1.0,2.0
1,1.0,2.0
2,1.0,2.0
3,1.0,2.0


In [93]:
outputs = pd.get_dummies(outputs, dummy_na=True).astype(int)

outputs

Unnamed: 0,4,ran,zhao,NaN
0,1,0,0,0
1,0,1,0,0
2,0,0,1,0
3,0,0,0,1


In [94]:
x, y = torch.tensor(inputs.values), torch.tensor(outpus.values)
x, y

(tensor([[1., 2.],
         [1., 2.],
         [1., 2.],
         [1., 2.]], dtype=torch.float64),
 tensor([127500, 106000, 178100, 140000]))

#### 问题
- reshape() 返回的是原张量的**视图**（view）

In [95]:
a = torch.arange(12)
b = a.reshape((3, 4))
b[:] = 2
a

tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

## 线性代数

### 标量
只有一个元素的张量biaoshi

In [100]:
x = torch.tensor([3.0])
y = torch.tensor(2.0)
x + y, x * y

(tensor([5.]), tensor([6.]))

### 向量

In [101]:
x = torch.arange(4)
x

tensor([0, 1, 2, 3])

In [102]:
x[3]

tensor(3)

长度

In [103]:
len(x)

4

In [104]:
x.shape

torch.Size([4])

### 矩阵

In [105]:
A = torch.arange(20).reshape(5, 4)
A

tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11],
        [12, 13, 14, 15],
        [16, 17, 18, 19]])

In [106]:
A.T

tensor([[ 0,  4,  8, 12, 16],
        [ 1,  5,  9, 13, 17],
        [ 2,  6, 10, 14, 18],
        [ 3,  7, 11, 15, 19]])

In [107]:
B = torch.tensor([[1, 2, 3], [2, 0, 4], [3, 4, 5]])
B == B.T

tensor([[True, True, True],
        [True, True, True],
        [True, True, True]])

### 张量

In [109]:
X = torch.arange(24).reshape(2, 3, 4)
X

tensor([[[ 0,  1,  2,  3],
         [ 4,  5,  6,  7],
         [ 8,  9, 10, 11]],

        [[12, 13, 14, 15],
         [16, 17, 18, 19],
         [20, 21, 22, 23]]])

In [110]:
A = torch.arange(20, dtype=torch.float32).reshape(5, 4)
B = A.clone()
A, A + B

(tensor([[ 0.,  1.,  2.,  3.],
         [ 4.,  5.,  6.,  7.],
         [ 8.,  9., 10., 11.],
         [12., 13., 14., 15.],
         [16., 17., 18., 19.]]),
 tensor([[ 0.,  2.,  4.,  6.],
         [ 8., 10., 12., 14.],
         [16., 18., 20., 22.],
         [24., 26., 28., 30.],
         [32., 34., 36., 38.]]))

**哈达玛积**：按元素乘法

In [111]:
A * B

tensor([[  0.,   1.,   4.,   9.],
        [ 16.,  25.,  36.,  49.],
        [ 64.,  81., 100., 121.],
        [144., 169., 196., 225.],
        [256., 289., 324., 361.]])

In [112]:
a = 2
X = torch.arange(24).reshape(2, 3, 4)
a + X, (a * X).shape

(tensor([[[ 2,  3,  4,  5],
          [ 6,  7,  8,  9],
          [10, 11, 12, 13]],
 
         [[14, 15, 16, 17],
          [18, 19, 20, 21],
          [22, 23, 24, 25]]]),
 torch.Size([2, 3, 4]))

### 降维

In [113]:
x = torch.arange(4, dtype=torch.float32)
x, x.sum()

(tensor([0., 1., 2., 3.]), tensor(6.))

In [114]:
A.shape, A.sum()

(torch.Size([5, 4]), tensor(190.))

In [115]:
A_sum_axis0 = A.sum(axis=0)
A_sum_axis0, A_sum_axis0.shape

(tensor([40., 45., 50., 55.]), torch.Size([4]))

In [116]:
A_sum_axis1 = A.sum(axis=1)
A_sum_axis1, A_sum_axis1.shape

(tensor([ 6., 22., 38., 54., 70.]), torch.Size([5]))

沿着行和列对矩阵求和，等价于对矩阵的所有元素进行求和

In [117]:
A.sum(axis=[0, 1])

tensor(190.)

#### 求平均

In [118]:
A.mean(), A.sum() / A.numel()

(tensor(9.5000), tensor(9.5000))

In [120]:
A.mean(axis=0), A.sum(axis=0) / A.shape[0]

(tensor([ 8.,  9., 10., 11.]), tensor([ 8.,  9., 10., 11.]))

In [121]:
A.mean(axis=1), A.sum(axis=1) / A.shape[1]

(tensor([ 1.5000,  5.5000,  9.5000, 13.5000, 17.5000]),
 tensor([ 1.5000,  5.5000,  9.5000, 13.5000, 17.5000]))

### 非降维求和

计算总和或均值时保持**轴数不变**，可以**广播**机制

In [123]:
sum_A = A.sum(axis=1, keepdims=True)
sum_A

tensor([[ 6.],
        [22.],
        [38.],
        [54.],
        [70.]])

In [124]:
A / sum_A

tensor([[0.0000, 0.1667, 0.3333, 0.5000],
        [0.1818, 0.2273, 0.2727, 0.3182],
        [0.2105, 0.2368, 0.2632, 0.2895],
        [0.2222, 0.2407, 0.2593, 0.2778],
        [0.2286, 0.2429, 0.2571, 0.2714]])

In [125]:
A.cumsum(axis=0)

tensor([[ 0.,  1.,  2.,  3.],
        [ 4.,  6.,  8., 10.],
        [12., 15., 18., 21.],
        [24., 28., 32., 36.],
        [40., 45., 50., 55.]])

In [128]:
y = torch.ones(4, dtype=torch.float32)
x, y, torch.dot(x, y)

(tensor([0., 1., 2., 3.]), tensor([1., 1., 1., 1.]), tensor(6.))

In [129]:
torch.sum(x*y)

tensor(6.)

### 矩阵-向量积——`mv`

In [131]:
A.shape, x.shape, torch.mv(A, x)

(torch.Size([5, 4]), torch.Size([4]), tensor([ 14.,  38.,  62.,  86., 110.]))

### 矩阵-向量积——`mm`

In [132]:
B = torch.ones(4, 3)
torch.mm(A, B)

tensor([[ 6.,  6.,  6.],
        [22., 22., 22.],
        [38., 38., 38.],
        [54., 54., 54.],
        [70., 70., 70.]])

### 范数

**L2**范数

In [135]:
u = torch.tensor([-3, 4.])
torch.norm(u)

tensor(5.)

**L1**范数，绝对值之和

In [137]:
torch.sum(torch.abs(u))
torch.abs(u).sum()

tensor(7.)

Forbenius norm —— 对于矩阵而言也是用.norm

In [165]:
torch.norm(torch.ones(4, 9))

tensor(6.)

### 按特定的轴求和

In [139]:
a = torch.ones((2, 5, 4))
a.shape

torch.Size([2, 5, 4])

In [140]:
a.sum().shape

torch.Size([])

In [144]:
a.sum(axis=1), a.sum(axis=1).shape

(tensor([[5., 5., 5., 5.],
         [5., 5., 5., 5.]]),
 torch.Size([2, 4]))

In [145]:
a.sum(axis=[0, 2]), a.sum(axis=[0, 2]).shape

(tensor([8., 8., 8., 8., 8.]), torch.Size([5]))

In [149]:
a.sum(axis=1, keepdims=True).shape

torch.Size([2, 1, 4])

len()函数只返回第一个轴的长度

In [153]:
a.shape, len(a)

(torch.Size([2, 5, 4]), 2)

In [164]:
import numpy as np
A = torch.ones((2, 3, 4))
torch.linalg.norm(A), np.linalg.norm(A)

(tensor(4.8990), 4.8989797)