# Data pre-processing & tokenization

CS685 Spring 2022 <br />
Apr. 2, 2022<br />
Hongyu Tu <br />

In [1]:
import torch
import pickle
import numpy as np
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForMaskedLM

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [3]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
model = AutoModelForMaskedLM.from_pretrained("bert-base-chinese")

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
tmp_lst = []

for i in ['danmu', 'comment']:
    with open('{}_token.pkl'.format(i), 'rb') as f:
        tmp = pickle.load(f)
        tmp_lst.append(tmp)
    with open('{}_dist.pkl'.format(i), 'rb') as f:
        tmp = pickle.load(f)
        tmp_lst.append(tmp)
        
danmu_token, danmu_dist, comment_token, comment_dist = tmp_lst

In [5]:
def tok_ten(token_lst, dist_lst):
    lst = [tokenizer(i)['input_ids'][1:-1] for i in token_lst]
    max_length = np.max(np.array([len(i) for i in lst]))
    tmp_lst = []
    for i in range(len(lst)):
        tmp = np.concatenate([lst[i], -np.ones(1 + max_length - len(lst[i]))]) 
        tmp_lst.append(tmp)
    output = torch.from_numpy(np.array(tmp_lst).astype(np.float32)).to(device=device), \
             torch.from_numpy(np.array(dist_lst).astype(np.float32)).to(device=device)
    return output

In [6]:
danmu_x, danmu_y = tok_ten(danmu_token, danmu_dist)
comment_x, comment_y = tok_ten(comment_token, comment_dist)

In [7]:
from torch.autograd import Variable

In [69]:
X_train, X_test, y_train, y_test = train_test_split(danmu_x, danmu_y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [70]:
X_train.shape[0], X_val.shape[0], X_test.shape[0]
inputDim, outputDim = X_train.shape[1], y_train.shape[1]

In [83]:
def argmax(x):
    return torch.argmax(x, dim=1)

In [84]:
size1, size2 = inputDim, outputDim
L1, L2, L3 = 150, 250, 250

model = nn.Sequential(
    
    nn.Linear(size1, L1),
    nn.LeakyReLU(0.1),
    nn.Linear(L1, L2),
    nn.LeakyReLU(0.1),
    nn.Linear(L2, L3),
    nn.LeakyReLU(0.1),
    nn.Linear(L3, size2),
    nn.LogSoftmax(dim=1)
    argmax()
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

SyntaxError: invalid syntax (Temp/ipykernel_17572/3510429219.py, line 14)

In [78]:
length = len(X_train)
learningRate = 1e-8
epochs = 10000

In [80]:
loss_his = []
for epoch in tqdm(range(epochs)):
    rand_idx = np.random.permutation(length)[:int(length/10)]
    x = X_train[rand_idx]
    y = y_train[rand_idx]
    
    model.train()
    outputs = model(x)
    loss = criterion(outputs, y)
    loss_his.append(loss.item())

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    model.eval()
    outputs = model(X_val)
    loss = criterion(outputs, y_val)
    
    if epoch % int(epochs/10) == 0:
        print('epoch {}, loss {}'.format(epoch, loss.item()))

  0%|                                                                                | 9/10000 [00:00<01:51, 89.68it/s]

epoch 0, loss 82.76221466064453


 10%|███████▋                                                                    | 1017/10000 [00:09<01:20, 111.70it/s]

epoch 1000, loss 3.9954311847686768


 20%|███████████████▍                                                            | 2025/10000 [00:18<01:10, 112.41it/s]

epoch 2000, loss 3.9777848720550537


 30%|██████████████████████▉                                                     | 3019/10000 [00:27<01:03, 109.47it/s]

epoch 3000, loss 3.975257396697998


 40%|██████████████████████████████▌                                             | 4014/10000 [00:35<00:53, 112.02it/s]

epoch 4000, loss 3.985334873199463


 50%|██████████████████████████████████████▏                                     | 5022/10000 [00:44<00:45, 109.77it/s]

epoch 5000, loss 4.00213623046875


 60%|█████████████████████████████████████████████▋                              | 6018/10000 [00:53<00:35, 111.04it/s]

epoch 6000, loss 4.0153093338012695


 70%|█████████████████████████████████████████████████████▎                      | 7012/10000 [01:02<00:26, 111.56it/s]

epoch 7000, loss 4.045955181121826


 80%|████████████████████████████████████████████████████████████▉               | 8020/10000 [01:11<00:17, 111.57it/s]

epoch 8000, loss 4.067865371704102


 90%|████████████████████████████████████████████████████████████████████▌       | 9016/10000 [01:20<00:08, 112.03it/s]

epoch 9000, loss 4.109577178955078


100%|███████████████████████████████████████████████████████████████████████████| 10000/10000 [01:29<00:00, 112.10it/s]


In [61]:
outputs = model(X_train)

In [67]:
torch.argsort(outputs[5])

tensor([123, 105, 118, 102,  34, 104,  52,   8, 113,  70, 108,  82,  43, 100,
         27,   7,  64,  18,  99, 101,  76,   9,  12, 122,  58, 120, 103, 119,
         89,  95, 124, 125,  94, 116,   0, 121,  33,  91,  42,  17,  31,  21,
         85,  10,  49,  87,  22,  32,  16,  15,  11, 115, 114,   2,  28, 112,
         55,  74,  68,  62,  29,  67,  30,  86,  92,  57,  51,  40,   1,  19,
         84,  97,  50,  23,  90,  56,  41,  61,   4,  88,  60,  66,  48,  54,
        110, 117,  63, 106,  83,  59,  20,  98,   5,  93,   3,  39,  96,  24,
          6,  73,  25,  69,  38,  79,  47,  75,  45, 111,  13,  80,  14,  72,
         26,  81,  77,  36,  53,  37,  46,  44,  65,  71, 109, 107,  78,  35],
       device='cuda:0')

In [68]:
torch.argsort(y_train[5])

tensor([124, 125, 122, 123, 121, 120, 118, 119, 115, 114, 112, 113, 117, 116,
        110, 111, 103, 102, 100, 101,  97,  96,  98,  99, 107, 106, 104, 105,
        109, 108,  94,  95,  79,  78,  76,  77,  73,  72,  74,  75,  67,  66,
         64,  65,  69,  68,  70,  71,  87,  86,  84,  85,  81,  80,  82,  83,
         91,  90,  88,  89,  93,  92,  62,  63,  31,  30,  28,  29,  25,  24,
         26,  27,  19,  18,  16,  17,  21,  20,  22,  23,   7,   6,   4,   5,
          1,   0,   2,   3,  11,  10,   8,   9,  13,  12,  14,  15,  55,  47,
         44,  46,  41,  40,  42,  43,  35,  34,  32,  33,  37,  36,  38,  39,
         59,  54,  52,  53,  49,  48,  50,  51,  61,  58,  56,  57,  60,  45],
       device='cuda:0')

In [83]:
plt.plot(list(range(len(loss_his))), loss_his)
plt.show()