# Data pre-processing & tokenization

CS685 Spring 2022 <br />
Apr. 2, 2022<br />
Hongyu Tu <br />

In [1]:
import torch
import pickle
import numpy as np
import torch.nn as nn
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForMaskedLM

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [3]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
model = AutoModelForMaskedLM.from_pretrained("bert-base-chinese")

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
tmp_lst = []

for i in ['danmu', 'comment']:
    with open('{}_token.pkl'.format(i), 'rb') as f:
        tmp = pickle.load(f)
        tmp_lst.append(tmp)
    with open('{}_dist.pkl'.format(i), 'rb') as f:
        tmp = pickle.load(f)
        tmp_lst.append(tmp)
        
danmu_token, danmu_dist, comment_token, comment_dist = tmp_lst

In [5]:
def tok_ten(token_lst, dist_lst):
    lst = [tokenizer(i)['input_ids'][1:-1] for i in token_lst]
    max_length = np.max(np.array([len(i) for i in lst]))
    tmp_lst = []
    for i in range(len(lst)):
        tmp = np.concatenate([lst[i], -np.ones(1 + max_length - len(lst[i]))]) 
        tmp_lst.append(tmp)
    output = torch.from_numpy(np.array(tmp_lst).astype(np.float32)).to(device=device), \
             torch.from_numpy(np.array(dist_lst).astype(np.float32)).to(device=device)
    return output

In [6]:
danmu_x, danmu_y = tok_ten(danmu_token, danmu_dist)
comment_x, comment_y = tok_ten(comment_token, comment_dist)

In [7]:
from torch.autograd import Variable

In [8]:
X_train, X_test, y_train, y_test = train_test_split(danmu_x, danmu_y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [9]:
X_train.shape[0], X_val.shape[0], X_test.shape[0]
inputDim, outputDim = X_train.shape[1], y_train.shape[1]

(269913, 33739, 33740)

In [31]:
length = len(X_train)
learningRate = 0.1
epochs = 10000

In [32]:
size1, size2 = inputDim, outputDim

model = nn.Sequential(
    nn.Linear(size1, 200),
    nn.LeakyReLU(0.1),
    nn.Linear(200, 200),
    nn.LeakyReLU(0.1),
    nn.Dropout(0.15),
    nn.Linear(200, 100),
    nn.LeakyReLU(0.1),
    nn.Linear(100, size2),
    nn.Softmax(dim=1)
).to(device)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learningRate)

In [None]:
model.train()

In [None]:
loss_his = []
for epoch in tqdm(range(epochs)):
    rand_idx = np.random.permutation(length)[:int(length/100)]
    x = X_train[rand_idx]
    y = y_train[rand_idx]
    
    model.train()
    outputs = model(x)
    loss = criterion(outputs, y)
    loss_his.append(loss)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    model.eval()

    
    if epoch % 1000 == 0:
        print('epoch {}, loss {}'.format(epoch, loss.item()))

In [33]:
for epoch in tqdm(range(epochs)):
    rand_idx = np.random.permutation(length)[:int(length/100)]
    inputs = Variable(X_train[rand_idx])
    labels = Variable(y_train[rand_idx])
    
    # Clear gradient buffers because we don't want any gradient from previous epoch to carry forward, dont want to cummulate gradients
    optimizer.zero_grad()

    # get output from the model, given the inputs
    outputs = model(inputs)

    # get loss for the predicted output
    loss = criterion(outputs, labels)
    # get gradients w.r.t to parameters
    loss.backward()

    # update parameters
    optimizer.step()
    
    if epoch % 1000 == 0:
        print('epoch {}, loss {}'.format(epoch, loss.item()))

  0%|                                                                              | 14/10000 [00:00<01:14, 134.14it/s]

epoch 0, loss 4.839853286743164


 10%|███████▊                                                                    | 1030/10000 [00:07<01:04, 139.70it/s]

epoch 1000, loss 4.798184871673584


 20%|███████████████▎                                                            | 2019/10000 [00:14<00:57, 138.63it/s]

epoch 2000, loss 4.801120758056641


 30%|██████████████████████▉                                                     | 3026/10000 [00:21<00:49, 140.62it/s]

epoch 3000, loss 4.797853946685791


 40%|██████████████████████████████▌                                             | 4014/10000 [00:28<00:44, 135.99it/s]

epoch 4000, loss 4.799497604370117


 47%|███████████████████████████████████▊                                        | 4714/10000 [00:33<00:37, 139.26it/s]


KeyboardInterrupt: 