# Data pre-processing & tokenization

CS685 Spring 2022 <br />
Apr. 2, 2022<br />
Hongyu Tu <br />

In [1]:
import torch
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForMaskedLM

In [2]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [3]:
# Tokenizer and masked-LM weights are both loaded from the same checkpoint id.
# NOTE: `model` is rebound to the linear regressor in a later cell of this notebook.
BERT_CHECKPOINT = "bert-base-chinese"
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
model = AutoModelForMaskedLM.from_pretrained(BERT_CHECKPOINT)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# Load the four pickled artifacts in the order: danmu token, danmu dist,
# comment token, comment dist.
# NOTE(review): pickle.load can execute arbitrary code; this assumes the
# .pkl files were produced locally by a trusted step — confirm.
tmp_lst = []

for source in ('danmu', 'comment'):
    for template in ('{}_token.pkl', '{}_dist.pkl'):
        with open(template.format(source), 'rb') as handle:
            tmp_lst.append(pickle.load(handle))

danmu_token, danmu_dist, comment_token, comment_dist = tmp_lst

In [27]:
def tok_ten(token_lst, dist_lst):
    """Tokenize inputs and return padded id / target tensors on ``device``.

    Every entry of ``token_lst`` is passed to the module-level ``tokenizer``
    and the first and last ids of its encoding are dropped
    (NOTE(review): presumably the [CLS]/[SEP] markers — confirm).
    Rows are right-padded with -1 to a common width of ``longest + 1``, so
    each row ends with at least one pad value.

    Args:
        token_lst: iterable of inputs accepted by ``tokenizer``.
        dist_lst: array-like that converts to a float32 NumPy array.

    Returns:
        Tuple ``(ids, dist)`` of float32 tensors moved to ``device``.

    Raises:
        ValueError: if ``token_lst`` is empty (``np.max`` of an empty array).
    """
    encoded = [tokenizer(text)['input_ids'][1:-1] for text in token_lst]
    width = 1 + np.max(np.array([len(ids) for ids in encoded]))

    # Start from an all -1 matrix and overwrite the leading cells of each row.
    padded = np.full((len(encoded), width), -1.0)
    for row, ids in enumerate(encoded):
        padded[row, :len(ids)] = ids

    ids_tensor = torch.from_numpy(padded.astype(np.float32)).to(device=device)
    dist_tensor = torch.from_numpy(np.array(dist_lst).astype(np.float32)).to(device=device)
    return ids_tensor, dist_tensor

In [28]:
# Build the (token ids, target) tensor pairs for each of the two corpora.
danmu_x, danmu_y = tok_ten(token_lst=danmu_token, dist_lst=danmu_dist)
comment_x, comment_y = tok_ten(token_lst=comment_token, dist_lst=comment_dist)

In [29]:
from torch.autograd import Variable

In [30]:
# 80% train; the remaining 20% is split in half into validation and test.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    danmu_x, danmu_y, test_size=0.2, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42
)

In [31]:
# Number of rows in the train / validation / test splits.
tuple(split.shape[0] for split in (X_train, X_val, X_test))

(269913, 33739, 33740)

In [33]:
# Layer sizes are taken from the second dimension of the features and targets.
inputDim = X_train.shape[1]
outputDim = y_train.shape[1]

In [41]:
class linearRegression(torch.nn.Module):
    """A single affine layer mapping ``inputSize`` features to ``outputSize`` outputs."""

    def __init__(self, inputSize, outputSize):
        """Create the underlying ``torch.nn.Linear`` layer (with bias)."""
        super().__init__()
        self.linear = torch.nn.Linear(in_features=inputSize, out_features=outputSize)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        return self.linear(x)

# NOTE: this rebinds `model`, which an earlier cell bound to the pretrained masked-LM.
# Module.to() returns the module itself, so construction and device move can be chained.
model = linearRegression(inputDim, outputDim).to(device=device)
model

linearRegression(
  (linear): Linear(in_features=81, out_features=126, bias=True)
)

In [42]:
# Optimisation settings for the training loop.
learningRate = 0.01
epochs = 10000

# `loss` starts out as an alias of the criterion object; the training loop
# later rebinds it to the per-epoch loss tensor.
criterion = torch.nn.CrossEntropyLoss()
loss = criterion
optimizer = torch.optim.SGD(params=model.parameters(), lr=learningRate)

In [43]:
# Full-batch gradient descent: each epoch is one forward/backward pass over
# the whole training split followed by a single SGD step.
#
# Tensors take part in autograd directly, so the deprecated
# `torch.autograd.Variable` wrapper is not used; the (loop-invariant) inputs
# and labels are bound once instead of on every epoch.
#
# NOTE(review): the recorded losses for this cell do not decrease
# (about 1.3e3 at epoch 0, about 1e5 afterwards). The features are unscaled
# token ids padded with -1, which may make this learning rate unstable —
# TODO confirm and consider scaling the inputs or lowering the step size.
inputs, labels = X_train, y_train

for epoch in range(epochs):
    # Clear gradient buffers so gradients from the previous epoch do not
    # accumulate into this one.
    optimizer.zero_grad()

    # Forward pass and loss for the predicted output.
    outputs = model(inputs)
    loss = criterion(outputs, labels)

    # Gradients w.r.t. the parameters, then the parameter update.
    loss.backward()
    optimizer.step()

    # Periodic progress report: the raw loss tensor, then a formatted line.
    if epoch % 1000 == 0:
        print(loss)
        print('epoch {}, loss {}'.format(epoch, loss.item()))

tensor(1259.1462, device='cuda:0', grad_fn=<DivBackward1>)
epoch 0, loss 1259.146240234375
tensor(98835.6719, device='cuda:0', grad_fn=<DivBackward1>)
epoch 1000, loss 98835.671875
tensor(92721.9141, device='cuda:0', grad_fn=<DivBackward1>)
epoch 2000, loss 92721.9140625
tensor(85240.5781, device='cuda:0', grad_fn=<DivBackward1>)
epoch 3000, loss 85240.578125
tensor(82387.7500, device='cuda:0', grad_fn=<DivBackward1>)
epoch 4000, loss 82387.75
tensor(115718.8438, device='cuda:0', grad_fn=<DivBackward1>)
epoch 5000, loss 115718.84375
tensor(92544.4609, device='cuda:0', grad_fn=<DivBackward1>)
epoch 6000, loss 92544.4609375
tensor(106043.9297, device='cuda:0', grad_fn=<DivBackward1>)
epoch 7000, loss 106043.9296875
tensor(85951.0078, device='cuda:0', grad_fn=<DivBackward1>)
epoch 8000, loss 85951.0078125
tensor(104069.9141, device='cuda:0', grad_fn=<DivBackward1>)
epoch 9000, loss 104069.9140625
