# Data pre-processing & tokenization

CS685 Spring 2022 <br />
Apr. 2, 2022<br />
Hongyu Tu <br />

In [2]:
import torch
import pickle
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForMaskedLM

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [4]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
model = AutoModelForMaskedLM.from_pretrained("bert-base-chinese")

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/624 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/107k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/263k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/393M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
tmp_lst = []

for i in ['danmu', 'comment']:
    with open('{}_token.pkl'.format(i), 'rb') as f:
        tmp = pickle.load(f)
        tmp_lst.append(tmp)
    with open('{}_dist.pkl'.format(i), 'rb') as f:
        tmp = pickle.load(f)
        tmp_lst.append(tmp)
        
danmu_token, danmu_dist, comment_token, comment_dist = tmp_lst

In [7]:
def tok_ten(token_lst, dist_lst):
    lst = [tokenizer(i)['input_ids'][1:-1] for i in token_lst]
    max_length = np.max(np.array([len(i) for i in lst]))
    tmp_lst = []
    for i in range(len(lst)):
        tmp = np.concatenate([lst[i], -np.ones(1 + max_length - len(lst[i]))]) 
        tmp_lst.append(tmp)
    output = torch.from_numpy(np.array(tmp_lst).astype(np.float32)).to(device=device), \
             torch.from_numpy(np.array(dist_lst).astype(np.float32)).to(device=device)
    return output

In [8]:
danmu_x, danmu_y = tok_ten(danmu_token, danmu_dist)
comment_x, comment_y = tok_ten(comment_token, comment_dist)

In [9]:
from torch.autograd import Variable

In [10]:
X_train, X_test, y_train, y_test = train_test_split(danmu_x, danmu_y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [11]:
X_train.shape[0], X_val.shape[0], X_test.shape[0]

(269913, 33739, 33740)

In [12]:
inputDim, outputDim = X_train.shape[1], y_train.shape[1]

In [13]:
from torch import nn

In [14]:
class linearRegression(torch.nn.Module):
    def __init__(self, inputSize, outputSize):
        super(linearRegression, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(inputSize, 200),
            nn.ReLU(),
            nn.Linear(200, 200),
            nn.ReLU(),
            nn.Linear(200, 100),
            nn.ReLU(),
            nn.Linear(100, outputSize),
            nn.Softmax(dim=1)
        )

    def forward(self, x):
        return self.layers(x)

model = linearRegression(inputDim, outputDim)
model.to(device=device)

linearRegression(
  (layers): Sequential(
    (0): Linear(in_features=81, out_features=200, bias=True)
    (1): ReLU()
    (2): Linear(in_features=200, out_features=200, bias=True)
    (3): ReLU()
    (4): Linear(in_features=200, out_features=100, bias=True)
    (5): ReLU()
    (6): Linear(in_features=100, out_features=126, bias=True)
    (7): Softmax(dim=1)
  )
)

In [15]:
learningRate = 1e-5 
epochs = 10000
criterion = loss = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learningRate)

In [16]:
for epoch in tqdm(range(epochs)):
    inputs = Variable(X_train)
    labels = Variable(y_train)
    
    # Clear gradient buffers because we don't want any gradient from previous epoch to carry forward, dont want to cummulate gradients
    optimizer.zero_grad()

    # get output from the model, given the inputs
    outputs = model(inputs)

    # get loss for the predicted output
    loss = criterion(outputs, labels)
    # get gradients w.r.t to parameters
    loss.backward()

    # update parameters
    optimizer.step()
    
    if epoch % 1000 == 0:
        print('epoch {}, loss {}'.format(epoch, loss.item()))

  0%|▏                                                                              | 18/10000 [00:00<06:53, 24.15it/s]

epoch 0, loss 4.844509601593018


 10%|███████▊                                                                     | 1019/10000 [00:36<04:25, 33.86it/s]

epoch 1000, loss 4.836912631988525


 20%|███████████████▌                                                             | 2019/10000 [01:11<03:55, 33.83it/s]

epoch 2000, loss 4.8098368644714355


 30%|███████████████████████▏                                                     | 3019/10000 [01:47<03:29, 33.27it/s]

epoch 3000, loss 4.790591239929199


 40%|██████████████████████████████▉                                              | 4018/10000 [02:23<02:56, 33.96it/s]

epoch 4000, loss 4.785711288452148


 50%|██████████████████████████████████████▋                                      | 5018/10000 [02:59<02:27, 33.70it/s]

epoch 5000, loss 4.783388137817383


 60%|██████████████████████████████████████████████▎                              | 6018/10000 [03:35<01:59, 33.38it/s]

epoch 6000, loss 4.782073020935059


 70%|██████████████████████████████████████████████████████                       | 7018/10000 [04:11<01:28, 33.67it/s]

epoch 7000, loss 4.781394958496094


 80%|█████████████████████████████████████████████████████████████▋               | 8018/10000 [04:47<00:58, 33.60it/s]

epoch 8000, loss 4.781059265136719


 90%|█████████████████████████████████████████████████████████████████████▍       | 9018/10000 [05:23<00:29, 33.63it/s]

epoch 9000, loss 4.7808756828308105


100%|████████████████████████████████████████████████████████████████████████████| 10000/10000 [05:58<00:00, 27.91it/s]
