# Data pre-processing & tokenization

CS685 Spring 2022 <br />
Apr. 2, 2022<br />
Hongyu Tu <br />

In [15]:
import torch
import pickle
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForMaskedLM

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [3]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
model = AutoModelForMaskedLM.from_pretrained("bert-base-chinese")

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
tmp_lst = []

for i in ['danmu', 'comment']:
    with open('{}_token.pkl'.format(i), 'rb') as f:
        tmp = pickle.load(f)
        tmp_lst.append(tmp)
    with open('{}_dist.pkl'.format(i), 'rb') as f:
        tmp = pickle.load(f)
        tmp_lst.append(tmp)
        
danmu_token, danmu_dist, comment_token, comment_dist = tmp_lst

In [5]:
def tok_ten(token_lst, dist_lst):
    lst = [tokenizer(i)['input_ids'][1:-1] for i in token_lst]
    max_length = np.max(np.array([len(i) for i in lst]))
    tmp_lst = []
    for i in range(len(lst)):
        tmp = np.concatenate([lst[i], -np.ones(1 + max_length - len(lst[i]))]) 
        tmp_lst.append(tmp)
    output = torch.from_numpy(np.array(tmp_lst).astype(np.float32)).to(device=device), \
             torch.from_numpy(np.array(dist_lst).astype(np.float32)).to(device=device)
    return output

In [6]:
danmu_x, danmu_y = tok_ten(danmu_token, danmu_dist)
comment_x, comment_y = tok_ten(comment_token, comment_dist)

In [7]:
from torch.autograd import Variable

In [8]:
X_train, X_test, y_train, y_test = train_test_split(danmu_x, danmu_y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [9]:
X_train.shape[0], X_val.shape[0], X_test.shape[0]

(269913, 33739, 33740)

In [10]:
inputDim, outputDim = X_train.shape[1], y_train.shape[1]

In [11]:
from torch import nn

In [12]:
class linearRegression(torch.nn.Module):
    def __init__(self, inputSize, outputSize):
        super(linearRegression, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(inputSize, 200),
            nn.ReLU(),
            nn.Linear(200, 200),
            nn.ReLU(),
            nn.Linear(200, 100),
            nn.ReLU(),
            nn.Linear(100, outputSize),
            nn.Softmax(dim=1)
        )

    def forward(self, x):
        return self.layers(x)

model = linearRegression(inputDim, outputDim)
model.to(device=device)

linearRegression(
  (layers): Sequential(
    (0): Linear(in_features=81, out_features=200, bias=True)
    (1): ReLU()
    (2): Linear(in_features=200, out_features=200, bias=True)
    (3): ReLU()
    (4): Linear(in_features=200, out_features=100, bias=True)
    (5): ReLU()
    (6): Linear(in_features=100, out_features=126, bias=True)
    (7): Softmax(dim=1)
  )
)

In [16]:
learningRate = 1e-5 
epochs = 1000
criterion = loss = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learningRate)

In [18]:
for epoch in tqdm(range(epochs)):
    inputs = Variable(X_train)
    labels = Variable(y_train)
    
    # Clear gradient buffers because we don't want any gradient from previous epoch to carry forward, dont want to cummulate gradients
    optimizer.zero_grad()

    # get output from the model, given the inputs
    outputs = model(inputs)

    # get loss for the predicted output
    loss = criterion(outputs, labels)
    # get gradients w.r.t to parameters
    loss.backward()

    # update parameters
    optimizer.step()
    
    if epoch % 100 == 0:
        print('epoch {}, loss {}'.format(epoch, loss.item()))

  0%|                                        | 1/1000 [00:05<1:31:51,  5.52s/it]

epoch 0, loss 4.84377908706665


  0%|▏                                       | 4/1000 [00:23<1:39:02,  5.97s/it]


KeyboardInterrupt: 