In [1]:
# Import packages
import time
import pandas as pd
import numpy as np
import seaborn as sb
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from tqdm import tqdm 
from torch.utils.data import Dataset,DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn import manifold
from collections import Counter
import re
from transformers import AutoTokenizer, BertForSequenceClassification, AdamW, AutoModel, get_linear_schedule_with_warmup
from sklearn.utils import shuffle
from sklearn.metrics import classification_report

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
torch.cuda.set_device(1)
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')

In [4]:
torch.cuda.current_device()

1

In [5]:
!mkdir -p models
!mkdir -p cache

# Load Data
Here Ban means either to place user in time out or to permanently hide the user's comments on the channel's current and future live streams. This mixup is due to the fact that these actions are indistinguishable from others with the extracted data from markChatItemsByAuthorAsDeletedAction event.

In [6]:
chats = pd.read_csv('data/chats_2021-05.csv', na_values='', keep_default_na=False)

In [7]:
#ban = pd.read_csv('data/ban_events.csv', usecols=['channelId', 'originVideoId'],na_values='', keep_default_na=False)
delet = pd.read_csv('data/deletion_events.csv',na_values='', keep_default_na=False)

In [8]:
delet = delet[delet['retracted'] == 0]
delet['deleted'] = True #mark the chat that be deleted
chat_df = pd.merge(chats, delet[['id', 'deleted']], how='left')
chat_df['deleted'].fillna(False, inplace=True)

## Filter out non-Japanese

In [9]:
jpkr = re.compile(r'[\u3040-\u309F\u30A0-\u30FF\uAC00-\uD7A3]')
jp = re.compile(r'[\u3040-\u309F\u30A0-\u30FF]')

In [10]:
chat_jpkr = chat_df[chat_df['body'].apply(lambda x: True if jpkr.search(x) else False)]
chat_jp = chat_df[chat_df['body'].apply(lambda x: True if jp.search(x) else False)]

In [11]:
print('original length : ', len(chat_df))
print('korean/japanese length : ', len(chat_jpkr))
print('only japanese length : ', len(chat_jp))

original length :  75726648
korean/japanese length :  45200323
only japanese length :  45049772


In [12]:
print('original deletion: ', len(chat_df[chat_df['deleted'] == True]))
print('japanese/korean deletion: ', len(chat_jpkr[chat_jpkr['deleted'] == True]))
print('only japanese deletion: ', len(chat_jp[chat_jp['deleted'] == True]))

original deletion:  13120
japanese/korean deletion:  2266
only japanese deletion:  2247


## Data Preprocessing

### Label

In [13]:
chat_jp.head()

Unnamed: 0,timestamp,body,membership,isModerator,isVerified,id,channelId,originVideoId,originChannelId,deleted
0,2021-05-01T00:00:00.112000+00:00,そうじゃないｗ,1 year,0,0,66b9d029a3e93df01b2626a77a5230c71abe5890,606b88eef42cc40a9e055d9af6deaf5e76244c02,S8tYbUIoHM0,UCp-5t9SrOQwXMU7iIjQfARg,False
1,2021-05-01T00:00:00.141000+00:00,いーやバナナだね,2 months,0,0,63cc28b37c760c52156ad2ea8d3e4036a29b19d9,ed808b843c98965376208c6e7aeb12ee122aa9f1,TfRFrbFbE2k,UChUJbHiTVeGrSkTdBzVfNCQ,False
3,2021-05-01T00:00:00.172000+00:00,大成功でしょ,non-member,0,0,ff9d8bee4c9608299f7c99a286efd19de1cef0ec,116707993d9886a42c32bc5a1a2b9db2e3524e50,S8tYbUIoHM0,UCp-5t9SrOQwXMU7iIjQfARg,False
4,2021-05-01T00:00:00.216000+00:00,大成功やろ！,non-member,0,0,b7b75f473487a932093e1852eafd2e7741c74e28,d8b410fab159d1c7c6aef425312d3d948c26ee8d,S8tYbUIoHM0,UCp-5t9SrOQwXMU7iIjQfARg,False
7,2021-05-01T00:00:00.387000+00:00,寝てもろてｗ,2 years,0,0,673ce821fb45b6b634b9d513024b361cf2550074,972dd2fed963ba20a7e1e30a09a1314c4730a800,S8tYbUIoHM0,UCp-5t9SrOQwXMU7iIjQfARg,False


In [14]:
# 轉化成 1 跟 0 的 label
chat_jp['label'] = chat_jp['deleted'].apply(lambda x: 1 if x == True else 0)
Counter(chat_jp['label'])

Counter({0: 45047525, 1: 2247})

In [15]:
print(len(chat_jp[chat_jp['label'] == 1]))

2247


### Membership

In [16]:
labelencoder = LabelEncoder()
chat_jp["membership"] = labelencoder.fit_transform(chat_jp["membership"])
# chat_jp.head(5)

# Experiment

## 0. Sample Data
- Hyper parameter: sample_rate (normal / deleted)

In [17]:
del_length = len(chat_jp[chat_jp['deleted']==True])
length = len(chat_jp[chat_jp['deleted']==False])
sample_rate = 2

deleted_sample = chat_jp[chat_jp['deleted']==True]
print('Deleted Sample : ', len(deleted_sample))
normal_sample = chat_jp[chat_jp['deleted']==False].sample((sample_rate*len(deleted_sample)))
print('Normal Sample : ', len(normal_sample))
                                                          
sample_chats = normal_sample.append(deleted_sample, ignore_index=True)
sample_chats = shuffle(sample_chats).reset_index(drop=True)                                        

Deleted Sample :  2247
Normal Sample :  4494


In [18]:
split_rate = 0.8
print("Num of all data: {}".format(len(sample_chats)))
train_data = sample_chats[:int(len(sample_chats)*split_rate)].reset_index(drop=True)
valid_data = sample_chats[int(len(sample_chats)*split_rate):].reset_index(drop=True)
print("Num of training data: {}".format(len(train_data)))
deleted_train = len(train_data[train_data['deleted']==True])
normal_train = len(train_data[train_data['deleted']==False])
print("Deleted/Normal: {}/{}".format(deleted_train, normal_train))
print("Num of validation data: {}".format(len(valid_data)))
deleted_valid = len(valid_data[valid_data['deleted']==True])
normal_valid = len(valid_data[valid_data['deleted']==False])
print("Deleted/Normal: {}/{}".format(deleted_valid, normal_valid))

Num of all data: 6741
Num of training data: 5392
Deleted/Normal: 1790/3602
Num of validation data: 1349
Deleted/Normal: 457/892


## 1. bert-base-japanese-whole-word-masking

In [19]:
class Vtuber_Dataset(Dataset):
    def __init__(self, df, tokenizer):
        super(Vtuber_Dataset,self).__init__()
        self.df = df
        self.tokenizer = tokenizer
        
    def __getitem__(self,idx):
        body = self.df.loc[idx,"body"][:512]
        target = self.df.loc[idx,"label"]

            
        res = tokenizer(body, return_tensors="pt",padding = "max_length", max_length = 512)
        input_ids = res["input_ids"].squeeze(0)
        att_mask  = res["attention_mask"].squeeze(0)
        try:
            assert input_ids.shape[0] == 512
        except:
            print("error found")
            body = self.df.loc[idx+1,"body"]
            target = torch.tensor(self.df.loc[idx+1,"label"])


            res = tokenizer(body, return_tensors="pt",padding = "max_length")
            input_ids = res["input_ids"].squeeze(0)
            att_mask  = res["attention_mask"].squeeze(0)
            
        return input_ids,att_mask,target
    
    def __len__(self):
        return len(self.df)
        

In [20]:
class vtu_model(nn.Module):
    def __init__(self):
        super().__init__()
        self.backbone = BertForSequenceClassification.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')
        
    def forward(self,**inputs):
        pool_feature = self.backbone(**inputs)
        return pool_feature

In [21]:
def acc_test(preds, targets, device):
    preds = torch.FloatTensor(preds)
    targets = torch.FloatTensor(targets)
    preds = preds.to(device, dtype = torch.int64)
    targets = targets.to(device)
    
    correct_results_sum = 0
    correct_results_sum = (preds == targets).sum().float()
    
    return (correct_results_sum / len(targets))*100, preds

In [22]:
tokenizer = AutoTokenizer.from_pretrained('cl-tohoku/bert-base-japanese')
bert = AutoModel.from_pretrained('cl-tohoku/bert-base-japanese')
# model =  BertForSequenceClassification.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')

In [23]:
class BERT_Arch(nn.Module):

    def __init__(self, bert):
      
        super(BERT_Arch, self).__init__()

        self.bert = bert 
      
        # dropout layer
        self.dropout = nn.Dropout(0.1)
      
        # relu activation function
        self.relu =  nn.ReLU()
 
        # dense layer 1
        self.fc1 = nn.Linear(768,512)
      
        # dense layer 2 (Output layer)
        self.fc2 = nn.Linear(512,2)

        #softmax activation function
        self.softmax = nn.Softmax(dim=1)

    #define the forward pass
    def forward(self, input_ids, attention_mask):

        #pass the inputs to the model  
        outputs = self.bert(input_ids, attention_mask = attention_mask)
        
        cls_hs = outputs.last_hidden_state[:, -1, :]
      
        x = self.fc1(cls_hs)

        x = self.relu(x)

        x = self.dropout(x)

        # output layer
        x = self.fc2(x)
      
        # apply softmax activation
        x = self.softmax(x)

        return x

In [24]:
train_dataset = Vtuber_Dataset(train_data, tokenizer)
valid_dataset = Vtuber_Dataset(valid_data, tokenizer)
trainloader = DataLoader(train_dataset, batch_size = 1, shuffle = True)
validloader = DataLoader(valid_dataset, batch_size = 1, shuffle = True)

In [25]:
model = BERT_Arch(bert)
model = model.to(device)

no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": 1e-2,
    },
    {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters,lr=2e-5,eps=1e-8)
criterion = torch.nn.CrossEntropyLoss()

gradient_accumulation_steps = 32
total_steps = len(trainloader)// gradient_accumulation_steps * 5
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)


In [26]:
def train(model, trainloader, device, optimizer, scheduler, gradient_accumulation_steps):
    model.train()
    loss_total = 0
    acc_total = 0
    logit_list = []
    label_list = []

    for step, batch in enumerate(trainloader):
        input_ids, attention_mask, label = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        label = label.to(device)
        model = model.to(device)
        outputs = model(input_ids, attention_mask)
        logit = outputs
        
        
#         print(logit)

        loss = criterion(logit, label)

        loss_total += loss.item()
        if gradient_accumulation_steps > 1:
            loss = loss / gradient_accumulation_steps

        loss.backward()

        # torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        if (step + 1) % gradient_accumulation_steps == 0:
            # Clip the norm of the gradients to 1.0.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step() # update all parameters
            scheduler.step() # Update learning rate schedule
            optimizer.zero_grad() # initialize the gradient so that it wont repeat accumulate itself(update the params)
            model.zero_grad()
            # print("in step:%s,loss:%s"%(str(step),str(loss)),end = "\r")
            
        _, logit = torch.max(logit, dim=1)
        
        logit = logit.tolist()
        label = label.tolist()
        logit_list += logit
        label_list += label

    acc,pred_logit = acc_test(logit_list, label_list, device)
        
    return acc, loss_total/len(trainloader)

In [27]:
def valid(model, validloader, device, scheduler):
    loss_total = 0
    acc_total = 0
    logit_list = []
    label_list = []
    model.eval()
    with torch.no_grad():
        for step, batch in enumerate(validloader):
            input_ids, attention_mask, label= batch
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            label = label.to(device)
            model = model.to(device)
            outputs = model(input_ids, attention_mask)
#             loss = outputs.loss
            logit = outputs
    
            loss = criterion(logit, label)
            loss_total += loss
            
            _, logit = torch.max(logit, dim=1)

            logit = logit.tolist()
            label = label.tolist()
            logit_list += logit
            label_list += label
    
    acc_valid, pred_logit = acc_test(logit_list, label_list, device)
    return acc_valid, loss_total/len(validloader), label_list, logit_list

In [28]:
loss = 1000000
best_loss = 0
best_label_list = []
best_logit_list = []
for epoch in range(10):
    train_acc, train_loss= train(model, trainloader, device, optimizer, scheduler, gradient_accumulation_steps)
    valid_acc, valid_loss, label_list, logit_list = valid(model, validloader, device, scheduler)
    
    if valid_loss < loss:
        best_loss = valid_loss
        best_label_list = label_list
        best_logit_list = logit_list
    
    print(f'Epoch:{epoch+1:2d}||', 
            f'|| train loss:{train_loss:.4f}',
            f'|| valid loss:{valid_loss:.4f}',
            f'|| train acc:{train_acc:.4f}'
            f'|| valid acc:{valid_acc:.4f}',
            )

Epoch: 1|| || train loss:0.5347 || valid loss:0.5009 || train acc:76.7619|| valid acc:80.3558
Epoch: 2|| || train loss:0.4724 || valid loss:0.4898 || train acc:83.5497|| valid acc:81.6901
Epoch: 3|| || train loss:0.4399 || valid loss:0.4838 || train acc:86.9622|| valid acc:82.2832
Epoch: 4|| || train loss:0.4162 || valid loss:0.4825 || train acc:89.5401|| valid acc:82.2832
Epoch: 5|| || train loss:0.4001 || valid loss:0.4838 || train acc:91.2834|| valid acc:82.4314
Epoch: 6|| || train loss:0.3974 || valid loss:0.4838 || train acc:91.5059|| valid acc:82.4314
Epoch: 7|| || train loss:0.3981 || valid loss:0.4838 || train acc:91.5245|| valid acc:82.4314
Epoch: 8|| || train loss:0.3976 || valid loss:0.4838 || train acc:91.4874|| valid acc:82.4314
Epoch: 9|| || train loss:0.3956 || valid loss:0.4838 || train acc:91.8027|| valid acc:82.4314
Epoch:10|| || train loss:0.3974 || valid loss:0.4838 || train acc:91.3947|| valid acc:82.4314
Epoch:11|| || train loss:0.3989 || valid loss:0.4838 || trai

In [29]:
print(classification_report(best_label_list, best_logit_list))

              precision    recall  f1-score   support

           0       0.82      0.94      0.88       892
           1       0.83      0.60      0.70       457

    accuracy                           0.82      1349
   macro avg       0.83      0.77      0.79      1349
weighted avg       0.83      0.82      0.82      1349

