<a href="https://colab.research.google.com/github/dynle/youtube-hate-speech-classification/blob/master/youtube_hate_speech_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **YouTube Hate Speech Classification with BERT**

## Get dataset file from Kaggle and edit the structure of columns
https://www.kaggle.com/surekharamireddy/malignant-comment-classification


In [None]:
import pandas as pd
# Load the comment dataset. Later cells read the 'comment_text' and
# 'isMalignant' columns from this frame.
df = pd.read_csv('./dataset.csv')
# Preview the first rows.
df.head()

In [None]:
# `info` is a method: it has to be called to print the column dtypes and
# non-null counts (a bare `df.info` only shows the bound-method repr).
df.info()

In the data, 1 denotes a malignant comment and 0 denotes a normal comment.

## Replace `\n` characters in each comment with spaces

In [3]:
# Each comment is later written as a single "label<TAB>text" line and read back
# line by line, so an embedded newline, carriage return or tab would split or
# truncate the record. Replace each of those characters with a space.
df['comment_text'] = df['comment_text'].str.replace(r'[\r\n\t]', ' ', regex=True)

## Group the data by the 0/1 value in the 'isMalignant' column

In [None]:
# Partition the rows by the value of the isMalignant column.
grouped = df.groupby(df['isMalignant'])

# group_0 holds the rows labelled 0, group_1 the rows labelled 1.
group_0, group_1 = (grouped.get_group(label) for label in (0, 1))
group_0.values

## Take 600 samples (300 per class) from the dataset and save them to a text file

In [5]:
# Write the first 300 rows of each group to dataset.txt, one record per line:
# the row's first field, a tab, then its second field.
# NOTE(review): assumes column 0 is the label and column 1 the comment text,
# which is how the file is parsed further down - confirm against dataset.csv.
with open('dataset.txt','w') as out_file:
  out_file.writelines(
      str(row[0])+'\t'+row[1]+'\n'
      for group in (group_0, group_1)
      for row in group.values[:300]
  )

## Shuffle the dataset and split into train and test dataset

In [None]:
!shuf dataset.txt -o shuffled.txt
!head -400 shuffled.txt > train.txt
!tail -200 shuffled.txt > test.txt

open('train.txt').readlines()

## BERTをPythonで使うライブラリをインストールします．

In [None]:
!pip install transformers==4.5.0 fugashi==1.1.0 ipadic==1.0.0 pytorch-lightning==1.2.10

## 学習データを読み込みます．

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification

# Pretrained checkpoint to fine-tune. The Japanese checkpoint is kept for
# reference; the uncased English BERT checkpoint is the one in use.
# MODEL_NAME = 'cl-tohoku/bert-base-japanese-whole-word-masking'
MODEL_NAME = 'bert-base-uncased'


def _read_labeled_tsv(path):
  """Read a "label<TAB>text" file and return ``(texts, labels)``.

  Each file is opened once and closed when done. Only the first tab separates
  the label from the text, so a tab inside the text no longer truncates it.

  Raises:
    ValueError: if a line has no tab-separated text or a non-integer label.
  """
  texts, labels = [], []
  with open(path) as tsv_file:
    for line_no, raw in enumerate(tsv_file, start=1):
      fields = raw.rstrip().split('\t', 1)
      if len(fields) != 2:
        raise ValueError(f"{path}:{line_no}: expected 'label<TAB>text', got {raw!r}")
      labels.append(int(fields[0]))
      texts.append(fields[1])
  return texts, labels


# Load the training data
train_lines, train_labels = _read_labeled_tsv("train.txt")

# Load the test data
test_lines, test_labels = _read_labeled_tsv("test.txt")

# Load the tokenizer that matches the chosen checkpoint
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

def create_dataset_for_loader(lines, labels):
  """Tokenize each text and pair it with its label.

  Args:
    lines: sequence of texts to encode.
    labels: sequence of labels; ``labels[i]`` belongs to ``lines[i]``.

  Returns:
    A list with one dict per text. Each dict holds the tokenizer's outputs
    (padded/truncated to 128 tokens) plus a ``'labels'`` entry, with every
    value converted to a ``torch.Tensor``.
  """
  samples = []
  for idx, text in enumerate(lines):
    features = tokenizer(text, max_length=128, padding='max_length', truncation=True)
    features['labels'] = labels[idx]
    samples.append({name: torch.tensor(value) for name, value in features.items()})
  return samples

# Encode both splits up front.
dataset_for_loader_train = create_dataset_for_loader(train_lines, train_labels)
dataset_for_loader_test = create_dataset_for_loader(test_lines, test_labels)

# The first 50 encoded training examples are held out for validation; the
# remainder is used for training. The test split is used as-is.
dataset_val, dataset_train = dataset_for_loader_train[:50], dataset_for_loader_train[50:]
dataset_test = dataset_for_loader_test

# Only the training loader shuffles. The test loader yields one example per
# batch, so results can be matched to test lines by position.
dataloader_train = DataLoader(dataset_train, batch_size=16, shuffle=True)
dataloader_val = DataLoader(dataset_val, batch_size=16)
dataloader_test = DataLoader(dataset_test, batch_size=1)

## 以下のコードにより，モデルの定義をします．

In [None]:
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader

from transformers import BertJapaneseTokenizer, BertForSequenceClassification
import pytorch_lightning as pl

class BertForSequenceClassification_pl(pl.LightningModule):
    """Lightning wrapper around ``BertForSequenceClassification``.

    Training and validation log the model's loss. Testing logs per-batch
    accuracy and records one entry per example in ``self.test_results``.
    """

    def __init__(self, model_name, num_labels, lr):
        """
        Args:
            model_name: name or path passed to ``from_pretrained``.
            num_labels: number of output classes.
            lr: learning rate for the Adam optimizer.
        """
        super().__init__()
        # Makes the constructor arguments available as self.hparams
        # (configure_optimizers reads self.hparams.lr).
        self.save_hyperparameters()
        self.bert_sc = BertForSequenceClassification.from_pretrained(
            model_name, num_labels=num_labels)
        # One {"hyp", "ref", "prob"} dict per tested example.
        self.test_results = []

    def training_step(self, batch, batch_idx):
        """Run a forward pass on ``batch`` (which includes 'labels') and return the loss."""
        output = self.bert_sc(**batch)
        loss = output.loss
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log the loss on a validation batch as 'val_loss'."""
        output = self.bert_sc(**batch)
        val_loss = output.loss
        self.log('val_loss', val_loss)

    def reset_test_results(self):
        """Discard the results collected by earlier test runs."""
        self.test_results = []

    def test_step(self, batch, batch_idx):
        """Predict labels for a test batch, record them, and log the batch accuracy.

        For every example in the batch a dict is appended to
        ``self.test_results`` with the predicted label ("hyp"), the reference
        label ("ref") and the softmax probability of the predicted label
        ("prob"), all as NumPy scalars.
        """
        # Remove the labels so the model returns logits without computing a loss.
        labels = batch.pop('labels')
        output = self.bert_sc(**batch)
        probs = torch.nn.functional.softmax(output.logits, dim=-1)
        labels_predicted = output.logits.argmax(-1)
        num_correct = (labels_predicted == labels).sum().item()
        accuracy = num_correct / labels.size(0)

        # Record every example of the batch, not only the first one, so that
        # results stay complete when the test loader uses batch_size > 1.
        hyps = labels_predicted.cpu().numpy()
        refs = labels.cpu().numpy()
        probs_np = probs.cpu().numpy()
        for hyp, ref, prob_row in zip(hyps, refs, probs_np):
            self.test_results.append({"hyp": hyp, "ref": ref, "prob": prob_row[hyp]})
        self.log('accuracy', accuracy)

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with the configured learning rate."""
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)

# Checkpointing: keep only the weights of the single best model, judged by the
# lowest validation loss, under model/.
checkpoint = pl.callbacks.ModelCheckpoint(
    monitor='val_loss',
    mode='min',
    save_top_k=1,
    save_weights_only=True,
    dirpath='model/',
)

# Early stopping: stop as soon as the validation loss fails to improve for one
# validation run.
early_stopping = pl.callbacks.EarlyStopping(
    min_delta=0.00,
    patience=1,
    verbose=True,
    monitor='val_loss',
    mode='min',
)

# Training configuration. Use one GPU when available and fall back to the CPU
# otherwise, so the notebook still runs on a runtime without CUDA.
trainer = pl.Trainer(
    gpus=1 if torch.cuda.is_available() else 0,
    max_epochs=10,
    callbacks=[checkpoint, early_stopping]
)

# Model to fine-tune: binary classifier on top of the pretrained checkpoint.
model = BertForSequenceClassification_pl(MODEL_NAME, num_labels=2, lr=1e-5)

## 学習の実行 (Takes some time)

In [10]:
# Fine-tune on the training loader, validating on the validation loader.
trainer.fit(model, dataloader_train, dataloader_val)

# Path of the checkpoint that the checkpoint callback rated best.
best_model_path = checkpoint.best_model_path
print('ベストモデルのファイル: ', best_model_path)
print('ベストモデルの検証データに対する損失: ', checkpoint.best_model_score)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type                          | Params
----------------------------------------------------------
0 | bert_sc | BertForSequenceClassification | 109 M 
----------------------------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
437.935   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

ベストモデルのファイル:  /content/model/epoch=3-step=87.ckpt
ベストモデルの検証データに対する損失:  tensor(0.2587, device='cuda:0')


## 分類の実行と結果の表示．連番，正解ラベル，予測ラベル，確率，本文の順番で結果が出ます．

In [None]:
# Start from an empty result list, then run the test loop on the test loader.
model.reset_test_results()
test = trainer.test(test_dataloaders=dataloader_test)

# One row per test example: running number, reference label, predicted label,
# probability of the predicted label, and the text.
for idx, line in enumerate(test_lines):
  gold = test_labels[idx]
  record = model.test_results[idx]
  print(f"{idx+1}\t{gold}\t{record['hyp'].item()}\t{record['prob'].item()}\t{line}")

print(f'Accuracy: {test[0]["accuracy"]:.3f}')

### Classify YouTube comments extracted via the YouTube API as hate speech or not

In [None]:
!nkf -g tweets.txt

In [13]:
# Read one comment per line. The `with` block closes the file, and blank lines
# are skipped so that empty strings are not classified as comments.
with open("comments.txt") as comments_file:
  comment_lines = [x.strip() for x in comments_file if x.strip()]

# The comments have no reference labels. The test loop needs a 'labels' entry,
# so every comment gets the placeholder label 1. The "accuracy" the trainer
# prints is therefore just the share of comments predicted as class 1.
comment_labels = [1] * len(comment_lines)

dataset_for_loader_tweet = create_dataset_for_loader(comment_lines, comment_labels)
dataloader_comment = DataLoader(dataset_for_loader_tweet, batch_size=1)

model.reset_test_results()
test = trainer.test(test_dataloaders=dataloader_comment)

# One row per comment: running number, predicted label, probability of the
# predicted label, and the comment text.
for i, line in enumerate(comment_lines):
  d = model.test_results[i]
  hyp = d['hyp'].item()
  prob = d['prob'].item()
  # To list only confident class-1 predictions, guard the print with:
  #   if hyp == 1 and prob > 0.9:
  print(f"{i+1}\t{hyp}\t{prob}\t{line}")

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'accuracy': 0.523809552192688}
--------------------------------------------------------------------------------
1	1	0.5596621632575989	0
2	0	0.78451007604599	"Like I said in the video, subscribe if you haven’t already and you could win $10,000!"
3	1	0.9686250686645508	How the hell much did you spend!?
4	0	0.6944490075111389	"This game is so much fun, Jimmy never stops making popular videos"
5	0	0.7304744720458984	who is 001
6	1	0.9269351363182068	Awwww its flamingo in there
7	0	0.5024513006210327	I have Bral stars
8	0	0.8498601317405701	A
9	1	0.6927601099014282	bro flamingo is here
10	0	0.91734379529953	"Can I have $1,000 4 my family"
11	0	0.5764891505241394	Done
12	1	0.9464684128761292	damnn
13	1	0.9385344982147217	"You,re crazy"
14	1	0.5770506858825684	I always die from glass or something when I’m a player
15	0	0.8838456273078918	I subcribe to you
16	1	0.591166615486145	We all 