In [1]:
%cd ../../

d:\online_predatory_conversation_detection


In [17]:
import torch
import pandas as pd

from transformers import AutoTokenizer, BertTokenizer, \
    DataCollatorForLanguageModeling, AutoModelForMaskedLM, \
    TrainingArguments, Trainer

from torch.utils.data import Dataset
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

In [26]:
model_name = "distilroberta-base"

class MyDataset(Dataset):

    def __init__(self, file_path, tokenizer) -> None:
        super().__init__()
        self.file_path = file_path
        self.tokenizer = tokenizer
        
        df = pd.read_csv(self.file_path)
        df["text"].fillna("", inplace=True)
        self.input_ids, self.attention_mask = [None] * df.shape[0], [None] * df.shape[0]
        for i, txt in enumerate(tqdm(df["text"].to_list())):
            tmp = self.tokenizer(txt, truncation=True, padding="max_length", return_tensors='pt')
            self.input_ids[i] = tmp["input_ids"]
            self.attention_mask[i] = tmp["attention_mask"]
        self.input_ids = torch.cat(self.input_ids, dim=0)
        self.attention_mask = torch.cat(self.attention_mask, dim=0)
        del df

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {"input_ids": self.input_ids[idx], "attention_mask": self.attention_mask[idx]}


In [27]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
dataset_path = "data/dataset-v2/train.csv"
dataset = MyDataset(dataset_path, tokenizer)
# df = pd.read_csv(dataset_path)
# df["text"].fillna("", inplace=True)

# df.head()

100%|██████████| 903607/903607 [07:19<00:00, 2055.38it/s]


In [28]:
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [29]:
model = AutoModelForMaskedLM.from_pretrained(model_name)

In [32]:
training_args = TrainingArguments(
    output_dir="output-pretraining",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
    per_device_train_batch_size=16
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    # eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
)

In [33]:
trainer.train()

  0%|          | 0/338853 [55:46<?, ?it/s]
                                                       
  0%|          | 500/169428 [05:34<31:29:38,  1.49it/s] 

{'loss': 0.06, 'learning_rate': 1.9940977878508866e-05, 'epoch': 0.01}


                                                        
  1%|          | 1000/169428 [11:08<30:50:47,  1.52it/s]

{'loss': 0.0598, 'learning_rate': 1.9881955757017733e-05, 'epoch': 0.02}


                                                        
  1%|          | 1500/169428 [16:36<29:09:09,  1.60it/s]

{'loss': 0.06, 'learning_rate': 1.9822933635526598e-05, 'epoch': 0.03}


                                                         
  1%|          | 2000/169428 [21:53<29:13:21,  1.59it/s]

{'loss': 0.0588, 'learning_rate': 1.9763911514035462e-05, 'epoch': 0.04}


                                                         
  1%|▏         | 2500/169428 [27:22<30:31:13,  1.52it/s]

{'loss': 0.0593, 'learning_rate': 1.9704889392544326e-05, 'epoch': 0.04}


                                                        
  2%|▏         | 3000/169428 [32:45<28:47:20,  1.61it/s]  

{'loss': 0.0582, 'learning_rate': 1.964586727105319e-05, 'epoch': 0.05}


                                                        
  2%|▏         | 3500/169428 [38:04<28:55:08,  1.59it/s]  

{'loss': 0.0646, 'learning_rate': 1.9586845149562058e-05, 'epoch': 0.06}


                                                        
  2%|▏         | 4000/169428 [43:20<28:19:54,  1.62it/s]  

{'loss': 0.0629, 'learning_rate': 1.9527823028070922e-05, 'epoch': 0.07}


                                                        
  3%|▎         | 4500/169428 [48:35<28:33:57,  1.60it/s]  

{'loss': 0.0636, 'learning_rate': 1.946880090657979e-05, 'epoch': 0.08}


                                                        
  3%|▎         | 5000/169428 [53:50<29:15:13,  1.56it/s]  

{'loss': 0.0614, 'learning_rate': 1.9409778785088654e-05, 'epoch': 0.09}


  3%|▎         | 5061/169428 [54:34<30:33:09,  1.49it/s]