In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
train_data = pd.read_csv('./data/processed/train_set.csv')
test_data = pd.read_csv('./data/processed/test_set.csv')
valid_data = pd.read_csv('./data/processed/valid_set.csv')

train_texts = train_data['text_cleaned'].tolist()
train_labels = train_data['sensitive'].tolist()

test_texts = test_data['text_cleaned'].tolist()
test_labels = test_data['sensitive'].tolist()

valid_texts = valid_data['text_cleaned'].tolist()
valid_labels = valid_data['sensitive'].tolist()

In [3]:
from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [4]:
# TEST - reduce train/test/valid data size to 30% of the original size
train_texts = train_texts[:int(len(train_texts)*0.3)]
train_labels = train_labels[:int(len(train_labels)*0.3)]
test_texts = test_texts[:int(len(test_texts)*0.3)]
test_labels = test_labels[:int(len(test_labels)*0.3)]
valid_texts = valid_texts[:int(len(valid_texts)*0.3)]
valid_labels = valid_labels[:int(len(valid_labels)*0.3)]

In [5]:
def tokenize_data(texts, labels, tokenizer, max_length=512):
    res = tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt"), labels
    print("Tokenization done!")
    return res

train_encodings, train_labels = tokenize_data(train_texts, train_labels, tokenizer)
test_encodings, test_labels = tokenize_data(test_texts, test_labels, tokenizer)
valid_encodings, valid_labels = tokenize_data(valid_texts, valid_labels, tokenizer)

In [None]:
from transformers import DistilBertForSequenceClassification

model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', 
    num_labels=2
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import torch
from torch.utils.data import DataLoader, Dataset

class SensitiveDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

train_dataset = SensitiveDataset(train_encodings, train_labels)
valid_dataset = SensitiveDataset(valid_encodings, valid_labels)

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results', 
    num_train_epochs=3,              
    per_device_train_batch_size=16,  
    per_device_eval_batch_size=64,   
    warmup_steps=500,                
    weight_decay=0.01,               
    logging_dir='./logs',           
    logging_steps=10,
    evaluation_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

trainer.train()



[2025-01-06 13:50:24,371] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to mps (auto detect)


W0106 13:50:26.223000 8591658816 torch/distributed/elastic/multiprocessing/redirects.py:28] NOTE: Redirects are currently not supported in Windows or MacOs.


  0%|          | 0/53253 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


KeyboardInterrupt: 