In [56]:
from transformers import DistilBertModel, DistilBertTokenizer, Trainer, TrainingArguments
import torch
import torch.nn as nn

from sklearn.model_selection import train_test_split
import pandas as pd
import pytorch_lightning as pl
import orjson

from datasets import Dataset, DatasetDict
from datasets import load_metric
from torchmetrics import functional as F

# Build Model

In [2]:
class DistilFakeBert(nn.Module):
    def __init__(self):
        super(DistilFakeBert, self).__init__()
        
        self.distil_bert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.dropout = nn.Dropout(0.25)
        self.fc1 = nn.Linear(768, 256)
        self.relu = nn.ReLU()
        self.output = nn.Linear(256, 2)
        
    def forward(self, input_ids, attention_mask):
        embeddings = self.distil_bert(
            input_ids, attention_mask=attention_mask
        )[0][:, 0]
        embeddings = self.fc1(embeddings)
        embeddings = self.relu(embeddings)
        embeddings = self.dropout(embeddings)
        output = self.output(embeddings)
        
        return output

In [3]:
model = DistilFakeBert()
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Load data and transform

In [4]:
df = pd.read_csv('data_text/news.csv')

In [9]:
text = df['text'].tolist()
labels = df['label'].tolist()

In [13]:
train_text, valid_text, train_label, valid_label = train_test_split(text, labels, stratify=labels, random_state=2021, test_size=0.3)
valid_text, test_text, valid_label, test_label = train_test_split(valid_text, valid_label, stratify=valid_label, random_state=2021, test_size=0.5)

In [40]:
data = {
    'train': Dataset.from_dict({
        'text': train_text,
        'label': train_label
    }),
    'valid': Dataset.from_dict({
        'text': valid_text,
        'label': valid_label
    }),
    'test': Dataset.from_dict({
        'text': test_text,
        'label': test_label
    })
}

In [41]:
dataset = DatasetDict(data)

In [100]:
# Write a tokenizer function
def tokenize_data(data):
    return tokenizer(
        data['text'], truncation=True, max_length=300, padding='max_length', 
        add_special_tokens=True, 
    )

In [101]:
encoded_dataset = dataset.map(tokenize_data, batched=True)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [102]:
columns_to_return = ['input_ids', 'attention_mask', 'label']
encoded_dataset.set_format(type='torch', columns=columns_to_return)

In [103]:
encoded_dataset['train']

TypeError: can't convert np.ndarray of type numpy.str_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.

In [80]:
train_args = TrainingArguments(
    'distil-classifier',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    report_to='none'
)

In [59]:
metric = load_metric('accuracy')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1362.0, style=ProgressStyle(description…




In [60]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions[:, 0]
    return metric.compute(predictions=predictions, references=labels)

In [93]:
trainer = Trainer(
    model, train_args, train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['valid'], tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [94]:
trainer.train()

TypeError: new(): invalid data type 'numpy.str_'