In [18]:
import numpy as np
import pandas as pd

import logging

# Hide logging messages: raise the root logger's threshold to CRITICAL so
# that only critical records are emitted.
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)
# IPython magic: sets the CATALYST_LOG_LEVEL environment variable for this kernel.
%env CATALYST_LOG_LEVEL = 15
#!pip install tensorflow-macos
from transformers import BertModel, BertTokenizer
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import torch
from sklearn import metrics
from torch import cuda

env: CATALYST_LOG_LEVEL=15


In [19]:
# Read the three splits from CSV files in the working directory.
train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('train.csv', 'val.csv', 'test.csv')
)

#set(val_df['author'].tolist()+train_df['author'].tolist())

In [20]:
# --- Hyperparameters ---------------------------------------------------------
MAX_LEN = 512             # handed to the datasets as the tokenizer's max_length
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4      # used for both the validation and the test loader
EPOCHS = 5
LEARNING_RATE = 2e-05

# Every column other than 'text' counts as one output of the classifier.
NUM_CLASSES = len(train_df.columns.drop('text'))

# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Tokenizer for the pre-trained BERT checkpoint (only the tokenizer is loaded here).
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one row of text per item.

    Every column of ``dataframe`` other than ``text`` is used as a target
    value for that row.

    Args:
        dataframe: pandas DataFrame with a ``text`` column plus target columns.
        tokenizer: object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        max_len: length every encoded sequence is padded or truncated to.
        device: device the returned tensors are created on. When omitted,
            ``'cuda'`` is used if CUDA is available, otherwise ``'cpu'``.
    """

    def __init__(self, dataframe, tokenizer, max_len, device=None):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.drop('text', axis=1).values
        self.max_len = max_len
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = device

    def __len__(self):
        """Return the number of text rows."""
        return len(self.text)

    def __getitem__(self, index):
        """Encode row ``index`` and return its tensors.

        Returns:
            dict with ``ids``, ``mask`` and ``token_type_ids`` (long tensors of
            length ``max_len``) and ``targets`` (float tensor), all on
            ``self.device``.
        """
        inputs = self.tokenizer.encode_plus(
            # Positional lookup: works even if the frame's index is not 0..n-1.
            self.text.iloc[index],
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Pad to a fixed length so items can be stacked into a batch, and
            # truncate longer texts so they never exceed ``max_len`` tokens.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )

        return {
            'ids': torch.tensor(inputs['input_ids'],
                                dtype=torch.long, device=self.device),
            'mask': torch.tensor(inputs['attention_mask'],
                                 dtype=torch.long, device=self.device),
            'token_type_ids': torch.tensor(inputs['token_type_ids'],
                                           dtype=torch.long, device=self.device),
            'targets': torch.tensor(self.targets[index],
                                    dtype=torch.float, device=self.device)
        }


# Wrap each split in a CustomDataset. The index is reset so that rows are
# numbered 0..n-1, matching the integer indices the DataLoader asks for.
training_set = CustomDataset(train_df.reset_index(drop=True), tokenizer, MAX_LEN)
val_set = CustomDataset(val_df.reset_index(drop=True), tokenizer, MAX_LEN)
test_set = CustomDataset(test_df.reset_index(drop=True), tokenizer, MAX_LEN)

# Training batches are reshuffled every epoch.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# Evaluation loaders keep the original row order: shuffling gains nothing
# for evaluation, and predictions written back to the DataFrame by position
# would otherwise be attached to the wrong rows.
test_params = {'batch_size': VALID_BATCH_SIZE,
               'shuffle': False,
               'num_workers': 0
               }

training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(val_set, **test_params)
test_loader = DataLoader(test_set, **test_params)


In [22]:
# Pre-trained BERT encoder (no task head of its own).
model = BertModel.from_pretrained(model_name)

# Freeze all pre-trained weights up front.
model.requires_grad_(False)

# Attach a new classification head. It is created after the freeze above,
# so its parameters keep the default requires_grad=True.
model.classifier = torch.nn.Sequential(
    torch.nn.Dropout(0.1),
    torch.nn.Linear(model.config.hidden_size, NUM_CLASSES)
)

# Make the last 4 encoder layers trainable again.
model.encoder.layer[-4:].requires_grad_(True)

# Aliases of the raw frames plus the optimizer used for fine-tuning.
train_dataset = train_df
dev_dataset = val_df
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

model.to(device)
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [23]:
# train_inputs = tokenizer(train_dataset['text'].tolist(),
#                          padding=True,
#                          truncation=True,
#                          return_tensors='pt')
# train_labels = torch.tensor(train_dataset.drop('text', axis=1).values) \
#     .to(device, dtype=torch.float)
# val_inputs = tokenizer(dev_dataset['text'].tolist(), padding=True, truncation=True, return_tensors='pt')
# val_labels = torch.tensor(dev_dataset.drop('text', axis=1).values) \
#     .to(device, dtype=torch.float)


In [25]:
def _predict(loader):
    """Run the model over ``loader`` in eval mode without tracking gradients.

    Args:
        loader: iterable of batches as produced by ``CustomDataset``
            (keys ``ids``, ``mask``, ``token_type_ids``, ``targets``).

    Returns:
        Tuple ``(targets, preds)`` of 1-D CPU tensors holding, per example,
        the argmax of the target vector and the argmax of the classifier
        logits. Both are empty when ``loader`` yields no batches.
    """
    model.eval()
    batch_targets, batch_preds = [], []
    with torch.no_grad():
        for data in loader:
            outputs = model(data['ids'],
                            attention_mask=data['mask'],
                            token_type_ids=data['token_type_ids'])
            logits = model.classifier(outputs.pooler_output)
            batch_preds.append(torch.argmax(logits, dim=1).cpu())
            batch_targets.append(torch.argmax(data['targets'], dim=1).cpu())
    if not batch_preds:
        empty = torch.empty(0, dtype=torch.long)
        return empty, empty.clone()
    return torch.cat(batch_targets), torch.cat(batch_preds)


# The loss has no per-step state, so build it once outside the loops.
loss_fn = torch.nn.BCEWithLogitsLoss()

for epoch in range(EPOCHS):
    model.train()
    for step, data in enumerate(training_loader):
        optimizer.zero_grad()
        outputs = model(data['ids'],
                        attention_mask=data['mask'],
                        token_type_ids=data['token_type_ids'])
        logits = model.classifier(outputs.pooler_output)
        loss = loss_fn(logits, data['targets'])
        loss.backward()
        optimizer.step()
        if step % 100 == 0:
            print(f"Epoch {epoch + 1}, Loss= {loss.item()}")

    # Evaluate the model on the dev set (iterate the loader, not the DataFrame).
    val_targets, val_preds = _predict(val_loader)
    acc = (val_targets == val_preds).float().mean().item()
    print(f"Epoch {epoch + 1}, Dev accuracy = {acc}")

# Use the fine-tuned model to make predictions on the test split.
# A dedicated non-shuffled loader guarantees that prediction i belongs to
# row i of test_df when the columns are assigned below.
ordered_test_loader = DataLoader(test_set, batch_size=VALID_BATCH_SIZE, shuffle=False)
test_targets, test_preds = _predict(ordered_test_loader)
test_df['truth'] = test_targets.tolist()
test_df['pred'] = test_preds.tolist()

# Selecting several columns needs a list, hence the double brackets.
test_df[['text', 'truth', 'pred']]

Epoch 1, Loss= 0.6688206791877747


KeyboardInterrupt: 