In [1]:
!pip install -q git+https://github.com/huggingface/transformers.git

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
[K     |████████████████████████████████| 7.6 MB 32.5 MB/s 
[K     |████████████████████████████████| 163 kB 69.1 MB/s 
[?25h  Building wheel for transformers (PEP 517) ... [?25l[?25hdone


In [2]:
import torch
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import (TrainingArguments, Trainer, GPT2Config, GPT2Tokenizer, 
                          get_linear_schedule_with_warmup, GPT2ForSequenceClassification)

epochs = 5
batch_size = 32
max_length = 180
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_name_or_path = 'gpt2'
labels_ids = {'negative': 0, 'neutral': 1, 'positive': 2}
n_labels = len(labels_ids)

In [3]:
class CustomDataset(Dataset):
  def __init__(self, csv_file, use_tokenizer):
    self.texts = []
    self.labels = []
    
    df = pd.read_csv(csv_file)
    df.drop('Unnamed: 0', axis = 1, inplace = True)
    df.columns = ['reviews', 'sentiment']
    for i in range(len(df)):
      self.texts.append(df.reviews[i])
      self.labels.append(df.sentiment[i])

    self.n_examples = len(self.labels)
    return

  def __len__(self):
    return self.n_examples

  def __getitem__(self, item):
    return {'text': self.texts[item], 'label': self.labels[item]}

class Gpt2ClassificationCollator(object):
    def __init__(self, use_tokenizer, labels_encoder, max_sequence_len = None):
        self.use_tokenizer = use_tokenizer
        self.max_sequence_len = use_tokenizer.model_max_length if max_sequence_len is None else max_sequence_len
        self.labels_encoder = labels_encoder
        return

    def __call__(self, sequences):
        texts = [sequence['text'] for sequence in sequences]
        labels = [sequence['label'] for sequence in sequences]
        labels = [self.labels_encoder[label] for label in labels]
        inputs = self.use_tokenizer(text = texts, return_tensors = "pt", padding = True, 
                                    truncation = True, max_length = self.max_sequence_len)
        inputs.update({'labels': torch.tensor(labels)})
        return inputs

def train(dataloader, optimizer_, scheduler_, device_):
  global model

  predictions_labels = []
  true_labels = []
  total_loss = 0

  model.train()

  for batch in tqdm(dataloader, total = len(dataloader)):
    true_labels += batch['labels'].numpy().flatten().tolist()
    batch = {k: v.type(torch.long).to(device_) for k, v in batch.items()}
    model.zero_grad()

    outputs = model(**batch)
    loss, logits = outputs[:2]
    total_loss += loss.item()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer_.step()
    scheduler_.step()

    logits = logits.detach().cpu().numpy()
    predictions_labels += logits.argmax(axis = -1).flatten().tolist()

  avg_epoch_loss = total_loss/len(dataloader)
  return true_labels, predictions_labels, avg_epoch_loss

def validation(dataloader, device_):
  global model

  predictions_labels = []
  true_labels = []
  total_loss = 0
  model.eval()

  for batch in tqdm(dataloader, total = len(dataloader)):
    true_labels += batch['labels'].numpy().flatten().tolist()
    batch = {k: v.type(torch.long).to(device_) for k, v in batch.items()}

    with torch.no_grad():
        outputs = model(**batch)
        loss, logits = outputs[:2]
        
        logits = logits.detach().cpu().numpy()

        total_loss += loss.item()
        predict_content = logits.argmax(axis = -1).flatten().tolist()
        predictions_labels += predict_content

  avg_epoch_loss = total_loss/len(dataloader)
  return true_labels, predictions_labels, avg_epoch_loss

In [4]:
model_config = GPT2Config.from_pretrained(pretrained_model_name_or_path = model_name_or_path, num_labels = n_labels)

tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path = model_name_or_path)
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

model = GPT2ForSequenceClassification.from_pretrained(pretrained_model_name_or_path = model_name_or_path, config = model_config)
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = model.config.eos_token_id

model.to(device)

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid

In [5]:
gpt2_classificaiton_collator = Gpt2ClassificationCollator(use_tokenizer = tokenizer, 
                                                          labels_encoder = labels_ids, 
                                                          max_sequence_len = max_length)

train_dataset = CustomDataset('train.csv', use_tokenizer = tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size = batch_size, 
                              shuffle = True, collate_fn = gpt2_classificaiton_collator)

val_dataset =  CustomDataset('test.csv', use_tokenizer = tokenizer)
val_dataloader = DataLoader(val_dataset, batch_size = batch_size, 
                              shuffle = False, collate_fn = gpt2_classificaiton_collator)

In [6]:
optimizer = torch.optim.AdamW(model.parameters(), lr = 2e-5, eps = 1e-8)

total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)

all_loss = {'train_loss':[], 'val_loss':[]}
all_acc = {'train_acc':[], 'val_acc':[]}

print('Epoch')
for epoch in tqdm(range(epochs)):
  print()
  print('Training: ')
  train_labels, train_predict, train_loss = train(train_dataloader, optimizer, scheduler, device)
  train_acc = accuracy_score(train_labels, train_predict)

  print('Validation: ')
  val_labels, predictions, val_loss = validation(val_dataloader, device)
  val_acc = accuracy_score(val_labels, predictions)

  print("  train_loss: %.5f - val_loss: %.5f - train_acc: %.5f - valid_acc: %.5f"%(train_loss, val_loss, train_acc, val_acc))
  print()

  all_loss['train_loss'].append(train_loss)
  all_loss['val_loss'].append(val_loss)
  all_acc['train_acc'].append(train_acc)
  all_acc['val_acc'].append(val_acc)

Epoch


  0%|          | 0/5 [00:00<?, ?it/s]


Training: 


  0%|          | 0/147 [00:00<?, ?it/s]

Validation: 


  0%|          | 0/37 [00:00<?, ?it/s]

  train_loss: 1.02373 - val_loss: 0.90290 - train_acc: 0.52108 - valid_acc: 0.60051


Training: 


  0%|          | 0/147 [00:00<?, ?it/s]

Validation: 


  0%|          | 0/37 [00:00<?, ?it/s]

  train_loss: 0.88932 - val_loss: 0.85138 - train_acc: 0.60026 - valid_acc: 0.60051


Training: 


  0%|          | 0/147 [00:00<?, ?it/s]

Validation: 


  0%|          | 0/37 [00:00<?, ?it/s]

  train_loss: 0.81861 - val_loss: 0.76567 - train_acc: 0.63193 - valid_acc: 0.66724


Training: 


  0%|          | 0/147 [00:00<?, ?it/s]

Validation: 


  0%|          | 0/37 [00:00<?, ?it/s]

  train_loss: 0.74295 - val_loss: 0.70478 - train_acc: 0.68093 - valid_acc: 0.69204


Training: 


  0%|          | 0/147 [00:00<?, ?it/s]

Validation: 


  0%|          | 0/37 [00:00<?, ?it/s]

  train_loss: 0.68771 - val_loss: 0.67824 - train_acc: 0.70041 - valid_acc: 0.69803



In [7]:
test = pd.read_csv('test.csv')
test.drop('Unnamed: 0', axis = 1, inplace = True)

In [8]:
test['Pred'] = predictions
encoded_dict = {0: 'negative', 1: 'neutral', 2: 'positive'}
test['Pred'] = test.Pred.map(encoded_dict)

In [9]:
test.sample(10)

Unnamed: 0,Sentence,Sentiment,Pred
383,The company had net sales of EUR 19.8 mn and a...,neutral,neutral
721,The company may at any time have in its posses...,neutral,neutral
486,"A purchase agreement for 7,200 tons of gasolin...",positive,neutral
950,Scanfil expects net sales in 2008 to remain at...,neutral,neutral
509,The second variant offers complete final finis...,neutral,neutral
552,"Revenue for the quarter totaled 27.4 billion ,...",negative,neutral
925,` It is a testament to the quality of our LTE ...,neutral,positive
604,Joint procurement will be later extended to th...,neutral,neutral
560,Finnish power supply solutions and systems pro...,neutral,positive
553,An international conference call and audio web...,neutral,neutral
