<a href="https://colab.research.google.com/github/erik-koynov/sentimentanalysistask/blob/master/Train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')
!cd /content/drive/MyDrive/sentimentanalysistask; git pull;

Mounted at /content/drive
Already up to date.


In [2]:
# Switch the working directory to the project checkout on Drive; the relative
# `./data/...` paths read later in this notebook are resolved against it.
# NOTE(review): the `dataset` / `model` imports below presumably also rely on this — confirm.
import os
os.chdir("/content/drive/MyDrive/sentimentanalysistask")
os.getcwd()

'/content/drive/MyDrive/sentimentanalysistask'

In [3]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.1-py3-none-any.whl (6.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m55.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m75.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.2-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.2 tokenizers-0.13.2 transformers-4.27.1


In [4]:
from dataset import Dataset4Pandas
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from model import Model
import torch.nn as nn
from torch.optim import AdamW
from tqdm import tqdm
import torch
import numpy as np
from sklearn.metrics import classification_report

## Load data

In [5]:
# Training table; the cells below read its `augmentation_cluster_lbl`,
# `company_cluster_lbl`, `label`, `encoded_label`, `text` and `company` columns.
data = pd.read_csv("./data/training_with_cluster_lbls.csv")

In [6]:
# Rows whose augmentation cluster label is the sentinel -1.
unclustered = data.loc[data["augmentation_cluster_lbl"].eq(-1)]

In [7]:
# Complement of `unclustered`: rows with any augmentation cluster label other than -1.
clustered = data.loc[data["augmentation_cluster_lbl"].ne(-1)]

In [8]:
# Distinct `company_cluster_lbl` values among the clustered rows (order of first appearance).
unique_clusters = pd.unique(clustered["company_cluster_lbl"])

In [9]:
# Split at the cluster level (80/20, seeded): whole clusters go to one side or the other.
train_clusters, test_clusters = train_test_split(
    unique_clusters,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Unclustered rows are split row-wise (80/20, seeded), stratified on the `label` column.
train_unclustered, test_unclustered = train_test_split(
    unclustered,
    test_size=0.2,
    stratify=unclustered["label"],
    random_state=42,
)

In [11]:
# Clustered rows whose cluster was assigned to the training side.
train_clustered = clustered.loc[clustered["company_cluster_lbl"].isin(train_clusters)]

In [12]:
# Clustered rows whose cluster was assigned to the test side.
test_clustered = clustered.loc[clustered["company_cluster_lbl"].isin(test_clusters)]

In [13]:
# Final training frame: clustered part followed by unclustered part (original indices kept).
train = pd.concat(objs=[train_clustered, train_unclustered], axis=0)

In [14]:
# Final test frame: clustered part followed by unclustered part (original indices kept).
test = pd.concat(objs=[test_clustered, test_unclustered], axis=0)

## Create Dataset Objects

In [15]:
# Wrap the training frame; the keyword arguments name the columns holding the
# text, the encoded label and the company.
train_ds = Dataset4Pandas(
    train,
    company_column="company",
    label_column="encoded_label",
    text_column="text",
)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [16]:
# Same wrapping as for the training frame, applied to the test frame.
test_ds = Dataset4Pandas(
    test,
    company_column="company",
    label_column="encoded_label",
    text_column="text",
)

In [18]:
# Training batches of 8, reshuffled on every pass over the loader. A dedicated
# seeded generator makes the shuffle order reproducible across kernel restarts,
# in line with the seeded (random_state=42) splits earlier in the notebook.
dataloader_train = DataLoader(
    train_ds,
    collate_fn=Dataset4Pandas.collate_fn,
    batch_size=8,
    shuffle=True,
    generator=torch.Generator().manual_seed(42),
)

In [19]:
# Evaluation batches of 8 in a fixed order (no shuffling).
dataloader_test = DataLoader(
    dataset=test_ds,
    batch_size=8,
    shuffle=False,
    collate_fn=Dataset4Pandas.collate_fn,
)

## Train

In [20]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on a runtime
# without CUDA instead of failing at the first `.to(device)` call.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [21]:
# Instantiate the model with the distinct companies present in the training frame
# and move it to the selected device.
model = Model(
    companies_list=list(train["company"].unique()),
).to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [22]:
# AdamW updates every model parameter; cross-entropy compares the model's
# predictions with the integer class labels.
optimizer = AdamW(params=model.parameters(), lr=3.5e-5)
loss = nn.CrossEntropyLoss()

In [23]:
# Number of full passes over the training loader; also used in the progress-bar labels.
epochs = 2

In [24]:
# Fine-tune for `epochs` passes over `dataloader_train`. After every pass the model
# is scored on `dataloader_test` and its weights are written to a per-epoch checkpoint.
for epoch in range(1, epochs+1):
    pbar_train = tqdm(enumerate(dataloader_train), total = len(dataloader_train))
    pbar_train.set_description(f"training epoch: {epoch}/{epochs}")

    # Per-batch metrics, reset each epoch; the progress bars display their running mean.
    losses_in_epoch = []
    acc_in_epoch = []
    losses_in_epoch_test = []
    acc_in_epoch_test = []

    # ---- optimisation pass ----
    model.train()
    for i, (texts, labels, company_names) in pbar_train:
        labels = labels.to(device)
        texts = texts.to(device)

        preds = model(texts, company_names)
        loss_ = loss(preds, labels)

        optimizer.zero_grad()
        loss_.backward()
        optimizer.step()
        losses_in_epoch.append(loss_.item())

        # Batch accuracy: fraction of rows whose highest-scoring class equals the label.
        acc = (preds.argmax(1)==labels).type(torch.float32).mean()
        acc_in_epoch.append(acc.item())
        pbar_train.set_postfix(loss = sum(losses_in_epoch)/(i+1), acc = sum(acc_in_epoch)/(i+1))

    # ---- evaluation pass ----
    model.eval()
    pbar_test = tqdm(enumerate(dataloader_test), total = len(dataloader_test), leave = True)
    pbar_test.set_description(f"testing epoch: {epoch}/{epochs}")
    # No gradients are needed for scoring; disabling autograd avoids building a
    # graph for every batch and so saves memory and time.
    with torch.no_grad():
        for i, (texts, labels, company_names) in pbar_test:
            labels = labels.to(device)
            texts = texts.to(device)

            preds = model(texts, company_names)
            loss_ = loss(preds, labels)

            losses_in_epoch_test.append(loss_.item())

            acc = (preds.argmax(1)==labels).type(torch.float32).mean()
            acc_in_epoch_test.append(acc.item())

            pbar_test.set_postfix(loss = sum(losses_in_epoch_test)/(i+1), acc = sum(acc_in_epoch_test)/(i+1))

    # One checkpoint per epoch, written to the current working directory.
    torch.save(model.state_dict(), f"saved_checkpoint_{epoch}.ckpt")


training epoch: 1/2: 100%|██████████| 7384/7384 [17:24<00:00,  7.07it/s, acc=0.654, loss=0.866]
testing epoch: 1/2: 100%|██████████| 1866/1866 [00:55<00:00, 33.61it/s, acc=0.625, loss=1.12]
training epoch: 2/2: 100%|██████████| 7384/7384 [17:18<00:00,  7.11it/s, acc=0.893, loss=0.306]
testing epoch: 2/2: 100%|██████████| 1866/1866 [00:57<00:00, 32.60it/s, acc=0.667, loss=1.2]


## Test (held-out validation file)

In [25]:
# Validation table with every row that contains a missing value removed.
val = (
    pd.read_csv("./data/validation_with_encoded_lbl.csv")
    .dropna()
)

In [26]:
# Wrap the validation frame using the same column mapping as the train/test datasets.
val_dataset = Dataset4Pandas(
    val,
    company_column="company",
    label_column="encoded_label",
    text_column="text",
)

In [27]:
# Fixed-order batches of 8, so the collected predictions line up with the rows of the frame.
val_dataloader = DataLoader(
    dataset=val_dataset,
    batch_size=8,
    shuffle=False,
    collate_fn=Dataset4Pandas.collate_fn,
)

In [28]:
# Score the trained model on the validation loader, collecting per-batch loss and
# accuracy plus the predicted class ids (used by the classification report below).
pbar_val = tqdm(enumerate(val_dataloader), total = len(val_dataloader), leave = True)
# `epoch` here is the value left over from the training loop (its last epoch).
pbar_val.set_description(f"testing epoch: {epoch}/{epochs}")
predictions = []
val_loss = []
val_acc = []
model.eval()
# Inference only: disabling autograd avoids building a graph for every batch.
with torch.no_grad():
    for i, (texts, labels, company_names) in pbar_val:
        labels = labels.to(device)
        texts = texts.to(device)

        preds = model(texts, company_names)
        loss_ = loss(preds, labels)

        val_loss.append(loss_.item())
        # Highest-scoring class per row.
        preds = preds.argmax(1)
        acc = (preds==labels).type(torch.float32).mean()
        val_acc.append(acc.item())
        predictions.append(preds.cpu().numpy())
        pbar_val.set_postfix(loss = sum(val_loss)/(i+1), acc = sum(val_acc)/(i+1))

testing epoch: 2/2: 100%|██████████| 125/125 [00:07<00:00, 17.25it/s, acc=0.916, loss=0.319]


In [29]:
# Per-class precision / recall / F1 / support on the validation file, shown as a
# table with the class-id columns relabelled for display.
(
    pd.DataFrame(
        classification_report(
            y_true=val_dataset.dataframe["encoded_label"].values,
            y_pred=np.hstack(predictions),
            output_dict=True,
        )
    )
    .rename(
        columns={
            "0": 'Negative',
            "1": 'Neutral',
            "2": 'Positive',
            "3": 'Irrelevant',
        }
    )
)

Unnamed: 0,Negative,Neutral,Positive,Irrelevant,accuracy,macro avg,weighted avg
precision,0.94697,0.914591,0.886207,0.921212,0.916,0.917245,0.91648
recall,0.93985,0.901754,0.927798,0.883721,0.916,0.913281,0.916
f1-score,0.943396,0.908127,0.906526,0.902077,0.916,0.915032,0.916025
support,266.0,285.0,277.0,172.0,0.916,1000.0,1000.0


In [42]:
# Show the learned parameters of the `MaddenNFL` entry of the model's aggregation dict.
[param for param in model.aggregation_dict.MaddenNFL.parameters()]

[Parameter containing:
 tensor([[1.0008, 0.9970, 0.9894, 0.9894]], device='cuda:0', requires_grad=True),
 Parameter containing:
 tensor([[0.9777, 0.9722, 0.9697, 0.9724]], device='cuda:0', requires_grad=True)]