In [50]:
from dataset import Dataset4Pandas
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from model import Model
import torch.nn as nn
from torch.optim import AdamW
from tqdm import tqdm
import torch
import numpy as np
from sklearn.metrics import classification_report

## Load data

In [2]:
data = pd.read_csv("/home/koynov/sentimentanalysistask/data/training_with_cluster_lbls.csv")

In [3]:
unclustered = data[data["augmentation_cluster_lbl"]==-1]

In [4]:
clustered = data[data["augmentation_cluster_lbl"]!=-1]

In [5]:
unique_clusters = clustered["company_cluster_lbl"].unique()

In [6]:
train_clusters, test_clusters = train_test_split(unique_clusters, random_state = 42,test_size = 0.2)

In [7]:
train_unclustered, test_unclustered = train_test_split(unclustered, random_state=42, stratify= unclustered.label,test_size = 0.2)

In [8]:
train_clustered = clustered[clustered["company_cluster_lbl"].isin(train_clusters)]

In [9]:
test_clustered = clustered[clustered["company_cluster_lbl"].isin(test_clusters)]

In [10]:
train = pd.concat([train_clustered, train_unclustered])

In [11]:
test = pd.concat([test_clustered, test_unclustered])

## Create Dataset Objects

In [13]:
train_ds = Dataset4Pandas(train, 
                     text_column="text", 
                     label_column="encoded_label",
                     company_column="company")

In [14]:
test_ds = Dataset4Pandas(test, 
                     text_column="text", 
                     label_column="encoded_label",
                     company_column="company")

In [15]:
dataloader_train = DataLoader(train_ds, collate_fn=Dataset4Pandas.collate_fn, batch_size=8, shuffle=True)

In [16]:
dataloader_test = DataLoader(test_ds, collate_fn=Dataset4Pandas.collate_fn, batch_size=8, shuffle=False)

## Train

In [17]:
device = "cpu"

In [18]:
model = Model(companies_list=list(train.company.unique())).to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
loss = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr = 3.5e-5)

In [20]:
epochs = 3

In [32]:
for epoch in range(1, epochs+1):
    pbar_train = tqdm(enumerate(dataloader_train), total = len(dataloader_train))
    pbar_train.set_description(f"training epoch: {epoch}/{epochs}")
    
    losses_in_epoch = []
    acc_in_epoch = []
    losses_in_epoch_test = []
    acc_in_epoch_test = []
    model.train()
    for i, (texts, labels, company_names) in pbar_train:
        labels = labels.to(device)
        texts = texts.to(device)
        
        preds = model(texts, company_names)
        loss_ = loss(preds, labels)
        
        optimizer.zero_grad()
        loss_.backward()
        optimizer.step()
        losses_in_epoch.append(loss_.item())
        
        acc = (preds.argmax(1)==labels).type(torch.float32).mean()
        acc_in_epoch.append(acc.item())
        pbar_train.set_postfix(loss = sum(losses_in_epoch)/(i+1), acc = sum(acc_in_epoch)/(i+1))
    model.eval()
    pbar_test = tqdm(enumerate(dataloader_test), total = len(dataloader_test), leave = True)
    pbar_test.set_description(f"testing epoch: {epoch}/{epochs}")
    for i, (texts, labels, company_names) in pbar_test:
        labels = labels.to(device)
        texts = texts.to(device)
        
        preds = model(texts, company_names)
        loss_ = loss(preds, labels)
  
        losses_in_epoch_test.append(loss_.item())
        
        acc = (preds.argmax(1)==labels).type(torch.float32).mean()
        acc_in_epoch_test.append(acc.item())
        
        pbar_test.set_postfix(loss = sum(losses_in_epoch_test)/(i+1), acc = sum(acc_in_epoch_test)/(i+1))
    torch.save(model.state_dict(), f"saved_checkpoint_{epoch}.ckpt")


training epoch: 1/3: 100%|██████████| 13/13 [00:44<00:00,  3.44s/it, acc=0.404, loss=1.34]
testing epoch: 1/3: 100%|██████████| 13/13 [00:09<00:00,  1.41it/s, acc=0.337, loss=1.4] 
training epoch: 2/3: 100%|██████████| 13/13 [00:30<00:00,  2.37s/it, acc=0.404, loss=1.31]
testing epoch: 2/3: 100%|██████████| 13/13 [00:08<00:00,  1.61it/s, acc=0.298, loss=1.47]
training epoch: 3/3: 100%|██████████| 13/13 [00:28<00:00,  2.23s/it, acc=0.462, loss=1.19]
testing epoch: 3/3: 100%|██████████| 13/13 [00:08<00:00,  1.59it/s, acc=0.269, loss=1.44]


## Test

In [33]:
val = pd.read_csv("./data/validation_with_encoded_lbl.csv").dropna()

In [42]:
val_dataset = Dataset4Pandas(val,
                             text_column="text", 
                             label_column="encoded_label",
                             company_column="company")

In [40]:
val_dataloader =  DataLoader(val_dataset, collate_fn=Dataset4Pandas.collate_fn, batch_size=8, shuffle=False)

In [60]:
pbar_val = tqdm(enumerate(val_dataloader), total = len(val_dataloader), leave = True)
pbar_val.set_description(f"testing epoch: {epoch}/{epochs}")
predictions = []
val_loss = []
val_acc = []
model.eval()
for i, (texts, labels, company_names) in pbar_val:
    labels = labels.to(device)
    texts = texts.to(device)

    preds = model(texts, company_names)
    loss_ = loss(preds, labels)

    val_loss.append(loss_.item())
    preds = preds.argmax(1).detach()#.cpu().numpy()
    acc = (preds==labels).type(torch.float32).mean()
    val_acc.append(acc.item())
    predictions.append(preds.cpu().numpy())
    pbar_val.set_postfix(loss = sum(val_loss)/(i+1), acc = sum(val_acc)/(i+1))

testing epoch: 3/3: 100%|██████████| 13/13 [00:12<00:00,  1.01it/s, acc=0.375, loss=1.37]


In [61]:
pd.DataFrame(classification_report(val_dataset.dataframe.encoded_label.values, np.hstack(predictions), output_dict=True))\
                    .rename(columns = {"0": 'Negative', "1": 'Neutral', "2": 'Positive', "3": 'Irrelevant'})

Unnamed: 0,Negative,Neutral,Positive,Irrelevant,accuracy,macro avg,weighted avg
precision,0.173077,0.357143,0.5,0.363636,0.28,0.348464,0.352687
recall,0.409091,0.185185,0.25,0.296296,0.28,0.285143,0.28
f1-score,0.243243,0.243902,0.333333,0.326531,0.28,0.286752,0.28753
support,22.0,27.0,24.0,27.0,0.28,100.0,100.0
