In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import torch 
import transformers
from collections import Counter

In [None]:
torch.__version__

In [None]:
torch.cuda.is_available()

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig

In [None]:
import accelerate
from accelerate import Accelerator

In [None]:
from transformers import T5Tokenizer
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import RandomSampler, SequentialSampler

In [None]:
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter, MarkdownTextSplitter, MarkdownHeaderTextSplitter
from langchain.prompts import PromptTemplate, ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain_teddynote.retrievers import KiwiBM25Retriever
from langchain.retrievers import EnsembleRetriever, MultiQueryRetriever
from langchain.document_loaders import PDFPlumberLoader, PyMuPDFLoader, PyPDFLoader, UnstructuredPDFLoader

import peft
from peft import PeftModel

In [None]:
import datasets
from datasets import Dataset
from transformers import Trainer, TrainingArguments

In [None]:
from transformers import T5ForTokenClassification
from torch import nn

In [None]:
# Pretrained `t5-small` checkpoint loaded through the token-classification
# class; this is the network the optimizer and training cells operate on.
model = T5ForTokenClassification.from_pretrained('t5-small')

In [None]:
# Training hyperparameters shared by the dataloader and training-loop cells.
epochs = 5
batch_size = 16

# The tokenizer has to come from a tokenizer class. Loading
# `T5ForTokenClassification` here would bind `tokenizer` to a second copy of
# the network instead of something that can encode text.
tokenizer = T5Tokenizer.from_pretrained(
    pretrained_model_name_or_path='t5-small'
)


In [None]:
data = pd.read_csv("train.csv")

In [None]:
data

In [None]:
df = data['first_party'] + '[SEP]' + data['second_party'] +'[SEP]' + data['facts']

In [None]:
df

In [None]:
type(df)

In [None]:
df = pd.DataFrame(df)
df = pd.concat([df, data['first_party_winner']], axis=1)

In [None]:
df.columns = ['infos', 'label']

In [None]:
df

In [None]:
df['label'] = df['label'].astype(str)

In [None]:
df

In [None]:
def calc_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max class equals the label.

    Args:
        preds: Class scores, either a ``torch.Tensor`` (on any device, with or
            without gradient tracking) or a NumPy array. The arg-max is taken
            along axis 1.
        labels: Ground-truth class ids as a ``torch.Tensor`` or NumPy array;
            flattened before comparison.

    Returns:
        Accuracy as a float in ``[0, 1]``.
    """
    # Detach before converting: `.numpy()` raises on tensors that require grad.
    if isinstance(preds, torch.Tensor):
        preds = preds.detach().cpu().numpy()
    if isinstance(labels, torch.Tensor):
        labels = labels.detach().cpu().numpy()

    pred_flat = np.argmax(np.asarray(preds), axis=1).flatten()
    labels_flat = np.asarray(labels).flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [None]:
df.iloc[:, 1].values

In [None]:
def make_dataset(data, tokenizer, device):
    """Tokenize the raw case table and bundle it with its labels.

    NOTE(review): `make_dataset` is defined again in a later cell of this
    notebook; whichever cell runs last is the one in effect.

    Args:
        data: DataFrame with string columns ``first_party``, ``second_party``
            and ``facts`` plus the target column ``first_party_winner``.
        tokenizer: Callable that accepts a list of strings together with the
            ``padding``, ``truncation`` and ``return_tensors`` keyword
            arguments and returns a mapping holding ``input_ids`` and
            ``attention_mask`` tensors.
        device: Device the resulting tensors are moved to.

    Returns:
        ``TensorDataset`` of ``(input_ids, attention_mask, labels)``.
    """
    combined_text = (
        data['first_party'] + " [SEP] " + data['second_party'] + " [SEP] " + data['facts']
    ).tolist()

    # Store the encoding under its own name; rebinding `tokenizer` left the
    # `tokenized` lookups below undefined.
    tokenized = tokenizer(
        combined_text,
        padding='longest',
        truncation=True,
        return_tensors='pt'
    )
    input_ids = tokenized['input_ids'].to(device)
    attention_mask = tokenized['attention_mask'].to(device)

    # Select the target by name: a positional lookup depends on column order
    # and does not point at the label column of the raw table.
    labels = torch.tensor(
        data['first_party_winner'].astype(int).values, dtype=torch.long
    ).to(device)
    return TensorDataset(input_ids, attention_mask, labels)

In [None]:
help(tokenizer)

In [None]:
# NOTE(review): no earlier cell in this notebook assigns `outputs` or `labels`
# at module level, so this looks like a leftover debugging cell that raises
# NameError on a fresh kernel — confirm, then move it after a forward pass or
# remove it.
print(outputs.logits.shape) 
print(labels.shape)   

In [None]:
data

In [None]:
df.iloc[:, -1] = df.iloc[:, -1].replace('positive', 1).replace('negative', 0)

In [None]:
df

In [None]:
target = data.iloc[:, -1].astype(str)

In [None]:
device = 'cpu'

In [None]:
data.iloc[:, -1].astype(str).values

In [None]:
target = data.iloc[:, -1].astype(str).values

In [None]:
def make_dataset(data, tokenizer, device, max_length=128):
    """Encode the ``infos`` column and pair it with integer labels.

    Args:
        data: DataFrame with a text column ``infos`` and a column ``label``
            whose values can be cast to ``int``.
        tokenizer: Callable tokenizer accepting the ``text``, ``padding``,
            ``max_length``, ``truncation`` and ``return_tensors`` keyword
            arguments and returning a mapping with ``input_ids`` and
            ``attention_mask`` tensors.
        device: Device the tensors are moved to.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        ``TensorDataset`` of ``(input_ids, attention_mask, labels)`` with one
        entry per row of ``data``.
    """
    source = tokenizer(
        text=data.infos.tolist(),
        padding='max_length',
        max_length=max_length,
        truncation=True,
        return_tensors='pt'
    )

    # Deliberately no `.squeeze()`: for a one-row frame it would drop the batch
    # dimension and `TensorDataset` would reject the size mismatch with `labels`.
    input_ids = source['input_ids'].to(device)
    attention_mask = source['attention_mask'].to(device)

    # Force int64 so the labels are valid class indices for cross-entropy
    # regardless of the platform's default integer width.
    labels = torch.tensor(data.label.astype(int).values, dtype=torch.long).to(device)

    return TensorDataset(input_ids, attention_mask, labels)

In [None]:
tokenizer = T5Tokenizer.from_pretrained(
    pretrained_model_name_or_path= 't5-small'
)

In [None]:
df

In [None]:
make_dataset(df, tokenizer, device='cpu')

In [None]:
def get_dataloader(dataset, sampler, batch_size):
    """Wrap ``dataset`` in a ``DataLoader`` driven by the given sampler class.

    Args:
        dataset: Dataset to iterate over.
        sampler: Sampler class (not an instance); it is instantiated with
            ``dataset`` as its only argument.
        batch_size: Number of samples per batch.

    Returns:
        The configured ``DataLoader``.
    """
    return DataLoader(dataset, batch_size=batch_size, sampler=sampler(dataset))

In [None]:
# Shuffle once with a fixed seed, then cut 60% / 20% / 20% by position.
# Plain `iloc` slices keep the original index labels and avoid `np.split` on a
# DataFrame, which relies on the deprecated `DataFrame.swapaxes`.
shuffled = df.sample(frac=1, random_state=42)
train_end = int(0.6 * len(df))
valid_end = int(0.8 * len(df))

train_data = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

In [None]:
# Training batches are drawn in random order. Validation and test batches are
# read sequentially so repeated evaluations see exactly the same batches.
train_dataset = make_dataset(train_data, tokenizer, device='cpu')
train_dataloader = get_dataloader(train_dataset, RandomSampler, batch_size)

valid_dataset = make_dataset(valid, tokenizer, device='cpu')
valid_dataloader = get_dataloader(valid_dataset, SequentialSampler, batch_size)

test_dataset = make_dataset(test, tokenizer, device='cpu')
test_dataloader = get_dataloader(test_dataset, SequentialSampler, batch_size)

In [None]:
from torch import optim

In [None]:
optimizer = optim.Adam(model.parameters(), lr=1e-5, eps=1e-8)

In [None]:
def train(model, optimizer, dataloader):
    """Run one training epoch and return the mean per-batch loss.

    The model is expected to accept ``input_ids``, ``attention_mask`` and
    ``labels`` keyword arguments and to expose the computed loss as ``.loss``
    on its output.

    NOTE(review): `train` is defined again in later cells of this notebook;
    whichever cell runs last is the one in effect.

    Args:
        model: Module to optimise; switched to training mode.
        optimizer: Optimizer stepping the model's parameters.
        dataloader: Iterable of ``(input_ids, attention_mask, labels)`` batches.

    Returns:
        Sum of the batch losses divided by ``len(dataloader)``.
    """
    model.train()
    batch_losses = []

    for batch in dataloader:
        ids, mask, targets = batch
        result = model(input_ids=ids, attention_mask=mask, labels=targets)

        batch_loss = result.loss
        batch_losses.append(batch_loss.item())

        # Clear stale gradients right before computing fresh ones.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

    return sum(batch_losses) / len(dataloader)

In [None]:
model

In [None]:
def train(model, optimizer, dataloader):
    """Run one training epoch and return the mean per-batch loss.

    The model is expected to accept ``input_ids``, ``attention_mask`` and
    ``labels`` keyword arguments and to expose the computed loss as ``.loss``
    on its output.

    NOTE(review): `train` is defined in more than one cell of this notebook;
    whichever cell runs last is the one in effect.

    Args:
        model: Module to optimise; switched to training mode.
        optimizer: Optimizer stepping the model's parameters.
        dataloader: Iterable of ``(input_ids, attention_mask, labels)`` batches.

    Returns:
        Sum of the batch losses divided by ``len(dataloader)``.
    """
    model.train()
    train_loss = 0.0

    for input_ids, attention_mask, labels in dataloader:
        # Reset gradients first so each step only sees this batch.
        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss

        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    return train_loss / len(dataloader)

In [None]:
import torch
import torch.nn.functional as F

def train(model, optimizer, dataloader):
    """Run one training epoch with an explicit cross-entropy loss.

    The model is called without ``labels`` and the loss is computed here from
    ``outputs.logits``. When the model emits one logit vector per token
    (3-D logits) but the labels hold one class per sequence (1-D), the token
    logits are averaged over the positions selected by ``attention_mask`` so
    both sides describe the same number of examples. Otherwise logits and
    labels are flattened and compared element-wise.

    This is the last `train` definition in the visible notebook, so it is the
    one in effect after a top-to-bottom run.

    Args:
        model: Module returning an object with a ``logits`` attribute when
            called with ``input_ids`` and ``attention_mask``.
        optimizer: Optimizer stepping the model's parameters.
        dataloader: Iterable of ``(input_ids, attention_mask, labels)`` batches.

    Returns:
        Mean loss over the batches as a float, or ``0.0`` for an empty loader.
    """
    model.train()
    loss_fct = torch.nn.CrossEntropyLoss()
    total_loss = 0.0

    for input_ids, attention_mask, labels in dataloader:
        optimizer.zero_grad()

        # Labels are kept out of the forward pass: the loss is computed below,
        # after logits and labels have been brought to matching shapes.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        logits = outputs.logits

        if logits.dim() == 3 and labels.dim() == 1:
            # Per-token logits with per-sequence labels: masked mean over tokens.
            mask = attention_mask.unsqueeze(-1).to(logits.dtype)
            logits = (logits * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
        else:
            logits = logits.reshape(-1, logits.size(-1))
            labels = labels.reshape(-1)

        loss = loss_fct(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    num_batches = len(dataloader)
    return total_loss / num_batches if num_batches else 0.0

In [None]:

def evaluation(model, dataloader):
    """Compute mean loss and accuracy over ``dataloader`` without gradients.

    The model is called without ``labels`` and the loss is computed here from
    ``outputs.logits``. When the model emits one logit vector per token
    (3-D logits) but the labels hold one class per sequence (1-D), the token
    logits are averaged over the positions selected by ``attention_mask``
    before the loss and the arg-max prediction are taken.

    Args:
        model: Module returning an object with a ``logits`` attribute when
            called with ``input_ids`` and ``attention_mask``; switched to
            evaluation mode.
        dataloader: Iterable of ``(input_ids, attention_mask, labels)`` batches.

    Returns:
        Tuple ``(val_loss, val_accuracy)`` of floats: the mean per-batch loss
        and the fraction of correctly classified labels over all batches.
        Both are ``0.0`` when the loader yields nothing.
    """
    model.eval()
    criterion = nn.CrossEntropyLoss()
    total_loss = 0.0
    correct, seen = 0, 0

    with torch.no_grad():
        for input_ids, attention_mask, labels in dataloader:
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            logits = outputs.logits

            if logits.dim() == 3 and labels.dim() == 1:
                # Per-token logits with per-sequence labels: masked mean over tokens.
                mask = attention_mask.unsqueeze(-1).to(logits.dtype)
                logits = (logits * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
            else:
                logits = logits.reshape(-1, logits.size(-1))
                labels = labels.reshape(-1)

            # `.item()` keeps the running total a plain float, not a tensor.
            total_loss += criterion(logits, labels).item()

            # Count hits instead of averaging batch accuracies, so a short
            # final batch is not over-weighted.
            correct += (logits.argmax(dim=-1) == labels).sum().item()
            seen += labels.numel()

    num_batches = len(dataloader)
    val_loss = total_loss / num_batches if num_batches else 0.0
    val_accuracy = correct / seen if seen else 0.0
    return val_loss, val_accuracy

In [None]:
16*128

In [None]:
for batch in train_dataloader:
    print(type(batch))
    print(batch)
    break

In [None]:
# Track the lowest validation loss seen so far. Starting from +inf means the
# first epoch always registers as an improvement, whatever the loss scale.
best_loss = float('inf')
for epoch in range(epochs):
    train_loss = train(model, optimizer, train_dataloader)
    val_loss, val_accuracy = evaluation(model, valid_dataloader)
    print(f'Epoch: {epoch+1} train loss: {train_loss:.4f} val loss: {val_loss:.4f} val_acc: {val_accuracy:.4f}')

    if val_loss < best_loss:
        best_loss = val_loss

In [None]:
train_dataset[0]

In [None]:
train_dataset[3]

In [None]:
len(train_dataset[3][0])

In [None]:
train_data

In [None]:
train_data.loc[209, 'infos']