In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import torch 
import transformers
from collections import Counter

In [2]:
torch.__version__

'2.4.1'

In [3]:
torch.cuda.is_available()

False

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig

In [5]:
import accelerate
from accelerate import Accelerator

In [6]:
from transformers import T5Tokenizer
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import RandomSampler, SequentialSampler

In [7]:
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter, MarkdownTextSplitter, MarkdownHeaderTextSplitter
from langchain.prompts import PromptTemplate, ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain_teddynote.retrievers import KiwiBM25Retriever
from langchain.retrievers import EnsembleRetriever, MultiQueryRetriever
from langchain.document_loaders import PDFPlumberLoader, PyMuPDFLoader, PyPDFLoader, UnstructuredPDFLoader

import peft
from peft import PeftModel



For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from .kiwi_bm25 import KiwiBM25Retriever


In [8]:
import datasets
from datasets import Dataset
from transformers import Trainer, TrainingArguments

In [9]:
from transformers import T5ForTokenClassification
from torch import nn

In [10]:
model = T5ForTokenClassification.from_pretrained(
    pretrained_model_name_or_path = 't5-small'
)

Some weights of T5ForTokenClassification were not initialized from the model checkpoint at t5-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
epochs = 5
batch_size = 16
tokenizer = T5ForTokenClassification.from_pretrained(
    pretrained_model_name_or_path= 't5-small'
)


Some weights of T5ForTokenClassification were not initialized from the model checkpoint at t5-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
data = pd.read_csv("train.csv")

In [13]:
data

Unnamed: 0,ID,first_party,second_party,facts,first_party_winner
0,TRAIN_0000,Phil A. St. Amant,Herman A. Thompson,"On June 27, 1962, Phil St. Amant, a candidate ...",1
1,TRAIN_0001,Stephen Duncan,Lawrence Owens,Ramon Nelson was riding his bike when he suffe...,0
2,TRAIN_0002,Billy Joe Magwood,"Tony Patterson, Warden, et al.",An Alabama state court convicted Billy Joe Mag...,1
3,TRAIN_0003,Linkletter,Walker,Victor Linkletter was convicted in state court...,0
4,TRAIN_0004,William Earl Fikes,Alabama,"On April 24, 1953 in Selma, Alabama, an intrud...",1
...,...,...,...,...,...
2473,TRAIN_2473,"HollyFrontier Cheyenne Refining, LLC, et al.","Renewable Fuels Association, et al.",Congress amended the Clean Air Act through the...,1
2474,TRAIN_2474,"Grupo Mexicano de Desarrollo, S. A.","Alliance Bond Fund, Inc.","Alliance Bond Fund, Inc., an investment fund, ...",1
2475,TRAIN_2475,Peguero,United States,"In 1992, the District Court sentenced Manuel D...",0
2476,TRAIN_2476,Immigration and Naturalization Service,St. Cyr,"On March 8, 1996, Enrico St. Cyr, a lawful per...",0


In [14]:
df = data['first_party'] + '[SEP]' + data['second_party'] +'[SEP]' + data['facts']

In [15]:
df

0       Phil A. St. Amant[SEP]Herman A. Thompson[SEP]O...
1       Stephen Duncan[SEP]Lawrence Owens[SEP]Ramon Ne...
2       Billy Joe Magwood[SEP]Tony Patterson, Warden, ...
3       Linkletter[SEP]Walker[SEP]Victor Linkletter wa...
4       William Earl Fikes[SEP]Alabama[SEP]On April 24...
                              ...                        
2473    HollyFrontier Cheyenne Refining, LLC, et al.[S...
2474    Grupo Mexicano de Desarrollo, S. A.[SEP]Allian...
2475    Peguero[SEP]United States[SEP]In 1992, the Dis...
2476    Immigration and Naturalization Service[SEP]St....
2477    Markman[SEP]Westview Instruments, Inc.[SEP]Her...
Length: 2478, dtype: object

In [16]:
type(df)

pandas.core.series.Series

In [17]:
df = pd.DataFrame(df)
df = pd.concat([df, data['first_party_winner']], axis=1)

In [18]:
df.columns = ['infos', 'label']

In [19]:
df

Unnamed: 0,infos,label
0,Phil A. St. Amant[SEP]Herman A. Thompson[SEP]O...,1
1,Stephen Duncan[SEP]Lawrence Owens[SEP]Ramon Ne...,0
2,"Billy Joe Magwood[SEP]Tony Patterson, Warden, ...",1
3,Linkletter[SEP]Walker[SEP]Victor Linkletter wa...,0
4,William Earl Fikes[SEP]Alabama[SEP]On April 24...,1
...,...,...
2473,"HollyFrontier Cheyenne Refining, LLC, et al.[S...",1
2474,"Grupo Mexicano de Desarrollo, S. A.[SEP]Allian...",1
2475,"Peguero[SEP]United States[SEP]In 1992, the Dis...",0
2476,Immigration and Naturalization Service[SEP]St....,0


In [20]:
df['label'] = df['label'].astype(str)

In [21]:
df

Unnamed: 0,infos,label
0,Phil A. St. Amant[SEP]Herman A. Thompson[SEP]O...,1
1,Stephen Duncan[SEP]Lawrence Owens[SEP]Ramon Ne...,0
2,"Billy Joe Magwood[SEP]Tony Patterson, Warden, ...",1
3,Linkletter[SEP]Walker[SEP]Victor Linkletter wa...,0
4,William Earl Fikes[SEP]Alabama[SEP]On April 24...,1
...,...,...
2473,"HollyFrontier Cheyenne Refining, LLC, et al.[S...",1
2474,"Grupo Mexicano de Desarrollo, S. A.[SEP]Allian...",1
2475,"Peguero[SEP]United States[SEP]In 1992, the Dis...",0
2476,Immigration and Naturalization Service[SEP]St....,0


In [22]:
def calc_accuracy(preds, labels):
    pred_flat = np.argmax(preds.cpu().numpy(), axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [23]:
df.iloc[:, 1].values

array(['1', '0', '1', ..., '0', '0', '0'], dtype=object)

In [215]:
def make_dataset(data, tokenizer, device):
    combined_text = (data['first_party'] + " [SEP] " + data['second_party'] + " [SEP] " + data['facts']).tolist()

    tokenizer = tokenizer(
        combined_text,
        padding = 'longest',
        truncation = True,
        return_tensors = 'pt'
    )
    input_ids = tokenized['input_ids']
    attention_mask = tokenized['attention_mask']
    labels = torch.tensor(data.iloc[:, 1].values, dtype = torch.long)
    return TensorDataset(input_ids, attention_mask, labels)

In [216]:
help(tokenizer)

Help on T5Tokenizer in module transformers.models.t5.tokenization_t5 object:

class T5Tokenizer(transformers.tokenization_utils.PreTrainedTokenizer)
 |  T5Tokenizer(vocab_file, eos_token='</s>', unk_token='<unk>', pad_token='<pad>', extra_ids=100, additional_special_tokens=None, sp_model_kwargs: Optional[Dict[str, Any]] = None, legacy=None, add_prefix_space=True, **kwargs) -> None
 |
 |  Construct a T5 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
 |
 |  This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
 |  this superclass for more information regarding those methods.
 |
 |  Args:
 |      vocab_file (`str`):
 |          [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
 |          contains the vocabulary necessary to instantiate a tokenizer.
 |      eos_token (`str`, *optional*, defaults to `"</s>"`):
 |          The end of sequence token

In [247]:
print(outputs.logits.shape) 
print(labels.shape)   

torch.Size([14, 128, 2])
torch.Size([14, 128])


In [217]:
data

Unnamed: 0,ID,first_party,second_party,facts,first_party_winner
0,TRAIN_0000,Phil A. St. Amant,Herman A. Thompson,"On June 27, 1962, Phil St. Amant, a candidate ...",1
1,TRAIN_0001,Stephen Duncan,Lawrence Owens,Ramon Nelson was riding his bike when he suffe...,0
2,TRAIN_0002,Billy Joe Magwood,"Tony Patterson, Warden, et al.",An Alabama state court convicted Billy Joe Mag...,1
3,TRAIN_0003,Linkletter,Walker,Victor Linkletter was convicted in state court...,0
4,TRAIN_0004,William Earl Fikes,Alabama,"On April 24, 1953 in Selma, Alabama, an intrud...",1
...,...,...,...,...,...
2473,TRAIN_2473,"HollyFrontier Cheyenne Refining, LLC, et al.","Renewable Fuels Association, et al.",Congress amended the Clean Air Act through the...,1
2474,TRAIN_2474,"Grupo Mexicano de Desarrollo, S. A.","Alliance Bond Fund, Inc.","Alliance Bond Fund, Inc., an investment fund, ...",1
2475,TRAIN_2475,Peguero,United States,"In 1992, the District Court sentenced Manuel D...",0
2476,TRAIN_2476,Immigration and Naturalization Service,St. Cyr,"On March 8, 1996, Enrico St. Cyr, a lawful per...",0


In [222]:
df.iloc[:, -1] = df.iloc[:, -1].replace('positive', 1).replace('negative', 0)

  df.iloc[:, -1] = df.iloc[:, -1].replace('positive', 1).replace('negative', 0)


In [223]:
df

Unnamed: 0,infos,label
0,Phil A. St. Amant[SEP]Herman A. Thompson[SEP]O...,1
1,Stephen Duncan[SEP]Lawrence Owens[SEP]Ramon Ne...,0
2,"Billy Joe Magwood[SEP]Tony Patterson, Warden, ...",1
3,Linkletter[SEP]Walker[SEP]Victor Linkletter wa...,0
4,William Earl Fikes[SEP]Alabama[SEP]On April 24...,1
...,...,...
2473,"HollyFrontier Cheyenne Refining, LLC, et al.[S...",1
2474,"Grupo Mexicano de Desarrollo, S. A.[SEP]Allian...",1
2475,"Peguero[SEP]United States[SEP]In 1992, the Dis...",0
2476,Immigration and Naturalization Service[SEP]St....,0


In [224]:
target = data.iloc[:, -1].astype(str)

In [225]:
device = 'cpu'

In [226]:
data.iloc[:, -1].astype(str).values

array(['1', '0', '1', ..., '0', '0', '0'], dtype=object)

In [227]:
target = data.iloc[:, -1].astype(str).values

In [228]:
def make_dataset(data, tokenizer, device):
    source = tokenizer(
        text = data.infos.tolist(),
        padding='max_length',
        max_length=128,
        pad_to_max_length=True,
        truncation=True,
        return_tensors='pt'
    )
    
    # target = tokenizer(
    #     text = data.label.tolist(),
    #     padding='max_length',
    #     max_length=128,
    #     pad_to_max_length= True,
    #     truncation = True,
    #     return_tensors = 'pt'
    # )

    # data.iloc[:, -1] = data.iloc[:, -1].astype(str)
    # data.iloc[:, -1] = data.iloc[:, -1].replace('1', 'positive').replace('0', 'negative')

    # target = tokenizer(
    #     text=data.label.tolist(),
    #     padding='max_length',
    #     max_length=128,
    #     truncation=True,
    #     return_tensors='pt'
    # )


    target = data.label.astype(int).values
    
    input_ids = source['input_ids'].squeeze().to(device)
    attention_mask = source['attention_mask'].squeeze().to(device)
    # labels = target['input_ids'].to(device)
    labels = torch.tensor(target).to(device)
    
    return TensorDataset(input_ids, attention_mask, labels)

In [229]:
tokenizer = T5Tokenizer.from_pretrained(
    pretrained_model_name_or_path= 't5-small'
)

In [230]:
df

Unnamed: 0,infos,label
0,Phil A. St. Amant[SEP]Herman A. Thompson[SEP]O...,1
1,Stephen Duncan[SEP]Lawrence Owens[SEP]Ramon Ne...,0
2,"Billy Joe Magwood[SEP]Tony Patterson, Warden, ...",1
3,Linkletter[SEP]Walker[SEP]Victor Linkletter wa...,0
4,William Earl Fikes[SEP]Alabama[SEP]On April 24...,1
...,...,...
2473,"HollyFrontier Cheyenne Refining, LLC, et al.[S...",1
2474,"Grupo Mexicano de Desarrollo, S. A.[SEP]Allian...",1
2475,"Peguero[SEP]United States[SEP]In 1992, the Dis...",0
2476,Immigration and Naturalization Service[SEP]St....,0


In [231]:
make_dataset(df, tokenizer, device='cpu')

<torch.utils.data.dataset.TensorDataset at 0x1b1ba809490>

In [232]:
def get_dataloader(dataset, sampler, batch_size):
    data_sampler = sampler(dataset)
    dataloader = DataLoader(dataset, sampler = data_sampler, batch_size = batch_size)
    return dataloader

In [233]:
train_data, valid, test = np.split(
    df.sample(frac=1, random_state=42), [int(0.6*len(df)), int(0.8*len(df))]
)

  return bound(*args, **kwds)


In [234]:
train_dataset = make_dataset(train_data, tokenizer, device='cpu')
train_dataloader = get_dataloader(train_dataset, RandomSampler, batch_size)

valid_dataset = make_dataset(valid, tokenizer, device='cpu')
valid_dataloader = get_dataloader(valid_dataset, RandomSampler, batch_size)

test_dataset = make_dataset(test, tokenizer, device='cpu')
test_dataloader = get_dataloader(test_dataset, RandomSampler, batch_size)

In [235]:
from torch import optim

In [236]:
optimizer = optim.Adam(model.parameters(), lr=1e-5, eps=1e-8)

In [237]:
def train(model, optimizer, dataloader):
    model.train()
    train_loss = 0.0
    
    for input_ids, attention_mask, labels in dataloader:
        outputs = model(
            input_ids = input_ids,
            attention_mask = attention_mask,
            labels = labels
        )
        
        loss = outputs.loss
        train_loss += loss.item()
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    train_loss = train_loss / len(dataloader)
    return train_loss

In [250]:
model

T5ForTokenClassification(
  (transformer): T5EncoderModel(
    (shared): Embedding(32128, 512)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 512)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=512, out_features=512, bias=False)
                (k): Linear(in_features=512, out_features=512, bias=False)
                (v): Linear(in_features=512, out_features=512, bias=False)
                (o): Linear(in_features=512, out_features=512, bias=False)
                (relative_attention_bias): Embedding(32, 8)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseActDense(
                (wi): Linear(in_features=512, out_features=2048, bias=False)
                (wo): Linear(in_f

In [242]:
def train(model, optimizer, dataloader):
    model.train()
    train_loss = 0.0

    for input_ids, attention_mask, labels in dataloader:
        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        
        loss = outputs.loss
        loss = loss.
        
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
    
    return train_loss / len(dataloader)

In [256]:
import torch
import torch.nn.functional as F

def train(model, optimizer, dataloader):
    model.train()
    total_loss = 0
    
    for batch in dataloader:
        input_ids = batch[0]
        attention_mask = batch[1]
        labels = batch[2]
        
        # Forward pass
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        
        # Compute loss
        logits = outputs.logits
        
        # Reshape logits and labels for CrossEntropyLoss
        logits = logits.view(-1, logits.size(-1))  # Shape: (batch_size * sequence_length, num_labels)
        labels = labels.view(-1)                    # Shape: (batch_size * sequence_length)
        
        loss_fct = torch.nn.CrossEntropyLoss()
        loss = loss_fct(logits, labels)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    return total_loss / len(dataloader)

In [257]:

def evaluation(model, dataloader):
    with torch.no_grad():
        model.eval()
        criterion = nn.CrossEntropyLoss()
        val_loss, val_accuracy = 0.0, 0.0
        
        for input_ids, attention_mask, labels in dataloader:
            outputs = model(
                input_ids = input_ids,
                attention_mask = attention_mask,
                labels = labels
            )
            logits = outputs.logits
            
            loss = criterion(logits, labels)
            logtis = logits.detach().cpu().numpy()
            labels_ids = labels.to('cpu').numpy()
            accuracy = calc_accuracy(logits, labels_ids)
            
            val_loss += loss
            val_accuracy += accuracy
            
        val_loss = val_loss/len(dataloader)
        val_accuracy = val_accuracy / len(dataloader)
        return val_loss, val_accuracy

In [253]:
16*128

2048

In [258]:
for batch in train_dataloader:
    print(type(batch))
    print(batch)
    break

<class 'list'>
[tensor([[ 6964,   157,  4173,  ...,     7,  4956,     1],
        [  695, 21919,  5083,  ..., 21086,  8395,     1],
        [ 4117,    71,     5,  ...,    37,   907,     1],
        ...,
        [ 2158,    15,  6306,  ...,   715,   120,     1],
        [ 5049,    35,    15,  ...,  6775,    13,     1],
        [21269,    77,   739,  ...,     7,  1357,     1]]), tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), tensor([0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1], dtype=torch.int32)]


In [260]:
best_loss = 10000
for epoch in range(epochs):
    train_loss = train(model, optimizer, train_dataloader)
    val_loss, val_accuracy = evaluation(model, valid_dataloader)
    print(f'Epoch: {epoch+1} train loss: {train_loss:.4f} val loss: {val_loss:.4f} val_acc: {val_accuracy:.4f}')
    
    if val_loss < best_loss:
        best_loss = val_loss

ValueError: Expected input batch_size (2048) to match target batch_size (16).

In [240]:
train_dataset[0]

(tensor([28431,   262,     5,  1140, 12019,  6306,   134,  8569,   908,   517,
             9,    63,    15,   283,     5,  1140, 12019,  6306,   134,  8569,
           908,  9236, 28431,   262,     5,  1140, 12019,    11, 20338,    15,
           283,     5,  1140, 12019,   130,  4464,    21,  1902,   203,   552,
            70,  5281,  3492,    16, 15393,     5,  9236,  1140, 12019,  1204,
           321,  1761,  5205,  6576,   726,    11,     6, 19890,   288,    12,
             3,     9,  4149,    13,    24,   726,     6, 10050,  1393,     5,
          3526,     8,  5645, 10816,  1799, 18263,  8927,  1074,     7,    22,
          8009,  1983,     6,  2716,  6576,  4364,   130,  1702,   573,   785,
            24,   130,    12,    36,  8807, 18756,   344,     8,  9911,     7,
             3,   390,    30,     8,  5996,    13,  2716,   313,  3032,   383,
             8,  5281,     5,  9236,  1140, 12019,  1380,     8,  1826, 19463,
          2243,    12, 11435,     8,  7759, 28717,  

In [241]:
train_dataset[3]

(tensor([10271,  6306,   134,  8569,   908, 13601,    17,   172,  6306,   134,
          8569,   908,   634,  4506,  9836, 14451,  1983,    13, 25745,    41,
         23312,    61,  2311,   334,  6152,    12,   726,   284,    13,   112,
          1652,  5908,    16, 12794,    42,    16,     8,   999,    13,  4968,
            21, 12794,     3,     9,  2559,  1781,   120, 11458,    11,     3,
             9,  1146,  1080,    21, 19829,     3,     9,  2411,   381,    13,
           716,   399,   471,     5,    37,  1983, 19678,     8,  2822,   789,
            42,   136,   538,   789,    42,  1827, 27444,    45,     8,  4903,
            13,    96,    15,  1167, 28014,   535,    86, 21018,     6,     8,
          1983,    47, 21012,    12,   560,  1652,    13,   136,  5399,  5908,
            16, 12794,    42,   999,    13, 12794,     6,   224,    38,     8,
          2986,    13,     3,     9,  2833,    42,   136,   286,    24,   124,
             7,    21,     8,  6802,     6,     3,  

In [246]:
len(train_dataset[3][0])

128

In [171]:
train_data

Unnamed: 0,infos,label
1753,Gerald E. Mansell[SEP]Gaye M. Mansell[SEP] Maj...,1
259,Freedman[SEP]Maryland[SEP]Maryland required th...,1
2072,"BMW of North America, Inc.[SEP]Gore[SEP]After ...",1
1000,Maryland[SEP]Wirtz[SEP]The Fair Labor Standard...,0
56,Randall D. White[SEP]State of Illinois[SEP]Dur...,0
...,...,...
1402,Zenith Radio Corporation[SEP]Hazeltine Researc...,1
2018,Blessing[SEP]Freestone[SEP]Cathy Freestone and...,1
2240,United States Trust Company of New York[SEP]Ne...,1
1964,"Commissioner of Internal Revenue, Philip D. Fa...",0


In [157]:
train_data.loc[209, 'infos']

"Bogan[SEP]Scott-Harris[SEP]Janet Scott-Harris filed suit under 42 U.S.C. Section 1983 against the city of Fall River, Massachusetts, the city's mayor, Daniel Bogan, the vice president of the city counsel, Marilyn Roderick, and others, alleging that the elimination of the city department in which Scott-Harris was the sole employee was motivated by a desire to retaliate against her for exercising her First Amendment rights. The jury found the city, Bogan and Roderick liable on the First Amendment claim. The First Circuit set aside the verdict against the city, but affirmed the judgments against Bogan and Roderick. The court held that although Bogan and Roderick had absolute immunity from civil liability for their performance of legitimate legislative activities, their conduct in voting for and signing the ordinance that eliminated Scott-Harris's office was motivated by considerations relating to a particular individual and was therefore administrative rather than legislative in nature.\