In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from sklearn.model_selection import train_test_split
from tqdm import tqdm  # Import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load dataset
train_data = pd.read_csv('merged_datatrain.csv')  # Replace with the actual path to your dataset

In [3]:
print(train_data.columns)

Index(['Unnamed: 0', 'Merged_Info', 'Entity_Value'], dtype='object')


In [4]:
# Preprocess the dataset
def preprocess_data(data):
    data['input_text'] = 'merged_info: ' + data['merged_info'].astype(str)
    data['target_text'] = data['entity_value'].astype(str)
    return data[['input_text', 'target_text']]

# Split data into train and validation sets
train_df, val_df = train_test_split(train_data, test_size=0.1, random_state=42)

# Define custom dataset
class OCRDataset(Dataset):
    def __init__(self, dataframe, tokenizer, source_max_len=512, target_max_len=20):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.source_max_len = source_max_len
        self.target_max_len = target_max_len

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        source_text = str(self.dataframe.iloc[index, 0])
        target_text = str(self.dataframe.iloc[index, 1])

        source = self.tokenizer.encode_plus(
            source_text,
            max_length=self.source_max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        target = self.tokenizer.encode_plus(
            target_text,
            max_length=self.target_max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        return {
            'source_ids': source['input_ids'].squeeze(),
            'source_mask': source['attention_mask'].squeeze(),
            'target_ids': target['input_ids'].squeeze(),
            'target_mask': target['attention_mask'].squeeze()
        }

# Initialize the tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")


spiece.model: 100%|██████████| 792k/792k [00:00<00:00, 1.33MB/s]
tokenizer.json: 100%|██████████| 1.39M/1.39M [00:00<00:00, 1.43MB/s]
config.json: 100%|██████████| 1.21k/1.21k [00:00<00:00, 509kB/s]
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as expla

In [5]:
# Create data loaders
def create_data_loader(dataframe, tokenizer, batch_size=8, max_len=512, target_max_len=128):
    dataset = OCRDataset(dataframe, tokenizer, max_len, target_max_len)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

train_loader = create_data_loader(train_df, tokenizer, batch_size=8)
val_loader = create_data_loader(val_df, tokenizer, batch_size=8)

# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=3e-5)

# Define training and validation function with tqdm
def train_epoch(model, data_loader, optimizer, device):
    model = model.train()
    losses = 0
    for batch in tqdm(data_loader, desc="Training", leave=False):
        input_ids = batch['source_ids'].to(device)
        attention_mask = batch['source_mask'].to(device)
        labels = batch['target_ids'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()

        optimizer.step()

        losses += loss.item()

    return losses / len(data_loader)

def eval_epoch(model, data_loader, device):
    model = model.eval()
    losses = 0
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating", leave=False):
            input_ids = batch['source_ids'].to(device)
            attention_mask = batch['source_mask'].to(device)
            labels = batch['target_ids'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            losses += loss.item()

    return losses / len(data_loader)



In [6]:
# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

epochs = 1  # You can adjust the number of epochs

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    train_loss = train_epoch(model, train_loader, optimizer, device)
    val_loss = eval_epoch(model, val_loader, device)

    print(f"Train loss: {train_loss:.4f}")
    print(f"Validation loss: {val_loss:.4f}")

# Save the model
model.save_pretrained('t5_ocr_model')
tokenizer.save_pretrained('t5_ocr_tokenizer')

print("Model and tokenizer saved.")

Epoch 1/1


                                                                   

Train loss: 1.7997
Validation loss: 1.6359
Model and tokenizer saved.


In [7]:
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the saved model and tokenizer
model = T5ForConditionalGeneration.from_pretrained('t5_ocr_model')
tokenizer = T5Tokenizer.from_pretrained('t5_ocr_tokenizer')

# Load the model onto the appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
# Load the test data
test_data = pd.read_csv('/home/ai23mtech14008/Amazon ML Challenge/Code/merged_datatest.csv')  # Replace with the actual path to your test dataset

In [11]:
# Function to predict entity values
def predict_entity_value(input_text, model, tokenizer, device, max_len=512, target_max_len=128):
    model.eval()

    # Tokenize the input text
    inputs = tokenizer.encode_plus(
        input_text,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_tensors="pt"
    )

    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Generate predictions
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=target_max_len,
            num_beams=5,
            early_stopping=True
        )

    # Decode the predictions
    predicted_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return predicted_text

In [15]:
from torch.nn import DataParallel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model1 = DataParallel(model, device_ids=[0, 1, 2])
model1.to(device)

DataParallel(
  (module): T5ForConditionalGeneration(
    (shared): Embedding(32128, 768)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 768)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=768, out_features=768, bias=False)
                (k): Linear(in_features=768, out_features=768, bias=False)
                (v): Linear(in_features=768, out_features=768, bias=False)
                (o): Linear(in_features=768, out_features=768, bias=False)
                (relative_attention_bias): Embedding(32, 12)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseActDense(
                (wi): Linear(in_features=768, out_features=3072, bias=False)
                (wo): Linear(in_featu

In [17]:
# Predict entity values for the test data

# Ensure there are no NaN values in the 'Merged_Info' column
test_data['Merged_Info'] = test_data['Merged_Info'].fillna('')

# Apply your function with a progress bar
tqdm.pandas()
test_data['entity_value'] = test_data['Merged_Info'].progress_apply(
    lambda x: predict_entity_value(x, model1.module, tokenizer, device)
)

  0%|          | 0/131187 [00:00<?, ?it/s]

  0%|          | 182/131187 [05:08<61:36:03,  1.69s/it]


KeyboardInterrupt: 

In [None]:

# Save the results to a new CSV file
test_data.to_csv('test_with_predictions.csv', index=False)

print("Predictions saved to 'test_with_predictions.csv'.")