## Load CSV

In [None]:
import pandas as pd
from transformers import BertTokenizer
import torch
from torch.utils.data import Dataset, DataLoader, random_split

# Locations of the input CSVs, relative to the working directory.
train_file, test_file, sample_file = (
    'data/train.csv',
    'data/test.csv',
    'data/sample.csv',
)

# NOTE: the train/test CSVs are read inside CustomDataLoader. For ad-hoc
# inspection, load a file with pd.read_csv(<path>) and cast its 'text'
# column to str before tokenizing.

## Dataset

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Args:
        df: pandas DataFrame with a 'text' column and, optionally, a
            'label' column (absent for unlabeled/test data).
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) that
            accepts ``return_tensors``, ``padding``, ``truncation`` and
            ``max_length`` and returns a mapping holding 'input_ids' and
            'attention_mask' tensors with a leading batch dimension of 1.
        max_length: Length every sequence is padded/truncated to.
    """

    def __init__(self, df, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.data = df
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the tokenized sample at positional ``index``.

        Returns:
            dict with 'input_ids' and 'attention_mask' (1-D tensors) and,
            only when the DataFrame has a 'label' column, 'labels'.
        """
        # Positional lookup: the frame's index labels need not be 0..n-1
        # (e.g. after shuffling or slicing without reset_index).
        # str() guards against non-string cells such as NaN.
        text = str(self.data['text'].iloc[index])

        encoded = self.tokenizer(
            text,
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
        )

        # Drop the batch dimension added by return_tensors='pt'.
        item = {
            'input_ids': encoded['input_ids'][0],
            'attention_mask': encoded['attention_mask'][0],
        }

        # The target comes from the DataFrame, not from the tokenizer output.
        # The key is omitted (rather than set to None) for unlabeled data
        # because the default DataLoader collate cannot batch None values.
        if 'label' in self.data.columns:
            item['labels'] = torch.as_tensor(self.data['label'].iloc[index])

        return item

## DataLoader

In [None]:
def CustomDataLoader(train_file, test_file, tokenizer, batch_size, valid_ratio):
    """Build train, validation and test DataLoaders from two CSV files.

    Args:
        train_file: Path to the training CSV; split into train/validation.
        test_file: Path to the test CSV.
        tokenizer: Tokenizer handed to each CustomDataset.
        batch_size: Batch size used by all three loaders.
        valid_ratio: Fraction of the training rows held out for validation.

    Returns:
        Tuple ``(train_loader, valid_loader, test_loader)``.
    """
    # load train data and shuffle rows reproducibly
    train_df = pd.read_csv(train_file)
    train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)

    # Split the shuffled frame by position. torch's random_split would return
    # Subset objects, which do not support the column access CustomDataset
    # performs on its data; slicing keeps both halves as DataFrames.
    num_train = int(len(train_df) * (1 - valid_ratio))
    train_set = train_df.iloc[:num_train].reset_index(drop=True)
    valid_set = train_df.iloc[num_train:].reset_index(drop=True)

    # load test data
    test_df = pd.read_csv(test_file)

    # create data loaders; only the training loader reshuffles each epoch
    train_loader = DataLoader(CustomDataset(train_set, tokenizer), batch_size=batch_size, shuffle=True, pin_memory=True)
    valid_loader = DataLoader(CustomDataset(valid_set, tokenizer), batch_size=batch_size, shuffle=False, pin_memory=True)
    test_loader = DataLoader(CustomDataset(test_df, tokenizer), batch_size=batch_size, shuffle=False, pin_memory=True)

    return train_loader, valid_loader, test_loader

In [None]:
# Tokenizer for the pretrained 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Loader settings: rows per batch, and the fraction of training rows
# held out for validation
batch_size, valid_ratio = 16, 0.2

train_loader, valid_loader, test_loader = CustomDataLoader(
    train_file=train_file,
    test_file=test_file,
    tokenizer=tokenizer,
    batch_size=batch_size,
    valid_ratio=valid_ratio,
)