In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = 'max_split_size_mb:512'
# Load the data set
file_loc_train = 'sample_data/train.csv'
file_loc_test = 'sample_data/test.csv'
df = pd.read_csv(file_loc_train)
df_test = pd.read_csv(file_loc_test)

# Split the data into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [2]:
from transformers import BertTokenizer

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the data
def tokenize_data(df, tokenizer, max_length=128):
    return tokenizer(
        df['text'].tolist(),
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )

train_encodings = tokenize_data(train_df, tokenizer)
val_encodings = tokenize_data(val_df, tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [3]:
import torch
from torch.utils.data import Dataset

class DisasterDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Convert labels to a tensor
train_labels = torch.tensor(train_df['target'].tolist())
val_labels = torch.tensor(val_df['target'].tolist())

# Create datasets
train_dataset = DisasterDataset(train_encodings, train_labels)
val_dataset = DisasterDataset(val_encodings, val_labels)

In [4]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# Load the BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",      # evaluate every `eval_steps` steps
    eval_steps=1000,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train the model
trainer.train()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


Step,Training Loss,Validation Loss


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


TrainOutput(global_step=762, training_loss=0.39601052808636444, metrics={'train_runtime': 307.5177, 'train_samples_per_second': 59.411, 'train_steps_per_second': 2.478, 'total_flos': 788654832890400.0, 'train_loss': 0.39601052808636444, 'epoch': 3.0})

In [None]:
eval_results = trainer.evaluate()
#print(f"Validation Accuracy: {eval_results['eval_accuracy']}")
eval_results

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


{'eval_loss': 0.45519986748695374,
 'eval_runtime': 5.1811,
 'eval_samples_per_second': 293.955,
 'eval_steps_per_second': 12.353,
 'epoch': 3.0}

In [None]:
test = list(df_test['text'])
test_encodings = tokenizer(test, padding=True, truncation=True, max_length=128, return_tensors='pt')

# Model prediction
outputs = model(**test_encodings)
predictions = torch.argmax(outputs.logits, dim=1)

# Output prediction results
print(predictions)

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
torch.cuda.is_available()

In [7]:
# Check if GPU is available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Move the model to the appropriate device
model.to(device)

# Tokenize the test data in batches to reduce memory usage
batch_size = 16 # Experiment with different batch sizes
predictions = []
for i in range(0, len(df_test['text']), batch_size):
    test_batch = list(df_test['text'][i:i+batch_size])
    test_encodings = tokenizer(test_batch, padding=True, truncation=True, max_length=128, return_tensors='pt')

    # Move the input tensors to the same device as the model
    test_encodings = {key: val.to(device) for key, val in test_encodings.items()}

    # Model prediction
    with torch.no_grad(): # Use torch.no_grad() to reduce memory usage during inference
        outputs = model(**test_encodings)

    # Get the predicted classes
    batch_predictions = torch.argmax(outputs.logits, dim=1)
    predictions.extend(batch_predictions.cpu().numpy()) # Move predictions to CPU and convert to numpy array

# Output prediction results
print(predictions)

[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [8]:
df_submit = pd.DataFrame({'id': df_test['id'], 'target': predictions})
df_submit.to_csv('submission_07_09_2024.csv', index=False)
df_submit.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [None]:
"""with batch = 8 eval_loss': 0.6313741207122803
   with batch = 16 eval_loss': 0.5087562203407288
   with batch = 24 'eval_loss': 0.4494378864765167
   with batch = 32 it get worse"""