In [None]:
!pip install transformers



In [None]:
import os

# These variables are picked up when CUDA is initialised, so this cell has to run
# before torch first touches the GPU.
#
# PYTORCH_CUDA_ALLOC_CONF takes comma-separated `option:value` pairs; a bare '0' is
# not a valid setting. The split-size cap that was previously written to the
# unrecognised CUDA_MAX_SPLIT_SIZE_MB variable belongs here as `max_split_size_mb`.
# NOTE(review): the allocator appears to reject values of 20 MB or less, hence 32
# rather than 16 - confirm against the installed PyTorch version.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:32'
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'  # Enable synchronous GPU execution for error debugging
os.environ['NCCL_DEBUG'] = 'INFO'  # Enable NCCL debug information for error debugging


In [None]:
import os

# The CUDA caching allocator is configured through PYTORCH_CUDA_ALLOC_CONF, given as
# comma-separated `option:value` pairs. TORCH_CUDA_ALLOC_CONF is not one of the
# variables PyTorch documents, and '0' is not a valid `option:value` setting.
# NOTE(review): 32 MB is an assumed starting point for the split-size cap - tune it,
# or remove this cell if allocator fragmentation turns out not to be the problem.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:32'

In [None]:
!pip uninstall torch torchvision
!pip install torch torchvision

Found existing installation: torch 2.0.1+cu118
Uninstalling torch-2.0.1+cu118:
  Would remove:
    /usr/local/bin/convert-caffe2-to-onnx
    /usr/local/bin/convert-onnx-to-caffe2
    /usr/local/bin/torchrun
    /usr/local/lib/python3.10/dist-packages/functorch/*
    /usr/local/lib/python3.10/dist-packages/nvfuser/*
    /usr/local/lib/python3.10/dist-packages/torch-2.0.1+cu118.dist-info/*
    /usr/local/lib/python3.10/dist-packages/torch/*
    /usr/local/lib/python3.10/dist-packages/torchgen/*
Proceed (Y/n)? Y
Y
Y
Y
  Successfully uninstalled torch-2.0.1+cu118
[31mERROR: Operation cancelled by user[0m[31m
[0mCollecting torch
  Downloading torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl (619.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m619.9/619.9 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu11==11.7.99 (from torch)
  Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl (21.0 MB)
[2K     [90m━━━━━━━━━━━━━━

In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import BertTokenizer, BertModel
import numpy as np

# Load the tab-separated training data into a pandas DataFrame
data3 = pd.read_csv("/content/train_v2.csv", sep='\t')

# Fail early with a readable message if the columns used further down are absent
# (for example because the file is not actually tab-separated).
missing_columns = {'text', 'label'} - set(data3.columns)
if missing_columns:
    raise ValueError(f"train_v2.csv is missing expected column(s): {sorted(missing_columns)}")

# Preprocessing and tokenization using BERT tokenizer.
# 'bert-base-cased' was trained on case-preserved text, so the tokenizer must not
# lower-case its input: do_lower_case=True would feed the model word pieces that do
# not match the vocabulary usage it was pre-trained with.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

def preprocess_and_tokenize(text):
    """Encode a single text with the module-level ``tokenizer``.

    Special tokens are added and the encoded sequence is truncated to at most
    128 entries. The encoded ids are returned exactly as ``tokenizer.encode``
    produces them.
    """
    encode_options = dict(add_special_tokens=True, truncation=True, max_length=128)
    token_ids = tokenizer.encode(text, **encode_options)
    return token_ids

# Tokenise every row once and keep the id lists on the frame
data3['tokenized_text'] = data3['text'].apply(preprocess_and_tokenize)

# Padding: extend each id list with trailing zeros up to the longest sequence
max_length = max(len(token_ids) for token_ids in data3['tokenized_text'])
data3['padded_tokenized_text'] = data3['tokenized_text'].apply(
    # Appending max_length zeros and cutting back to max_length leaves exactly
    # the original ids followed by the zeros needed to fill the row.
    lambda token_ids: (token_ids + [0] * max_length)[:max_length]
)

# Convert to tensors; every non-zero position is marked 1.0 in the mask
input_ids = torch.tensor(list(data3['padded_tokenized_text']))
attention_masks = input_ids.ne(0).float()

label_mapping = {'HOF': 1, 'NOT': 0}  # Replace with your label mapping
_known_label_ids = set(label_mapping.values())


def _encode_label(raw_label):
    """Return the integer class id for one raw label, raising on anything unexpected."""
    if raw_label in label_mapping:
        return label_mapping[raw_label]
    # Values that are already class ids are what this cell sees when it is re-run
    # on a DataFrame whose 'label' column it has encoded before.
    if raw_label in _known_label_ids:
        return int(raw_label)
    raise ValueError(
        f"Unexpected label {raw_label!r}; expected one of {sorted(label_mapping)}"
    )


# Series.map() would silently turn unknown (or already encoded) labels into NaN,
# which cannot be represented in an integer tensor; validate while encoding instead.
data3['label'] = data3['label'].apply(_encode_label).astype('int64')
labels = torch.tensor(data3['label'].to_numpy(), dtype=torch.long)

# Train-Test Split: hold out 20% of the rows, with a fixed seed so the split repeats
(
    train_inputs, test_inputs,
    train_masks, test_masks,
    train_labels, test_labels,
) = train_test_split(
    input_ids,
    attention_masks,
    labels,
    test_size=0.2,
    random_state=42,
)

# Create DataLoader: shuffled mini-batches of (input ids, attention mask, label)
batch_size = 4
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)

# Use the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load pre-trained BERT model and place its weights on the chosen device
bert_model = BertModel.from_pretrained('bert-base-cased')
bert_model = bert_model.to(device)

# Get BERT embeddings
def get_bert_embeddings(input_ids, attention_masks, batch_size=32):
    """Encode token ids with the module-level ``bert_model`` into one vector per row.

    The rows are pushed through the model in slices of ``batch_size`` so that peak
    memory is bounded by a single slice rather than by the whole input tensor.
    Gradients are disabled for the forward passes. Each returned vector is the
    average of the last hidden states weighted by the attention mask, so padded
    positions (mask 0) do not dilute the average.

    Args:
        input_ids: 2-D tensor of token ids, one row per sequence.
        attention_masks: tensor of the same shape as ``input_ids``; non-zero marks
            positions that should contribute.
        batch_size: number of rows per forward pass (must be >= 1).

    Returns:
        Tensor with one row per input row, located on ``device``.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    pooled_chunks = []
    with torch.no_grad():
        for start in range(0, input_ids.size(0), batch_size):
            inputs = input_ids[start:start + batch_size].to(device)
            masks = attention_masks[start:start + batch_size].to(device)
            hidden_states = bert_model(inputs, attention_mask=masks).last_hidden_state
            # Broadcast the mask over the hidden dimension and average only over
            # the positions it selects; clamp guards an all-padding row against 0/0.
            weights = masks.unsqueeze(-1).to(hidden_states.dtype)
            token_counts = weights.sum(dim=1).clamp(min=1)
            pooled_chunks.append((hidden_states * weights).sum(dim=1) / token_counts)

    # Release cached blocks once per call, after all slices have been processed
    torch.cuda.empty_cache()

    if not pooled_chunks:
        # No rows in: return an empty result with the model's embedding width
        return torch.empty(0, bert_model.config.hidden_size, device=device)
    return torch.cat(pooled_chunks, dim=0)

# Calculate BERT embeddings for both splits: training rows first, then held-out rows
train_embeddings = get_bert_embeddings(
    input_ids=train_inputs,
    attention_masks=train_masks,
)
test_embeddings = get_bert_embeddings(
    input_ids=test_inputs,
    attention_masks=test_masks,
)

class SimpleClassifier(torch.nn.Module):
    """Two-layer feed-forward head: linear -> ReLU -> linear.

    ``forward`` returns the raw output of the second linear layer; no softmax
    or other normalisation is applied.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # fc1 is created before fc2, so seeded weight initialisation is unchanged
        self.fc1 = torch.nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.fc2 = torch.nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        hidden = torch.nn.functional.relu(self.fc1(x))
        return self.fc2(hidden)

# Seed PyTorch's global RNG so the classifier's initial weights (and any later
# shuffling that draws from the same generator) repeat from run to run
torch.manual_seed(42)

# Instantiate the classifier model
input_dim = 768  # Correct input dimension for BERT embeddings
hidden_dim = 64
# One output per class id defined by the mapping. Counting the unique values that
# happen to be in the data miscounts when a class is absent from the file or when a
# label failed to map (a NaN would be counted as a class of its own).
output_dim = len(set(label_mapping.values()))
model = SimpleClassifier(input_dim, hidden_dim, output_dim)

# Convert the model's parameters to the appropriate data type
model = model.to(torch.float32)

# Define loss function and optimizer
criterion = torch.nn.CrossEntropyLoss()
# NOTE(review): 1e-5 is a very small step size for a freshly initialised head trained
# for a handful of epochs - confirm this is intended rather than a fine-tuning default.
optimizer = torch.optim.Adam(model.parameters(), lr=0.00001)

# The classifier decides where its inputs have to live. Reading the device from its
# parameters keeps embeddings, labels and weights together wherever the model was
# placed; feeding it tensors from another device raises a device-mismatch error.
classifier_device = next(model.parameters()).device

# BERT is only ever run without gradients, so its embeddings do not change between
# epochs: train on the vectors computed once above instead of re-encoding every
# batch on every epoch.
embedding_data = TensorDataset(
    train_embeddings.to(classifier_device),
    train_labels.to(classifier_device),
)
embedding_loader = DataLoader(embedding_data, batch_size=batch_size, shuffle=True)

# Train the model
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for embeddings_batch, labels_batch in embedding_loader:
        optimizer.zero_grad()
        outputs = model(embeddings_batch)
        loss = criterion(outputs, labels_batch)
        loss.backward()
        optimizer.step()
        # Weight by batch size so a smaller final batch does not skew the mean
        running_loss += loss.item() * labels_batch.size(0)
    mean_loss = running_loss / max(len(embedding_data), 1)
    print(f'Epoch {epoch + 1}/{num_epochs} - mean training loss: {mean_loss:.4f}')

# Evaluate the model
model.eval()
with torch.no_grad():
    test_outputs = model(test_embeddings.to(classifier_device))
    # Bring predictions back to host memory: .numpy() only works on CPU tensors
    predicted_labels = test_outputs.argmax(dim=1).cpu()

# Calculate metrics
y_true = test_labels.numpy()
y_pred = predicted_labels.numpy()
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')
f1 = f1_score(y_true, y_pred, average='weighted')

print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1-Score: {f1:.4f}')