In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm
from sklearn.metrics import accuracy_score

# Load data
train_df = pd.read_csv("/content/drive/MyDrive/llm workshop/train.csv")
test_df = pd.read_csv("/content/drive/MyDrive/llm workshop/test.csv")

# Preprocess data
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def preprocess_text(text):
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=64,  # Adjust this parameter to your desired maximum length
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
    )

    # Handle sequences longer than max_length by truncating
    if len(encoding['input_ids'][0]) > 64:
        encoding['input_ids'] = encoding['input_ids'][:, :64]
        encoding['attention_mask'] = encoding['attention_mask'][:, :64]

    return encoding


train_texts = train_df['text'].values
train_labels = train_df['target'].values
test_texts = test_df['text'].values

train_encodings = [preprocess_text(text) for text in train_texts]
test_encodings = [preprocess_text(text) for text in test_texts]

train_input_ids = torch.cat([enc['input_ids'] for enc in train_encodings], dim=0)
train_attention_masks = torch.cat([enc['attention_mask'] for enc in train_encodings], dim=0)
train_labels = torch.tensor(train_labels)

test_input_ids = torch.cat([enc['input_ids'] for enc in test_encodings], dim=0)
test_attention_masks = torch.cat([enc['attention_mask'] for enc in test_encodings], dim=0)

# Split data into train and validation sets
train_inputs, val_inputs, train_masks, val_masks, train_labels, val_labels = train_test_split(
    train_input_ids, train_attention_masks, train_labels,
    random_state=42, test_size=0.1
)

# Create data loaders
batch_size = 30

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Load pre-trained DistilBERT model
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False
)

# Set up GPU/CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Set up optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Training loop
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader, desc="Epoch {}".format(epoch + 1)):
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    print("Average training loss: {}".format(total_loss / len(train_dataloader)))

# Validation loop
model.eval()
val_predictions = []
val_true_labels = []

for batch in tqdm(val_dataloader, desc="Validation"):
    batch = tuple(t.to(device) for t in batch)
    inputs = {'input_ids': batch[0],
              'attention_mask': batch[1],
              'labels': batch[2]}
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    val_predictions.extend(torch.argmax(logits, axis=1).tolist())
    val_true_labels.extend(inputs['labels'].tolist())

# Calculate accuracy
val_accuracy = accuracy_score(val_true_labels, val_predictions)
print("Validation Accuracy: {:.2f}%".format(val_accuracy * 100))

# Make predictions on test set
model.eval()
test_predictions = []

for i in tqdm(range(0, len(test_input_ids), batch_size), desc="Test Prediction"):
    batch_input_ids = test_input_ids[i:i+batch_size].to(device)
    batch_attention_masks = test_attention_masks[i:i+batch_size].to(device)
    with torch.no_grad():
        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_masks)
    logits = outputs.logits
    batch_predictions = torch.argmax(logits, axis=1).tolist()
    test_predictions.extend(batch_predictions)




Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 160/160 [00:25<00:00,  6.22it/s]


Average training loss: 0.4425696157850325


Epoch 2: 100%|██████████| 160/160 [00:26<00:00,  6.15it/s]


Average training loss: 0.3105613137129694


Epoch 3: 100%|██████████| 160/160 [00:25<00:00,  6.20it/s]


Average training loss: 0.21648280140943826


Validation: 100%|██████████| 18/18 [00:01<00:00, 16.95it/s]


Validation Accuracy: 79.36%


Test Prediction: 100%|██████████| 77/77 [00:04<00:00, 18.10it/s]


albert


In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm
from sklearn.metrics import accuracy_score

# Load data
train_df = pd.read_csv("/content/drive/MyDrive/llm workshop/train.csv")
test_df = pd.read_csv("/content/drive/MyDrive/llm workshop/test.csv")

# Preprocess data
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')

def preprocess_text(text):
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=64,  # Adjust this parameter to your desired maximum length
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
    )

    # Handle sequences longer than max_length by truncating
    if len(encoding['input_ids'][0]) > 64:
        encoding['input_ids'] = encoding['input_ids'][:, :64]
        encoding['attention_mask'] = encoding['attention_mask'][:, :64]

    return encoding

train_texts = train_df['text'].values
train_labels = train_df['target'].values
test_texts = test_df['text'].values

train_encodings = [preprocess_text(text) for text in train_texts]
test_encodings = [preprocess_text(text) for text in test_texts]

train_input_ids = torch.cat([enc['input_ids'] for enc in train_encodings], dim=0)
train_attention_masks = torch.cat([enc['attention_mask'] for enc in train_encodings], dim=0)
train_labels = torch.tensor(train_labels)

test_input_ids = torch.cat([enc['input_ids'] for enc in test_encodings], dim=0)
test_attention_masks = torch.cat([enc['attention_mask'] for enc in test_encodings], dim=0)

# Split data into train and validation sets
train_inputs, val_inputs, train_masks, val_masks, train_labels, val_labels = train_test_split(
    train_input_ids, train_attention_masks, train_labels,
    random_state=42, test_size=0.1
)

# Create data loaders
batch_size = 30

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Load pre-trained ALBERT model
model = AlbertForSequenceClassification.from_pretrained(
    "albert-base-v2",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False
)

# Set up GPU/CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Set up optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Training loop
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader, desc="Epoch {}".format(epoch + 1)):
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    print("Average training loss: {}".format(total_loss / len(train_dataloader)))

# Validation loop
model.eval()
val_predictions = []
val_true_labels = []

for batch in tqdm(val_dataloader, desc="Validation"):
    batch = tuple(t.to(device) for t in batch)
    inputs = {'input_ids': batch[0],
              'attention_mask': batch[1],
              'labels': batch[2]}
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    val_predictions.extend(torch.argmax(logits, axis=1).tolist())
    val_true_labels.extend(inputs['labels'].tolist())

# Calculate accuracy
val_accuracy = accuracy_score(val_true_labels, val_predictions)
print("Validation Accuracy: {:.2f}%".format(val_accuracy * 100))

# Make predictions on test set
model.eval()
test_predictions = []

for i in tqdm(range(0, len(test_input_ids), batch_size), desc="Test Prediction"):
    batch_input_ids = test_input_ids[i:i+batch_size].to(device)
    batch_attention_masks = test_attention_masks[i:i+batch_size].to(device)
    with torch.no_grad():
        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_masks)
    logits = outputs.logits
    batch_predictions = torch.argmax(logits, axis=1).tolist()
    test_predictions.extend(batch_predictions)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 160/160 [00:49<00:00,  3.21it/s]


Average training loss: 0.4712529475800693


Epoch 2: 100%|██████████| 160/160 [00:52<00:00,  3.06it/s]


Average training loss: 0.3983704999089241


Epoch 3: 100%|██████████| 160/160 [00:53<00:00,  3.02it/s]


Average training loss: 0.3684058860410005


Validation: 100%|██████████| 18/18 [00:02<00:00,  7.81it/s]


Validation Accuracy: 83.68%


Test Prediction: 100%|██████████| 77/77 [00:09<00:00,  8.21it/s]


deberta


In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import DebertaTokenizer, DebertaForSequenceClassification, AdamW
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm
from sklearn.metrics import accuracy_score

# Load data
train_df = pd.read_csv("/content/drive/MyDrive/llm workshop/train.csv")
test_df = pd.read_csv("/content/drive/MyDrive/llm workshop/test.csv")

# Preprocess data
tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')

def preprocess_text(text):
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=64,  # Adjust this parameter to your desired maximum length
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
    )

    # Handle sequences longer than max_length by truncating
    if len(encoding['input_ids'][0]) > 64:
        encoding['input_ids'] = encoding['input_ids'][:, :64]
        encoding['attention_mask'] = encoding['attention_mask'][:, :64]

    return encoding

train_texts = train_df['text'].values
train_labels = train_df['target'].values
test_texts = test_df['text'].values

train_encodings = [preprocess_text(text) for text in train_texts]
test_encodings = [preprocess_text(text) for text in test_texts]

train_input_ids = torch.cat([enc['input_ids'] for enc in train_encodings], dim=0)
train_attention_masks = torch.cat([enc['attention_mask'] for enc in train_encodings], dim=0)
train_labels = torch.tensor(train_labels)

test_input_ids = torch.cat([enc['input_ids'] for enc in test_encodings], dim=0)
test_attention_masks = torch.cat([enc['attention_mask'] for enc in test_encodings], dim=0)

# Split data into train and validation sets
train_inputs, val_inputs, train_masks, val_masks, train_labels, val_labels = train_test_split(
    train_input_ids, train_attention_masks, train_labels,
    random_state=42, test_size=0.1
)

# Create data loaders
batch_size = 30

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Load pre-trained DeBERTa model
model = DebertaForSequenceClassification.from_pretrained(
    "microsoft/deberta-base",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False
)

# Set up GPU/CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Set up optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Training loop
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader, desc="Epoch {}".format(epoch + 1)):
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    print("Average training loss: {}".format(total_loss / len(train_dataloader)))

# Validation loop
model.eval()
val_predictions = []
val_true_labels = []

for batch in tqdm(val_dataloader, desc="Validation"):
    batch = tuple(t.to(device) for t in batch)
    inputs = {'input_ids': batch[0],
              'attention_mask': batch[1],
              'labels': batch[2]}
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    val_predictions.extend(torch.argmax(logits, axis=1).tolist())
    val_true_labels.extend(inputs['labels'].tolist())

# Calculate accuracy
val_accuracy = accuracy_score(val_true_labels, val_predictions)
print("Validation Accuracy: {:.2f}%".format(val_accuracy * 100))

# Make predictions on test set
model.eval()
test_predictions = []

for i in tqdm(range(0, len(test_input_ids), batch_size), desc="Test Prediction"):
    batch_input_ids = test_input_ids[i:i+batch_size].to(device)
    batch_attention_masks = test_attention_masks[i:i+batch_size].to(device)
    with torch.no_grad():
        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_masks)
    logits = outputs.logits
    batch_predictions = torch.argmax(logits, axis=1).tolist()
    test_predictions.extend(batch_predictions)


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/559M [00:00<?, ?B/s]

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 160/160 [01:00<00:00,  2.65it/s]


Average training loss: 0.48151870593428614


Epoch 2: 100%|██████████| 160/160 [01:02<00:00,  2.54it/s]


Average training loss: 0.33375137485563755


Epoch 3: 100%|██████████| 160/160 [01:02<00:00,  2.54it/s]


Average training loss: 0.2698024996323511


Validation: 100%|██████████| 18/18 [00:02<00:00,  8.27it/s]


Validation Accuracy: 81.61%


Test Prediction: 100%|██████████| 77/77 [00:09<00:00,  8.29it/s]


electra

In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import ElectraTokenizer, ElectraForSequenceClassification, AdamW
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm
from sklearn.metrics import accuracy_score

# Load data
train_df = pd.read_csv("/content/drive/MyDrive/llm workshop/train.csv")
test_df = pd.read_csv("/content/drive/MyDrive/llm workshop/test.csv")

# Preprocess data
tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')

def preprocess_text(text):
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=64,  # Adjust this parameter to your desired maximum length
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
    )

    # Handle sequences longer than max_length by truncating
    if len(encoding['input_ids'][0]) > 64:
        encoding['input_ids'] = encoding['input_ids'][:, :64]
        encoding['attention_mask'] = encoding['attention_mask'][:, :64]

    return encoding

train_texts = train_df['text'].values
train_labels = train_df['target'].values
test_texts = test_df['text'].values

train_encodings = [preprocess_text(text) for text in train_texts]
test_encodings = [preprocess_text(text) for text in test_texts]

train_input_ids = torch.cat([enc['input_ids'] for enc in train_encodings], dim=0)
train_attention_masks = torch.cat([enc['attention_mask'] for enc in train_encodings], dim=0)
train_labels = torch.tensor(train_labels)

test_input_ids = torch.cat([enc['input_ids'] for enc in test_encodings], dim=0)
test_attention_masks = torch.cat([enc['attention_mask'] for enc in test_encodings], dim=0)

# Split data into train and validation sets
train_inputs, val_inputs, train_masks, val_masks, train_labels, val_labels = train_test_split(
    train_input_ids, train_attention_masks, train_labels,
    random_state=42, test_size=0.1
)

# Create data loaders
batch_size = 28

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Load pre-trained Electra model
model = ElectraForSequenceClassification.from_pretrained(
    "google/electra-base-discriminator",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False
)

# Set up GPU/CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Set up optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Training loop
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader, desc="Epoch {}".format(epoch + 1)):
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    print("Average training loss: {}".format(total_loss / len(train_dataloader)))

# Validation loop
model.eval()
val_predictions = []
val_true_labels = []

for batch in tqdm(val_dataloader, desc="Validation"):
    batch = tuple(t.to(device) for t in batch)
    inputs = {'input_ids': batch[0],
              'attention_mask': batch[1],
              'labels': batch[2]}
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    val_predictions.extend(torch.argmax(logits, axis=1).tolist())
    val_true_labels.extend(inputs['labels'].tolist())

# Calculate accuracy
val_accuracy = accuracy_score(val_true_labels, val_predictions)
print("Validation Accuracy: {:.2f}%".format(val_accuracy * 100))

# Make predictions on test set
model.eval()
test_predictions = []

for i in tqdm(range(0, len(test_input_ids), batch_size), desc="Test Prediction"):
    batch_input_ids = test_input_ids[i:i+batch_size].to(device)
    batch_attention_masks = test_attention_masks[i:i+batch_size].to(device)
    with torch.no_grad():
        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_masks)
    logits = outputs.logits
    batch_predictions = torch.argmax(logits, axis=1).tolist()
    test_predictions.extend(batch_predictions)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 172/172 [00:50<00:00,  3.41it/s]


Average training loss: 0.48683822999686693


Epoch 2: 100%|██████████| 172/172 [00:53<00:00,  3.22it/s]


Average training loss: 0.35504134887352934


Epoch 3: 100%|██████████| 172/172 [00:52<00:00,  3.25it/s]


Average training loss: 0.2760889900111875


Validation: 100%|██████████| 20/20 [00:02<00:00,  9.31it/s]


Validation Accuracy: 80.49%


Test Prediction: 100%|██████████| 82/82 [00:09<00:00,  9.04it/s]


In [16]:
# Save predictions
test_df['target'] = test_predictions
test_df[['id', 'target']].to_csv("/content/drive/MyDrive/predictions_electra.csv", index=False)

xlnet


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import XLNetTokenizer, XLNetForSequenceClassification, AdamW
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm
from sklearn.metrics import accuracy_score

# Load data
train_df = pd.read_csv("/content/drive/MyDrive/llm workshop/train.csv")
test_df = pd.read_csv("/content/drive/MyDrive/llm workshop/test.csv")

# Preprocess data
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

def preprocess_text(text):
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=64,  # Adjust this parameter to your desired maximum length
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
    )

    # Handle sequences longer than max_length by truncating
    if len(encoding['input_ids'][0]) > 64:
        encoding['input_ids'] = encoding['input_ids'][:, :64]
        encoding['attention_mask'] = encoding['attention_mask'][:, :64]

    return encoding

train_texts = train_df['text'].values
train_labels = train_df['target'].values
test_texts = test_df['text'].values

train_encodings = [preprocess_text(text) for text in train_texts]
test_encodings = [preprocess_text(text) for text in test_texts]

train_input_ids = torch.cat([enc['input_ids'] for enc in train_encodings], dim=0)
train_attention_masks = torch.cat([enc['attention_mask'] for enc in train_encodings], dim=0)
train_labels = torch.tensor(train_labels)

test_input_ids = torch.cat([enc['input_ids'] for enc in test_encodings], dim=0)
test_attention_masks = torch.cat([enc['attention_mask'] for enc in test_encodings], dim=0)

# Split data into train and validation sets
train_inputs, val_inputs, train_masks, val_masks, train_labels, val_labels = train_test_split(
    train_input_ids, train_attention_masks, train_labels,
    random_state=42, test_size=0.1
)

# Create data loaders
batch_size = 30

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Load pre-trained XLNet model
model = XLNetForSequenceClassification.from_pretrained(
    "xlnet-base-cased",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False
)

# Set up GPU/CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Set up optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Training loop
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader, desc="Epoch {}".format(epoch + 1)):
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    print("Average training loss: {}".format(total_loss / len(train_dataloader)))

# Validation loop
model.eval()
val_predictions = []
val_true_labels = []

for batch in tqdm(val_dataloader, desc="Validation"):
    batch = tuple(t.to(device) for t in batch)
    inputs = {'input_ids': batch[0],
              'attention_mask': batch[1],
              'labels': batch[2]}
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    val_predictions.extend(torch.argmax(logits, axis=1).tolist())
    val_true_labels.extend(inputs['labels'].tolist())

# Calculate accuracy
val_accuracy = accuracy_score(val_true_labels, val_predictions)
print("Validation Accuracy: {:.2f}%".format(val_accuracy * 100))

# Make predictions on test set
model.eval()
test_predictions = []

for i in tqdm(range(0, len(test_input_ids), batch_size), desc="Test Prediction"):
    batch_input_ids = test_input_ids[i:i+batch_size].to(device)
    batch_attention_masks = test_attention_masks[i:i+batch_size].to(device)
    with torch.no_grad():
        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_masks)
    logits = outputs.logits
    batch_predictions = torch.argmax(logits, axis=1).tolist()
    test_predictions.extend(batch_predictions)


spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 160/160 [01:00<00:00,  2.66it/s]


Average training loss: 0.47144935047253966


Epoch 2: 100%|██████████| 160/160 [01:01<00:00,  2.58it/s]


Average training loss: 0.3590879751369357


Epoch 3:  88%|████████▊ | 141/160 [00:54<00:07,  2.60it/s]