In [1]:
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, TrainingArguments, Trainer
import torch
from torch import nn
from transformers import AutoModel, AutoConfig
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

2024-09-06 07:39:08.755777: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-06 07:39:08.755873: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-06 07:39:08.758162: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-06 07:39:08.773280: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
new_data = pd.read_csv("dataset_Sep_3_masked.csv", sep=",", engine="python")

In [4]:
# Prepare the data
sentences = new_data['text'].values
labels = new_data['archaia_or_not'].values

# Encode the labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Split the data
train_sentences, temp_sentences, train_labels, temp_labels = train_test_split(sentences, encoded_labels, 
                                                                              test_size=0.3, random_state=42, stratify=encoded_labels)
val_sentences, dev_sentences, val_labels, dev_labels = train_test_split(temp_sentences, temp_labels,
                                                                        test_size=0.5, random_state=42, stratify=temp_labels)

tokenizer = AutoTokenizer.from_pretrained("nlpaueb/bert-base-greek-uncased-v1")

tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/530k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [5]:
# Print label distribution
def print_label_distribution(labels, name):
    unique, counts = np.unique(labels, return_counts=True)
    print(f"\n{name} set label distribution:")
    for label, count in zip(unique, counts):
        print(f"Label {label}: {count}")

print_label_distribution(train_labels, "Training")
print_label_distribution(val_labels, "Validation")
print_label_distribution(dev_labels, "Dev")

# Tokenize and prepare the dataset
max_length = 512
train_encodings = tokenizer(train_sentences.tolist(), truncation=True, padding=True, max_length=max_length)
val_encodings = tokenizer(val_sentences.tolist(), truncation=True, padding=True, max_length=max_length)
dev_encodings = tokenizer(dev_sentences.tolist(), truncation=True, padding=True, max_length=max_length)


Training set label distribution:
Label 0: 214
Label 1: 1144

Validation set label distribution:
Label 0: 46
Label 1: 245

Dev set label distribution:
Label 0: 46
Label 1: 245


In [6]:
# Define the model
class BertForSequenceClassification(nn.Module):
    def __init__(self, model_name_or_path, num_labels=2):
        super(BertForSequenceClassification, self).__init__()
        self.num_labels = num_labels
        self.config = AutoConfig.from_pretrained(model_name_or_path)
        self.bert = AutoModel.from_pretrained(model_name_or_path, config=self.config)
        self.classifier = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size, 256),
            nn.Dropout(0.1),
            nn.Linear(256, num_labels),
        )
        self.init_weights()

    def init_weights(self):
        for module in self.classifier:
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooler_output = outputs.pooler_output
        logits = self.classifier(pooler_output)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return (loss, logits) if loss is not None else logits
    
    def save_pretrained(self, save_directory):
        os.makedirs(save_directory, exist_ok=True)
        self.config.save_pretrained(save_directory)
        torch.save(self.state_dict(), os.path.join(save_directory, "pytorch_model.bin"))

    @classmethod
    def from_pretrained(cls, save_directory, model_name_or_path, num_labels=2):
        config = AutoConfig.from_pretrained(save_directory)
        model = cls(model_name_or_path, num_labels=num_labels)
        state_dict = torch.load(os.path.join(save_directory, "pytorch_model.bin"), map_location=torch.device('cpu'))
        model.load_state_dict(state_dict)
        return model

In [7]:
# Create dataset class
class GreekSentencesDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

In [8]:
train_dataset = GreekSentencesDataset(train_encodings, train_labels)
val_dataset = GreekSentencesDataset(val_encodings, val_labels)
dev_dataset = GreekSentencesDataset(dev_encodings, dev_labels)

# Initialize the model
model_name_or_path = "nlpaueb/bert-base-greek-uncased-v1"
num_labels = len(label_encoder.classes_)
model = BertForSequenceClassification(model_name_or_path, num_labels)

# Define training arguments and trainer
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.02,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

pytorch_model.bin:   0%|          | 0.00/454M [00:00<?, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [9]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [10]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0009,0.076329,0.989691,0.993915,0.987903,1.0


TrainOutput(global_step=85, training_loss=0.39309312596040613, metrics={'train_runtime': 134.9228, 'train_samples_per_second': 10.065, 'train_steps_per_second': 0.63, 'total_flos': 0.0, 'train_loss': 0.39309312596040613, 'epoch': 1.0})

In [11]:
# Evaluate on dev set
dev_pred = trainer.predict(dev_dataset)
dev_preds = dev_pred.predictions.argmax(-1)
dev_labels = dev_dataset.labels

In [12]:
dev_accuracy = accuracy_score(dev_labels, dev_preds)
dev_precision, dev_recall, dev_f1, _ = precision_recall_fscore_support(dev_labels, dev_preds, average='binary')

print("\nDev Set Evaluation:")
print(f"Accuracy: {dev_accuracy:.4f}")
print(f"Precision: {dev_precision:.4f}")
print(f"Recall: {dev_recall:.4f}")
print(f"F1 Score: {dev_f1:.4f}")

# Create confusion matrix
cm = confusion_matrix(dev_labels, dev_preds)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.savefig('confusion_matrix.png')
plt.close()

# Save the model
model.save_pretrained("./binary_classifier_saved_model")
tokenizer.save_pretrained("./binary_classifier_saved_model")


Dev Set Evaluation:
Accuracy: 0.9897
Precision: 0.9879
Recall: 1.0000
F1 Score: 0.9939


('./binary_classifier_saved_model/tokenizer_config.json',
 './binary_classifier_saved_model/special_tokens_map.json',
 './binary_classifier_saved_model/vocab.txt',
 './binary_classifier_saved_model/added_tokens.json',
 './binary_classifier_saved_model/tokenizer.json')

In [13]:
import pandas as pd
import torch
from transformers import AutoTokenizer
from torch.utils.data import TensorDataset, DataLoader

# Load the dataset
df = pd.read_csv("../twok_masked.csv")

# Load the saved model and tokenizer
loaded_model = BertForSequenceClassification.from_pretrained("./binary_classifier_saved_model", model_name_or_path="nlpaueb/bert-base-greek-uncased-v1")
loaded_tokenizer = AutoTokenizer.from_pretrained("./binary_classifier_saved_model")

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Move the model to the appropriate device
loaded_model.to(device)

# Tokenize all texts
encodings = loaded_tokenizer(df['text'].tolist(), truncation=True, padding=True, max_length=512)
dataset = TensorDataset(torch.tensor(encodings['input_ids']), 
                        torch.tensor(encodings['attention_mask']))
dataloader = DataLoader(dataset, batch_size=32)  # Adjust batch size as needed

# Make predictions
loaded_model.eval()
predictions = []

Using device: cuda


In [14]:
with torch.no_grad():
    for batch in dataloader:
        input_ids, attention_mask = [b.to(device) for b in batch]
        outputs = loaded_model(input_ids, attention_mask=attention_mask)
        logits = outputs[0] if isinstance(outputs, tuple) else outputs
        preds = torch.argmax(logits, dim=1)
        predictions.extend(preds.cpu().numpy())

In [16]:
# Add predictions to the dataframe
df['archaia'] = predictions

# Check 'mask' column and set 'ΚΝΕ' to -9999 if mask is 0
df.loc[df['mask'] == 0, 'archaia'] = -9999

# Remove columns '1' through '5'
columns_to_remove = ['1', '2', '3', '4', '5']
df = df.drop(columns=[col for col in columns_to_remove if col in df.columns])

# Save the results
df.to_csv("twok_masked_with_predictions.csv", index=False)

print(f"Processed {len(df)} rows.")
print("Results saved to 'twok_masked_with_predictions.csv'")

# Print distribution of predictions
print("\nDistribution of predictions:")
print(df['archaia'].value_counts(normalize=True))

# Print distribution of predictions for masked items only
masked_df = df[df['mask'] == 1]
print("\nDistribution of predictions for masked items:")
print(masked_df['archaia'].value_counts(normalize=True))

Processed 2000 rows.
Results saved to 'twok_masked_with_predictions.csv'

Distribution of predictions:
archaia
 1       0.7825
-9999    0.2175
Name: proportion, dtype: float64

Distribution of predictions for masked items:
archaia
1    1.0
Name: proportion, dtype: float64
