In [1]:
import pandas as pd
import numpy as np
import os
import csv
from zipfile import ZipFile
from sklearn.metrics.pairwise import cosine_similarity
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataloader import default_collate
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from scipy.optimize import minimize
from transformers import AutoModel, AutoTokenizer, XLMRobertaModel
from tqdm.notebook import tqdm
from torch.nn.utils.rnn import pad_sequence
from typing import Dict, List, Tuple, Optional
import random
from torch.utils.data import random_split
import logging
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
class DataLoaders:
    """Locate, extract and load the per-language TSV data of every split.

    The instance owns the directory layout (``path_train``, ``path_dev``,
    ``path_test``, ``path_testh``, ``path_output``) and a logger.  Typical use
    is ``extract_zip_files()`` -> ``load_tsv_files()`` ->
    ``create_merged_dataframes()``.
    """

    def __init__(self):
        self.setup_paths()
        self.setup_logging()

    def setup_logging(self):
        """Configure INFO-level logging and store a module logger on ``self.logger``."""
        logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
        self.logger = logging.getLogger(__name__)

    def setup_paths(self):
        """Set the split directories (relative, with trailing slash) and create them."""
        self.path_dev = 'dev/'
        self.path_train = 'train/'
        self.path_test = 'test/'
        self.path_output = 'answer/'
        self.path_testh = 'testh/'

        for path in [self.path_dev, self.path_train, self.path_test, self.path_output, self.path_testh]:
            os.makedirs(path, exist_ok=True)

    def extract_zip_files(self):
        """Unpack each archive into its directory, skipping directories that already have content."""
        archives = (
            ('dev.zip', self.path_dev),
            ('train.zip', self.path_train),
            ('test.zip', self.path_test),
            ('test_hidden1.zip', self.path_testh),
        )
        for archive_name, target_dir in archives:
            if not os.listdir(target_dir):
                with ZipFile(archive_name, 'r') as archive:
                    archive.extractall(target_dir)

    @staticmethod
    def _read_tsv(path, language):
        """Read one TSV file into a list of row dicts, tagging every row with ``language``."""
        # newline='' lets the csv module do its own newline handling, so line
        # breaks inside quoted fields are kept exactly as stored in the file.
        with open(path, encoding='utf-8', newline='') as tsvfile:
            reader = csv.DictReader(tsvfile, delimiter='\t',
                                    quoting=csv.QUOTE_MINIMAL, quotechar='"')
            rows = []
            for row in reader:
                row['language'] = language
                rows.append(row)
        return rows

    def load_tsv_files(self):
        """Load every TSV of every language found under the training directory.

        Returns:
            dict mapping ``'train_labels_list'``, ``'train_uses_list'``,
            ``'dev_labels_list'``, ``'dev_uses_list'``, ``'test_uses_list'``,
            ``'test_instances_list'`` and ``'test_labels_list'`` to lists of row
            dicts; each row carries an extra ``'language'`` key.

        Raises:
            OSError: if an expected file is missing for one of the languages.
        """
        # Only sub-directories are languages; a stray file in the training
        # directory must not be turned into a "<file>/labels.tsv" path.
        languages = [
            entry for entry in os.listdir(self.path_train)
            if os.path.isdir(os.path.join(self.path_train, entry))
        ]
        self.logger.info(f"Found languages: {languages}")

        # result key -> (base directory, file name inside each language folder)
        sources = {
            'train_labels_list': (self.path_train, 'labels.tsv'),
            'train_uses_list': (self.path_train, 'uses.tsv'),
            'dev_labels_list': (self.path_dev, 'labels.tsv'),
            'dev_uses_list': (self.path_dev, 'uses.tsv'),
            'test_uses_list': (self.path_test, 'uses.tsv'),
            'test_instances_list': (self.path_test, 'instances.tsv'),
            'test_labels_list': (self.path_testh, 'labels.tsv'),
        }

        data_dict = {key: [] for key in sources}
        for key, (base_dir, file_name) in sources.items():
            for lang in languages:
                # The language is passed explicitly instead of being parsed back
                # out of the path, which only worked for single-segment base paths.
                data_dict[key].extend(self._read_tsv(f"{base_dir}{lang}/{file_name}", lang))

        return data_dict

    @staticmethod
    def _merge_labels_with_uses(labels_list, uses_list, label_columns):
        """Attach context and target-token indices of both usages to every label row.

        Identifiers without a matching usage row yield ``None`` for the context
        and index fields.  ``label_columns`` are copied verbatim from the label row.
        """
        id2context = {}
        id2idx = {}
        for use in uses_list:
            identifier = use['identifier']
            id2context[identifier] = use['context']
            id2idx[identifier] = use['indices_target_token']

        merged = []
        for row in labels_list:
            first = row['identifier1']
            second = row['identifier2']
            record = {
                'context1': id2context.get(first),
                'context2': id2context.get(second),
                'index_target_token1': id2idx.get(first),
                'index_target_token2': id2idx.get(second),
                'identifier1': first,
                'identifier2': second,
            }
            for column in label_columns:
                record[column] = row[column]
            merged.append(record)
        return merged

    def create_merged_dataframes(self, data_dict):
        """Build the train, dev and test DataFrames from the output of ``load_tsv_files``.

        Returns:
            tuple ``(train_df, dev_df, test_df)``.  Train and dev contain a
            ``'judgments'`` column; the test frame does not.
        """
        labelled_columns = ('lemma', 'median_cleaned', 'judgments', 'language')
        test_columns = ('lemma', 'median_cleaned', 'language')

        train_uses_merged = self._merge_labels_with_uses(
            data_dict['train_labels_list'], data_dict['train_uses_list'], labelled_columns)
        dev_uses_merged = self._merge_labels_with_uses(
            data_dict['dev_labels_list'], data_dict['dev_uses_list'], labelled_columns)
        # Test labels come from the hidden-label directory, contexts from the test uses.
        test_uses_merged = self._merge_labels_with_uses(
            data_dict['test_labels_list'], data_dict['test_uses_list'], test_columns)

        return pd.DataFrame(train_uses_merged), pd.DataFrame(dev_uses_merged), pd.DataFrame(test_uses_merged)

In [3]:
# Build the loader; its constructor creates the split directories and configures logging.
data_loader = DataLoaders()
# Unpack each archive into its directory unless that directory already has content.
data_loader.extract_zip_files()
# Read every per-language TSV into lists of row dicts keyed by split/file kind.
data_dict = data_loader.load_tsv_files()
# Join each labelled usage pair with its two contexts; one DataFrame per split.
df_train_uses_merged, df_dev_uses_merged, df_test_uses_merged = data_loader.create_merged_dataframes(data_dict)

2024-12-10 12:34:55,721 - INFO - Found languages: ['norwegian', 'german', 'chinese', 'spanish', 'english', 'russian', 'swedish']


In [4]:
# Count how often each median judgment label occurs in the training split.
df_train_uses_merged['median_cleaned'].value_counts()

median_cleaned
4.0    30257
1.0     7099
3.0     5967
2.0     4510
Name: count, dtype: int64

In [5]:
# Preview the first rows of the train, dev and test frames, in that order.
display(
    df_train_uses_merged.head(),
    df_dev_uses_merged.head(),
    df_test_uses_merged.head(),
)

Unnamed: 0,context1,context2,index_target_token1,index_target_token2,identifier1,identifier2,lemma,median_cleaned,judgments,language
0,573 F LR N (Samisk) (Sang- (Utkast Salmer boke...,Det er saerlig knyttet anfektelser til spørsmå...,116:126,23:34,1970-2015_anfektelse_0,1970-2015_anfektelse_15,anfektelse,3.0,"[3, 3]",norwegian
1,Trøst dem i all nød og anfektelse med din Hell...,Menneskelivet og kristenlivet I — V Kveldsbønn...,23:33,79:89,1970-2015_anfektelse_16,1970-2015_anfektelse_2,anfektelse,4.0,"[4, 4]",norwegian
2,Trøst dem i all nød og anfektelse med din Hell...,Trøst dem i all nød og anfektelse med din Hell...,23:33,23:33,1970-2015_anfektelse_16,1970-2015_anfektelse_17,anfektelse,4.0,"[4, 4, 4]",norwegian
3,Det er saerlig knyttet anfektelser til spørsmå...,Dernest hadde Løvenskiold anfektelser på grunn...,23:34,26:37,1970-2015_anfektelse_15,1970-2015_anfektelse_5,anfektelse,4.0,"[4, 4, 4]",norwegian
4,Og selv ikke kommunene later til å ha noen sae...,Dernest hadde Løvenskiold anfektelser på grunn...,52:63,26:37,1970-2015_anfektelse_11,1970-2015_anfektelse_5,anfektelse,4.0,"[4, 4, 4]",norwegian


Unnamed: 0,context1,context2,index_target_token1,index_target_token2,identifier1,identifier2,lemma,median_cleaned,judgments,language
0,"av forordningen, kan gi hjelp til at det blir ...",Finner retten på noe tidspunkt at etterforskni...,90:104,34:50,1929-1965_etterforskning_9,1970-2015_etterforskning_1824,etterforskning,4.0,"[4, 4, 4]",norwegian
1,Og det sier seg selv at det da ikke alltid er ...,Dette førte til ny interesse for saken i offen...,112:126,298:312,1929-1965_etterforskning_54,1970-2015_etterforskning_1032,etterforskning,4.0,"[4, 4]",norwegian
2,Resultatet av drøftelsene på Stortinget i 1948...,Det følgende vil redegjøre for enkelte øvrige ...,215:229,334:350,1929-1965_etterforskning_52,1970-2015_etterforskning_998,etterforskning,4.0,"[4, 4]",norwegian
3,Resultatet av drøftelsene på Stortinget i 1948...,Alene den lange tid - 25 år - som har gått sid...,215:229,180:194,1929-1965_etterforskning_52,1970-2015_etterforskning_557,etterforskning,4.0,"[4, 4]",norwegian
4,Anm.: 1 anledning spørsmålet om etterforskning...,Målet må vaere å øke ressursene ved å foreta e...,32:46,79:93,1929-1965_etterforskning_62,1970-2015_etterforskning_318,etterforskning,4.0,"[4, 4]",norwegian


Unnamed: 0,context1,context2,index_target_token1,index_target_token2,identifier1,identifier2,lemma,median_cleaned,language
0,"3. Egne hjem for krigspensjonister, der graden...",Han gjennomførte flere førstebestigninger både...,249:254,695:700,1980-1990_syden_4,2012-2019_syden_4,Syden,4.0,norwegian
1,"«Hvi kan I innbille Jer, I, dårlige menneske, ...",For kortere perioder kan det vaere aktuelt å l...,128:133,211:216,1980-1990_syden_7,1980-1990_syden_8,Syden,4.0,norwegian
2,"«Hvi kan I innbille Jer, I, dårlige menneske, ...",utdannelse Hovedfag i samfunnsgeografi ved Uni...,128:133,164:169,1980-1990_syden_7,2012-2019_syden_111,Syden,4.0,norwegian
3,"«Hvi kan I innbille Jer, I, dårlige menneske, ...",Men da artisjokk helkokt i klarnet smør skulle...,128:133,82:87,1980-1990_syden_7,2012-2019_syden_187,Syden,4.0,norwegian
4,"«Hvi kan I innbille Jer, I, dårlige menneske, ...",Pris pr. person i 2-romsleilighet på Sea Melod...,128:133,316:321,1980-1990_syden_7,2012-2019_syden_22,Syden,4.0,norwegian


In [13]:
#df_dev_uses_merged['median_cleaned'].value_counts()

In [14]:
#df_train_uses_merged['median_cleaned'].value_counts()


In [15]:
#df_train_uses_merged['language'].value_counts()

In [12]:
# Show only the English rows of the training split.
df_train_uses_merged.loc[df_train_uses_merged['language'].eq('english')]

Unnamed: 0,context1,context2,index_target_token1,index_target_token2,identifier1,identifier2,lemma,median_cleaned,judgments,language
28437,"I'd landed in early afternoon, local time.",Old shopping lists and ticket stubs and wads o...,20:29,104:113,fic_1987_780057.txt-318-5,fic_1992_40272.txt-627-18,afternoon_nn,4.0,"[4, 4]",english
28438,"I'd landed in early afternoon, local time.",@@774723 1 Two Years Earlier // One weekday af...,20:29,44:53,fic_1987_780057.txt-318-5,nf_2008_774723.txt-0-8,afternoon_nn,4.0,"[4, 4]",english
28439,"I'd landed in early afternoon, local time.","afternoon that the Indian bought me, he starte...",20:29,0:9,fic_1987_780057.txt-318-5,nf_1849_762578.txt-1803-0,afternoon_nn,4.0,"[4, 4]",english
28440,"Mrs. Daggett, nettled by her sister's hint, ro...",She remained in company with the cows all the ...,91:100,46:55,fic_1822_7275.txt-216-23,fic_1850_2310.txt-1321-9,afternoon_nn,4.0,"[4, 4, 4]",english
28441,"Elvira had just got possession, by stealth, of...","On those afternoons by the sea, we knitted, Sh...",201:210,9:19,fic_1822_7275.txt-530-41,fic_1971_14051.txt-667-2,afternoon_nn,4.0,"[4, 3, 4]",english
...,...,...,...,...,...,...,...,...,...,...
34342,"I described my impressions frankly, and with w...",To have this knowledge implies that children h...,96:101,225:230,fic_1853_9200.txt-674-17,nf_1996_774044.txt-1075-37,word_nn,4.0,"[4, 4]",english
34343,If they would reflect that this word sovereign...,let himself go to his wife in the words that s...,32:36,34:39,fic_1844_8787.txt-57-6,mag_1963_441107.txt-252-8,word_nn,3.0,"[3, 3]",english
34344,If they would reflect that this word sovereign...,"a book, by the way, concerning which no adequa...",32:36,49:53,fic_1844_8787.txt-57-6,mag_1857_526498.txt-137-11,word_nn,3.0,"[3, 3]",english
34345,I passed among the men with a word of encourag...,"I was then taken below, and put into a cot, wh...",30:34,174:179,fic_1835_7143.txt-1235-7,fic_1839_7023.txt-2324-35,word_nn,4.0,"[4, 4]",english


In [8]:
from sklearn.preprocessing import LabelEncoder

# Keep the raw labels in their own column so this cell can be re-run safely.
# Encoding 'median_cleaned' from itself would, on a second run, fit the encoder
# on already-encoded values and make inverse_transform return class indices
# instead of the original labels.
RAW_LABEL_COLUMN = 'median_cleaned_raw'
for labelled_frame in (df_train_uses_merged, df_dev_uses_merged):
    if RAW_LABEL_COLUMN not in labelled_frame.columns:
        labelled_frame[RAW_LABEL_COLUMN] = labelled_frame['median_cleaned']

# The encoder is fitted on the training labels only and reused for dev.
label_encoder = LabelEncoder()
df_train_uses_merged['median_cleaned'] = label_encoder.fit_transform(df_train_uses_merged[RAW_LABEL_COLUMN])
df_dev_uses_merged['median_cleaned'] = label_encoder.transform(df_dev_uses_merged[RAW_LABEL_COLUMN])

In [9]:
class EmbeddingGenerator:
    """Compute one contextual embedding per usage for the target word.

    The embedding of a usage is the mean of the encoder's last hidden states
    over all sub-tokens that overlap the target character span.
    """

    # Longest sub-token sequence handed to the encoder in a single call.
    MAX_TOKENS = 512

    def __init__(self, model_name="FacebookAI/xlm-roberta-base"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # AutoModel is used rather than the XLMRobertaModel name because a later
        # cell of this notebook rebinds XLMRobertaModel to a custom nn.Module,
        # after which XLMRobertaModel.from_pretrained no longer exists.
        self.model = AutoModel.from_pretrained(model_name)

    def truncation_indices(self, target_subword_indices, truncation_tokens_before_target=0.5):
        """Return the ``(left, right)`` slice bounds of a window around the target.

        Args:
            target_subword_indices: list of bools, True for target sub-tokens.
            truncation_tokens_before_target: share of the non-target budget
                placed before the target.

        Returns:
            ``(lindex, rindex)``; ``lindex`` is clipped at 0 and ``rindex`` may
            exceed the sequence length (slicing tolerates that).

        Raises:
            ValueError: if the mask contains no True entry.
        """
        max_tokens = self.MAX_TOKENS
        n_target_subtokens = target_subword_indices.count(True)
        tokens_before = int((max_tokens - n_target_subtokens) * truncation_tokens_before_target)
        tokens_after = max_tokens - tokens_before - n_target_subtokens

        # The target is taken to start at the first True entry and to span
        # n_target_subtokens positions from there.
        lindex_target = target_subword_indices.index(True)
        rindex_target = lindex_target + n_target_subtokens
        lindex = max(lindex_target - tokens_before, 0)
        rindex = rindex_target + tokens_after

        return lindex, rindex

    def get_target_token_embedding(self, context, index):
        """Embed the target occurrence located at ``index`` inside ``context``.

        Args:
            context: the usage text.
            index: character span of the target formatted as ``"start:end"``.

        Returns:
            1-D numpy array: mean last-layer hidden state over the target sub-tokens.

        Raises:
            ValueError: if the span does not overlap any sub-token.  Previously
                this case silently produced an all-NaN embedding.
        """
        bounds = str(index).strip().split(':')
        start_idx = int(bounds[0])
        end_idx = int(bounds[1])

        inputs = self.tokenizer(context, return_tensors="pt",
                                return_offsets_mapping=True, add_special_tokens=False)

        offset_mapping = inputs['offset_mapping'][0].tolist()
        input_ids = inputs['input_ids']

        # A sub-token belongs to the target when it contains the span start,
        # contains the span end, or lies completely inside the span.
        subwords_bool_mask = [
            (start <= start_idx < end) or (start < end_idx <= end)
            or (start_idx <= start and end <= end_idx)
            for start, end in offset_mapping
        ]

        if not any(subwords_bool_mask):
            raise ValueError(
                f"Target span {index!r} does not overlap any token of the given context"
            )

        if input_ids.shape[1] > self.MAX_TOKENS:
            # Keep a window around the target instead of cutting off the tail.
            lindex, rindex = self.truncation_indices(subwords_bool_mask)
            input_ids = input_ids[:, lindex:rindex]
            subwords_bool_mask = subwords_bool_mask[lindex:rindex]

        with torch.no_grad():
            outputs = self.model(input_ids)

        target_mask = torch.tensor(subwords_bool_mask, dtype=torch.bool)
        target_embeddings = outputs.last_hidden_state[0][target_mask]
        return target_embeddings.mean(dim=0).numpy()

    def generate_embeddings(self, df, file_name):
        """Embed every distinct usage referenced in ``df`` and save them to ``file_name``.

        Each identifier is embedded once, on first appearance as ``identifier1``
        or ``identifier2``.  The mapping is written with ``np.savez`` (one array
        per identifier) and also returned.
        """
        id2embedding = {}

        for _, row in df.iterrows():
            for suffix in ('1', '2'):
                identifier = row[f'identifier{suffix}']
                if identifier not in id2embedding:
                    id2embedding[identifier] = self.get_target_token_embedding(
                        row[f'context{suffix}'], row[f'index_target_token{suffix}'])

        np.savez(file_name, **id2embedding)
        return id2embedding

In [10]:
# Generate embeddings
#embedding_generator = EmbeddingGenerator()
#train_embeddings = embedding_generator.generate_embeddings(df_train_uses_merged,
                                                         #'subtask1_train_embeddings.npz')
#dev_embeddings = embedding_generator.generate_embeddings(df_dev_uses_merged,
                                                       #'subtask1_dev_embeddings.npz')

In [11]:
class CustomDataset(Dataset):
    """Dataset of usage pairs backed by precomputed target-word embeddings.

    Args:
        data: DataFrame with ``identifier1`` and ``identifier2`` columns, plus
            ``median_cleaned`` unless ``is_test`` is True.
        embeddings: path of an ``.npz`` file mapping identifier -> embedding.
        max_len: length of the zero vector used for a missing identifier when
            no embeddings could be loaded at all.
        is_test: when True, items carry no ``'target'`` entry.
    """

    def __init__(self, data, embeddings, max_len=512, is_test=False):
        self.data = data
        self.embeddings = self.load_embeddings(embeddings)
        self.max_len = max_len
        self.is_test = is_test

    def __len__(self):
        return len(self.data)

    def load_embeddings(self, file_name):
        """Load embeddings from a single .npz file.

        Returns a dict identifier -> array.  On any loading error a message is
        printed and an empty dict is returned, so every lookup falls back to zeros.
        """
        try:
            # The context manager closes the archive's file handle once all
            # arrays have been read into memory.
            with np.load(file_name) as loaded_embeddings:
                return {identifier: loaded_embeddings[identifier]
                        for identifier in loaded_embeddings.files}
        except Exception as e:
            print(f"Error loading embeddings from {file_name}: {e}")
            return {}

    def _missing_embedding(self):
        """Return a zero vector to stand in for an identifier that has no embedding.

        The vector takes the shape of the loaded embeddings so that it can be
        batched together with real ones; ``max_len`` is only used when nothing
        was loaded.
        """
        for known in self.embeddings.values():
            return np.zeros_like(known)
        return np.zeros(self.max_len)

    def _lookup(self, identifier):
        """Return the stored embedding for ``identifier`` or the zero fallback."""
        embedding = self.embeddings.get(identifier)
        if embedding is None:
            embedding = self._missing_embedding()
        return torch.tensor(embedding, dtype=torch.float)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]

        item = {
            'target_embedding1': self._lookup(row['identifier1']),
            'target_embedding2': self._lookup(row['identifier2']),
        }

        if not self.is_test:
            # The label is returned as a float tensor; the visible training code
            # casts it to long before computing the loss.
            item['target'] = torch.tensor(float(row['median_cleaned']), dtype=torch.float)

        return item

def collate_fn(batch):
    """Collate a list of sample dicts into one dict of batched values.

    The keys are taken from the first sample; the values found under each key
    are gathered across the batch and passed to ``default_collate``.
    """
    collated = {}
    for key in batch[0]:
        per_sample_values = [sample[key] for sample in batch]
        collated[key] = default_collate(per_sample_values)
    return collated

In [12]:
# One dataset per split, each reading its own precomputed embedding archive.
# Only the test split is built without targets.
train_dataset = CustomDataset(
    df_train_uses_merged, 'subtask1_train_embeddings.npz', max_len=512, is_test=False
)

dev_dataset = CustomDataset(
    df_dev_uses_merged, 'subtask1_dev_embeddings.npz', max_len=512, is_test=False
)

test_dataset = CustomDataset(
    df_test_uses_merged, 'subtask1_test_embeddings.npz', max_len=512, is_test=True
)


In [13]:
#CUDA_LAUNCH_BLOCKING=1

In [14]:
from transformers import XLMRobertaModel

In [15]:
# Load pre-trained XLM-RoBERTa model.
# AutoModel is used because the next cell defines its own class named
# XLMRobertaModel; once that cell has run, XLMRobertaModel.from_pretrained no
# longer exists and re-running this cell would fail.
roberta = AutoModel.from_pretrained("FacebookAI/xlm-roberta-base")



In [None]:
class XLMRobertaModel(nn.Module):
    """Linear classifier over a pair of precomputed target-word embeddings.

    Args:
        embedding_dim: size of each input embedding.
        dropout_rate: dropout applied before the linear layer.
        num_classes: number of output logits.

    NOTE(review): this class shadows ``transformers.XLMRobertaModel`` for every
    cell executed after it.
    """

    def __init__(self,
                 embedding_dim=768,
                 dropout_rate=0.2,
                 num_classes=4
                 ):
        super().__init__()

        # The module-level backbone is registered as a sub-module, which keeps
        # its parameters in state_dict(), but forward() never calls it.
        self.roberta = roberta

        # The classifier sees both embeddings concatenated.
        classifier_input_size = embedding_dim * 2

        # Custom classifier
        self.classifier = nn.Sequential(
            nn.Dropout(dropout_rate),
            nn.Linear(classifier_input_size, num_classes)
        )

        # Initialise the new head only.  Applying the initialiser to the whole
        # module would also overwrite every Linear/Embedding weight of the
        # pretrained backbone (and of the shared global ``roberta`` object).
        self.classifier.apply(self._init_weights)

    def _init_weights(self, module):
        """Draw Linear/Embedding weights from N(0, 0.02) and zero Linear biases."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()

    def forward(self, embedding1, embedding2):
        """Return class logits for a batch of embedding pairs.

        Both inputs are indexed as ``(batch, dim)`` and must share ``dim``;
        the result is the classifier output for their concatenation along dim 1.
        """
        # Validate input shapes
        assert embedding1.size(1) == embedding2.size(1), "Embeddings must have the same dimension"

        # Concatenate the two embeddings feature-wise
        features = torch.cat((embedding1, embedding2), dim=1)

        # Pass through classifier
        return self.classifier(features)


In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

class Learner:
    """Train, evaluate and run the pair classifier on precomputed embeddings.

    Args:
        learning_rate: AdamW learning rate.
        batch_size: batch size used for every DataLoader built by this class.
        num_epochs: number of training epochs.
        dropout_rate: dropout rate passed to the model.
        num_classes: number of output classes.
        device: device the model and batches are moved to.

    Note:
        ``train`` saves the weights with the lowest validation loss to
        ``'xlm_sole.pt'`` but keeps training; ``predict`` uses whatever weights
        the model currently holds.
    """

    def __init__(self,
                 learning_rate=1e-4,
                 batch_size=32,
                 num_epochs=10,
                 dropout_rate=0.2,
                 num_classes=4,
                 device='cuda' if torch.cuda.is_available() else 'cpu'):

        self.model = XLMRobertaModel(
            dropout_rate=dropout_rate,
            num_classes=num_classes,
        ).to(device)

        self.optimizer = optim.AdamW(self.model.parameters(), lr=learning_rate)
        self.criterion = nn.CrossEntropyLoss()
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.device = device

        # Per-epoch history, appended to by train().
        self.train_losses = []
        self.val_losses = []
        self.train_accuracies = []
        self.val_accuracies = []

    def _forward_batch(self, batch):
        """Move both embeddings of ``batch`` to the device and return the logits."""
        target_embedding1 = batch['target_embedding1'].to(self.device)
        target_embedding2 = batch['target_embedding2'].to(self.device)
        return self.model(target_embedding1, target_embedding2)

    def _batch_targets(self, batch):
        """Return the class indices of ``batch`` as a 1-D long tensor on the device."""
        # reshape(-1) instead of squeeze(): squeeze() turns a batch of size one
        # into a 0-dim tensor, which the loss and the metric collection reject.
        return batch['target'].reshape(-1).to(self.device).long()

    def train(self, train_dataset, dev_dataset):
        """Train for ``num_epochs`` epochs, validating after each one when ``dev_dataset`` is truthy.

        Returns:
            ``(train_losses, val_losses, train_accuracies, val_accuracies)``,
            the per-epoch histories stored on the instance.
        """
        train_loader = DataLoader(
            train_dataset,
            batch_size=self.batch_size,
            shuffle=True
        )

        if dev_dataset:
            val_loader = DataLoader(
                dev_dataset,
                batch_size=self.batch_size,
                shuffle=False
            )

        best_val_loss = float('inf')

        for epoch in range(self.num_epochs):
            self.model.train()
            total_train_loss = 0
            total_train_correct = 0
            train_batches = tqdm(train_loader, desc=f'Epoch {epoch+1}/{self.num_epochs}')

            for batch in train_batches:
                self.optimizer.zero_grad()

                target = self._batch_targets(batch)
                logits = self._forward_batch(batch)
                loss = self.criterion(logits, target)

                loss.backward()
                self.optimizer.step()

                total_train_loss += loss.item()

                # Calculate accuracy
                preds = torch.argmax(logits, dim=1)
                total_train_correct += (preds == target).sum().item()

                train_batches.set_postfix({'train_loss': loss.item()})

            # Loss is averaged over batches, accuracy over samples.
            avg_train_loss = total_train_loss / len(train_loader)
            avg_train_accuracy = total_train_correct / len(train_loader.dataset)
            self.train_losses.append(avg_train_loss)
            self.train_accuracies.append(avg_train_accuracy)

            if dev_dataset:
                val_loss, val_accuracy = self.evaluate(val_loader)
                self.val_losses.append(val_loss)
                self.val_accuracies.append(val_accuracy)

                print(f'Epoch {epoch+1}: Train Loss = {avg_train_loss:.4f}, Train Accuracy = {avg_train_accuracy:.4f}, Val Loss = {val_loss:.4f}, Val Accuracy = {val_accuracy:.4f}')

                # Checkpoint the weights with the lowest validation loss so far.
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    torch.save(self.model.state_dict(), 'xlm_sole.pt')
            else:
                print(f'Epoch {epoch+1}: Train Loss = {avg_train_loss:.4f}, Train Accuracy = {avg_train_accuracy:.4f}')

        return self.train_losses, self.val_losses, self.train_accuracies, self.val_accuracies

    def evaluate(self, val_loader):
        """Compute loss and accuracy on ``val_loader`` and print macro precision/recall/F1.

        Returns:
            ``(avg_val_loss, val_accuracy)``: loss averaged over batches and
            accuracy over samples.
        """
        self.model.eval()
        total_val_loss = 0
        total_val_correct = 0
        all_preds = []
        all_targets = []

        with torch.no_grad():
            for batch in val_loader:
                target = self._batch_targets(batch)
                logits = self._forward_batch(batch)
                loss = self.criterion(logits, target)

                total_val_loss += loss.item()

                # Calculate accuracy
                preds = torch.argmax(logits, dim=1)
                total_val_correct += (preds == target).sum().item()

                all_preds.extend(preds.cpu().numpy())
                all_targets.extend(target.cpu().numpy())

        avg_val_loss = total_val_loss / len(val_loader)
        val_accuracy = total_val_correct / len(val_loader.dataset)
        precision, recall, f1, _ = precision_recall_fscore_support(all_targets, all_preds, average='macro', zero_division=0)

        print(f"Validation Metrics: Accuracy = {val_accuracy:.4f}, Precision = {precision:.4f}, Recall = {recall:.4f}, F1-Score = {f1:.4f}")

        return avg_val_loss, val_accuracy

    def predict(self, test_dataset, return_probabilities=False):
        """Predict for every item of ``test_dataset`` in its original order.

        Returns:
            numpy array of predicted class indices, or of per-class softmax
            probabilities when ``return_probabilities`` is True.
        """
        test_loader = DataLoader(
            test_dataset,
            batch_size=self.batch_size,
            shuffle=False
        )

        self.model.eval()
        predictions = []

        with torch.no_grad():
            for batch in test_loader:
                logits = self._forward_batch(batch)

                if return_probabilities:
                    probs = torch.softmax(logits, dim=1)
                    predictions.extend(probs.cpu().numpy())
                else:
                    preds = torch.argmax(logits, dim=1)
                    predictions.extend(preds.cpu().numpy())

        return np.array(predictions)

In [None]:
# Seed every RNG involved (Python, NumPy, PyTorch) so weight initialisation and
# batch shuffling are repeatable across kernel restarts.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

learner = Learner(
    learning_rate=1e-4,
    batch_size=32,
    num_epochs=10
)
# train() returns the per-epoch loss and accuracy histories for both splits.
train_losses, val_losses, train_accuracies, val_accuracies = learner.train(train_dataset, dev_dataset)

Epoch 1/10:   0%|          | 0/1495 [00:00<?, ?it/s]

Validation Metrics: Accuracy = 0.6838, Precision = 0.4158, Recall = 0.3148, F1-Score = 0.3198
Epoch 1: Train Loss = 0.8580, Train Accuracy = 0.6634, Val Loss = 0.8737, Val Accuracy = 0.6838


Epoch 2/10:   0%|          | 0/1495 [00:00<?, ?it/s]

Validation Metrics: Accuracy = 0.6824, Precision = 0.4210, Recall = 0.3564, F1-Score = 0.3594
Epoch 2: Train Loss = 0.7349, Train Accuracy = 0.7034, Val Loss = 0.9006, Val Accuracy = 0.6824


Epoch 3/10:   0%|          | 0/1495 [00:00<?, ?it/s]

Validation Metrics: Accuracy = 0.6824, Precision = 0.4262, Recall = 0.3623, F1-Score = 0.3651
Epoch 3: Train Loss = 0.6937, Train Accuracy = 0.7196, Val Loss = 0.9194, Val Accuracy = 0.6824


Epoch 4/10:   0%|          | 0/1495 [00:00<?, ?it/s]

Validation Metrics: Accuracy = 0.6730, Precision = 0.4048, Recall = 0.3811, F1-Score = 0.3797
Epoch 4: Train Loss = 0.6649, Train Accuracy = 0.7320, Val Loss = 0.9432, Val Accuracy = 0.6730


Epoch 5/10:   0%|          | 0/1495 [00:00<?, ?it/s]

Validation Metrics: Accuracy = 0.6778, Precision = 0.4123, Recall = 0.3615, F1-Score = 0.3669
Epoch 5: Train Loss = 0.6399, Train Accuracy = 0.7427, Val Loss = 0.9757, Val Accuracy = 0.6778


Epoch 6/10:   0%|          | 0/1495 [00:00<?, ?it/s]

Validation Metrics: Accuracy = 0.6758, Precision = 0.4046, Recall = 0.3684, F1-Score = 0.3718
Epoch 6: Train Loss = 0.6168, Train Accuracy = 0.7541, Val Loss = 0.9958, Val Accuracy = 0.6758


Epoch 7/10:   0%|          | 0/1495 [00:00<?, ?it/s]

Validation Metrics: Accuracy = 0.6788, Precision = 0.4062, Recall = 0.3744, F1-Score = 0.3736
Epoch 7: Train Loss = 0.5955, Train Accuracy = 0.7639, Val Loss = 0.9999, Val Accuracy = 0.6788


Epoch 8/10:   0%|          | 0/1495 [00:00<?, ?it/s]

Validation Metrics: Accuracy = 0.6733, Precision = 0.4016, Recall = 0.3907, F1-Score = 0.3844
Epoch 8: Train Loss = 0.5751, Train Accuracy = 0.7739, Val Loss = 1.0057, Val Accuracy = 0.6733


Epoch 9/10:   0%|          | 0/1495 [00:00<?, ?it/s]

Validation Metrics: Accuracy = 0.6871, Precision = 0.4231, Recall = 0.3712, F1-Score = 0.3761
Epoch 9: Train Loss = 0.5565, Train Accuracy = 0.7817, Val Loss = 1.0561, Val Accuracy = 0.6871


Epoch 10/10:   0%|          | 0/1495 [00:00<?, ?it/s]

Validation Metrics: Accuracy = 0.6802, Precision = 0.4200, Recall = 0.3891, F1-Score = 0.3922
Epoch 10: Train Loss = 0.5382, Train Accuracy = 0.7903, Val Loss = 1.0439, Val Accuracy = 0.6802


In [21]:
# Predicted class index (argmax over the logits) for every test pair, in dataset order.
predictions = learner.predict(test_dataset)

In [22]:
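# To restore the best checkpoint written by Learner.train (the Learner object
# itself has no load_state_dict/eval; they live on its model):
#learner.model.load_state_dict(torch.load('xlm_sole.pt'))
#learner.model.eval()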

In [23]:
# Attach the predicted class indices to the test frame (row order matches the dataset).
df_test_uses_merged['prediction'] = predictions

out_dir = 'answer/'
os.makedirs(out_dir, exist_ok=True)

answer_df = df_test_uses_merged[['identifier1', 'identifier2', 'language', 'prediction']].copy()

# Saving files per language
written_files = []
for language in answer_df["language"].unique():
    df_temp = answer_df[answer_df["language"] == language].drop('language', axis=1)
    # Map class indices back to the original label values.
    df_temp['prediction'] = label_encoder.inverse_transform(df_temp['prediction'])
    file_name = f'{language}.tsv'
    df_temp.to_csv(f'{out_dir}{file_name}', index=False, sep='\t', quoting=csv.QUOTE_MINIMAL, quotechar='"')
    written_files.append(file_name)

# Archive only the files written above, so leftovers from earlier runs in
# out_dir cannot end up in the submission.
with ZipFile('answer.zip', 'w') as zipf:
    for file_name in written_files:
        zipf.write(os.path.join(out_dir, file_name), arcname=file_name)

print("Saved answer.zip with predictions per language.")

Saved answer.zip with predictions per language.


In [24]:
# NOTE(review): df_temp is the loop variable left over from the previous cell, so
# these counts cover only the last language written there — confirm that this,
# rather than the distribution over all languages, is what is intended.
df_temp['prediction'].value_counts()

prediction
4.0    1226
1.0     112
2.0       7
Name: count, dtype: int64

In [25]:
def true_labels(df, ref):
    """Write the gold labels of ``df`` to ``<ref>/<language>/labels.tsv``, one file per language.

    Each file holds the ``identifier1``, ``identifier2`` and ``median_cleaned``
    columns as unquoted tab-separated values.  Missing directories are created
    and a confirmation line is printed for every file written.
    """
    label_columns = ['identifier1', 'identifier2', 'median_cleaned']

    if not os.path.exists(ref):
        os.makedirs(ref)

    for language in df['language'].unique():
        rows_for_language = df[df['language'] == language]
        lang_df = rows_for_language[label_columns]

        target_dir = os.path.join(ref, language)
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

        out_path = os.path.join(target_dir, 'labels.tsv')
        lang_df.to_csv(out_path, sep='\t', index=False, quoting=csv.QUOTE_NONE)
        print(f"Saved truth labels for {language} at {out_path}")

# Write the gold test labels per language under ref/<language>/labels.tsv.
true_labels(df_test_uses_merged, 'ref')

Saved truth labels for norwegian at ref/norwegian/labels.tsv
Saved truth labels for german at ref/german/labels.tsv
Saved truth labels for chinese at ref/chinese/labels.tsv
Saved truth labels for spanish at ref/spanish/labels.tsv
Saved truth labels for english at ref/english/labels.tsv
Saved truth labels for russian at ref/russian/labels.tsv
Saved truth labels for swedish at ref/swedish/labels.tsv
