In [1]:
CUDA_VISIBLE_DEVICES=1

In [2]:
import torch
import torch.nn as nn
import os
import logging
import numpy as np
import random
from tqdm import tqdm
import time
import pandas as pd

from transformers import LongformerModel, AutoTokenizer, LongformerForSequenceClassification, LongformerForMultipleChoice
from transformers import AutoTokenizer, RobertaModel
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, classification_report

logging.basicConfig(filename=f'./logs/train_{time.asctime().replace(" ","_")}.log', filemode='w', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Create a logger object
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Create a stream handler to print log messages to the console
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)

torch.manual_seed(40)
np.random.seed(40)
random.seed(40)
torch.cuda.manual_seed(40)
torch.backends.cudnn.deterministic = True

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Define the path to the CSV file
train_csv_file = "/data1/debajyoti/colie/train.csv"
val_csv_file = "/data1/debajyoti/colie/valid.csv"
test_csv_file = "/data1/debajyoti/colie/test.csv"

# Read the CSV file
train_labels = pd.read_csv(train_csv_file)
val_labels = pd.read_csv(val_csv_file)
test_labels = pd.read_csv(test_csv_file)
test_labels

Unnamed: 0,BOOK_id
0,7616_1.txt
1,7616_2.txt
2,7616_3.txt
3,7616_4.txt
4,7616_5.txt
...,...
143025,5677_92.txt
143026,5677_93.txt
143027,5677_94.txt
143028,5677_95.txt


In [4]:
train_labels.BOOK_id[0]

'27993_1.txt'

In [33]:
# Define the path to the train folder
train_folder = "/data1/debajyoti/colie/train/train/"
# Define the path to the validation folder
val_folder = "/data1/debajyoti/colie/valid/valid/"
# Define the path to the test folder
test_folder = "/data1/debajyoti/colie/test/test/"



def create_df(folder, label):
    # Initialize empty lists to store the data
    text_data = []
    labels = []
    for index in label.index:
        # filename = df_labels.BOOK_id[index]
        # print(filename)
        # print(df_labels['BOOK_id'][index], df_labels['Epoch'][index])
        file_name = label['BOOK_id'][index]  # Assuming 'File Name' is the column name for the file names in the CSV

        # Construct the file path
        file_path = os.path.join(folder, file_name)

        # Read the text from the file
        with open(file_path, 'r', encoding='ISO-8859-1') as file:
            text = file.read()

        # Append the text and label to the respective lists
        text_data.append(text)
        labels.append(label['Epoch'][index].strip())  # Assuming 'Label' is the column name for the labels in the CSV
        # break
    return text_data, labels

def create_df_test(folder, label):
    # Initialize empty lists to store the data
    text_data = []
    # labels = []
    for index in label.index:
        # filename = df_labels.BOOK_id[index]
        # print(filename)
        # print(df_labels['BOOK_id'][index], df_labels['Epoch'][index])
        file_name = label['BOOK_id'][index]  # Assuming 'File Name' is the column name for the file names in the CSV

        # Construct the file path
        file_path = os.path.join(folder, file_name)

        # Read the text from the file
        with open(file_path, 'r', encoding='ISO-8859-1') as file:
            text = file.read()

        # Append the text and label to the respective lists
        text_data.append(text)
        # labels.append(label['Epoch'][index].strip())  # Assuming 'Label' is the column name for the labels in the CSV
        # break
    return text_data

train_data, train_label = create_df(train_folder, train_labels)
val_data, val_label = create_df(val_folder, val_labels)
test_data = create_df_test(test_folder, test_labels)

# Create a dataframe from the lists
train_df = pd.DataFrame({'text': train_data, 'label': train_label})
val_df = pd.DataFrame({'text': val_data, 'label': val_label})
test_df = pd.DataFrame({'text': test_data})
print(train_df.head(), val_df.head(), test_df.head())
print(train_df.shape, val_df.shape, test_df.shape)

                                                text      label
0  rifle; Ivan's was a double-barrelled shot-gun ...  Viktorian
1  upon the track of the bear. After following it...  Viktorian
2  to pull him out with their hands--even had the...  Viktorian
3  a slight sparkle of scientific conceit, "this ...  Viktorian
4  bears with a white ring round their necks? Yes...  Viktorian                                                 text      label
0  kind good morning, and returned her hearty emb...  Viktorian
1  sky, and of the moon, which clothed the old pi...  Viktorian
2  left Rome for Augsburg, my mind being much exc...  Viktorian
3  thoughts some of the old melodies he knew by h...  Viktorian
4  "But," said Henry, "is it not possible that th...  Viktorian                                                 text
0  "Alas, poor girl!" said I, "I fear that her ha...
1  to divide her attention between the said garco...
2  visitor's disposition to gallantry. However, s...
3  says Juvenal, "'M

In [34]:
label_dic = {'Romanticism':0,
            'Viktorian':1,
            'Modernism':2,
            'PostModernism':3,
            'OurDays':4}
train_df['label'] = train_df['label'].map(label_dic)
val_df['label'] = val_df['label'].map(label_dic)
# test['label'] = test['label'].map(label_dic)

In [35]:
# # Length of text
# def length (txt):
#     length = len(txt.split())
#     return length

# txt_length = train_df['text'].apply(lambda x: length(x))
# print(txt_length.sort_values(ascending = False))

In [36]:
val_df['label'].value_counts()

label
1    16938
2    14848
3     1713
4     1600
0     1158
Name: count, dtype: int64

In [37]:
# model = LongformerModel.from_pretrained("allenai/longformer-base-4096")
# tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

In [38]:
max_length= 500
class CustomDataset(Dataset):
    def __init__(self, tokenizer, df):
        # Initialize thetokenizer
        self.tokenizer = tokenizer

        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        # Get the text and label from the dataframe
        text = self.df.iloc[index]['text']
        label = self.df.iloc[index]['label']

        # Tokenize the text and convert it to input IDs
        inputs = self.tokenizer(
            text,
            None,
            add_special_tokens=False,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
        )


        # Return the input IDs and label as PyTorch tensors
        return {
            'input_ids': inputs['input_ids'][0],
            'attention_mask': inputs['attention_mask'][0],
            # 'token_type_ids': inputs['token_type_ids'][0],
            'label': torch.tensor(label, dtype=torch.int64),
        }

# datasetclass = CustomDataset(tokenizer, train)
train_dataset = CustomDataset(tokenizer, train_df)
val_dataset = CustomDataset(tokenizer, val_df)
# test_dataset = CustomDataset(tokenizer, test)

# DataLoader
batch_size = 256
train_dataloader = tqdm(DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=64))
val_dataloader = tqdm(DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=64))
# test_dataloader = tqdm(DataLoader(test_dataset, batch_size=batch_size, shuffle=False))

  0%|          | 0/2134 [00:00<?, ?it/s]
  0%|          | 0/142 [17:32:46<?, ?it/s]


In [39]:
class TransformerModel(nn.Module):
    def __init__(self, num_labels):
        super(TransformerModel, self).__init__()
        
        self.roberta = RobertaModel.from_pretrained("roberta-base")
        # self.Longformer = LongformerModel.from_pretrained("allenai/longformer-base-4096")
        # Freeze all layers except the top 1
        for param in self.roberta.parameters():
            param.requires_grad = False

        # Unfreeze the parameters of the top 1 layers
        for param in self.roberta.encoder.layer[-1:].parameters():
            param.requires_grad = True
        
        # self.xlnet.resize_token_embeddings(num_tokens)
        # self.transformer_encoder = TransformerEncoder(TransformerEncoderLayer(d_model=hidden_size, nhead=num_heads), num_layers=num_layers)
        #self.transformer_decoder = TransformerDecoder(TransformerDecoderLayer(d_model=hidden_size, nhead=num_heads), num_layers=num_layers)
        #self.transformer = Transformer(nhead=16, num_encoder_layers=6, num_decoder_layers = 6)
        self.decoder = nn.Linear(self.roberta.config.hidden_size, num_labels) 
        # self.fc1 = nn.Linear(num_tokens, 2)
        # self.fc2 = nn.Linear(num_tokens, 2)
        # self.fc3 = nn.Linear(num_tokens, 5)
        # self.num_classes = num_classes
        # self.classifiers = nn.ModuleList([nn.Linear(self.roberta.config.hidden_size, num_classes[i]) for i in range(len(num_classes))])
        # self.classifiers = nn.ModuleList([nn.Linear(num_tokens, num_classes[i]) for i in range(len(num_classes))])
        # self.tanh = nn.Tanh()

    def forward(self, input_ids, attention_mask):  # src = [bsz, seq_len]
        long_output = self.roberta(input_ids=input_ids).pooler_output
        # print(long_output.shape)
        # roberta_outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # last_hidden_state = outputs.last_hidden_state # Shape: (batch_size, sequence_len, hidden_size)
        # src_embedded = last_hidden_state
        # src_embedded = self.roberta.embeddings(src) # Use RoBERTa model to embed source sequence output: [bsz, seq_len, features,i.e. hidden_dim] [20, 100, 768]
        # print("shape of roberta embeddings:", src_embedded.shape)
        #tgt_embedded = self.roberta.embeddings(tgt) # Use RoBERTa model to embed target sequence
        # src_embedded = src_embedded # output: [bsz, seq_len, features] 
        # src_embedded = torch.cat([t1,t2,t3, src_embedded],1)

        # t1 = torch.cat(src_embedded.size(0) * [t1])
        # t2 = torch.cat(src_embedded.size(0) * [t2])
        # t3 = torch.cat(src_embedded.size(0) * [t3])
        # t = torch.stack([t1,t2,t3], dim=1)
        # task_embedded = torch.cat([t, src_embedded],1)  # output shape: [bsz, seq_len, features] [8, 203, 768]

        # memory = self.transformer_encoder(src_embedded)  # output shape: [bsz, seq_len, features] [8, 203, 768]
        # print("shape after transformer encoder layer:", memory.shape)
        #output = self.transformer_decoder(tgt_embedded, memory)
        #print("shape after transformer decoder layer:", output.shape)

        output = self.decoder(long_output)  # output shape: [bsz, seq_len, vocab_size] [8, 203, 50k]
        # print("shape after transformer decoder layer:", output.shape, output.dtype)
        # task1_output = self.fc1(output[:,0,:])
        # task2_output = self.fc2(output[:,1,:])
        # task3_output = self.fc3(output[:,2,:num_classes])
        # ae_output = output[:,len(self.num_classes):,:]
        # ae_output = output[:,:,:]
        # print("shape after final linear layer:", output.shape)
        # task_logits = [classifier(pooled_output) for classifier in self.classifiers]
        # task_logits = []

        # pooled_outputs = [output[:,i,:] for i in range(len(self.num_classes))] # output shape : [bsz, 1, vocab_size]

        # for classifier, pooled_output in zip(self.classifiers, pooled_outputs):
        #     # pooled_output = self.tanh(pooled_output)
        #     logits = classifier(pooled_output)
        #     task_logits.append(logits)
        
        return output

device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')


In [40]:
num_labels = 5

model = TransformerModel(num_labels).to(device)


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
# num_epochs = 5
learning_rate = 2e-4
class_weights = torch.tensor([12.0, 1.0, 1.0, 8.0, 11.0]).to(device)

# Set optimizer and learning rate scheduler
criterion = nn.CrossEntropyLoss(weight=class_weights).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

In [42]:
def get_labels(logit, targets):
    """
    Calculate accuracy and macro F1 score for each class
    """
    # pos = list(task_dict.keys()).index(task_name)
    # mask = torch.arange(targets.shape[0]).to(device)
    # task_idx = mask[targets[:,pos] != 99]
    output = logit
    true_label = targets
    # print("shapes for label:", output.shape, true_label.shape)
    pred_label = torch.argmax(output, dim=1).flatten().tolist()
    true_label = true_label.flatten().tolist()


    return pred_label, true_label

In [43]:
def select_loss_contributing_samples(model, train_dataloader, num_samples):
    model.eval()
    samples_losses = []
    with torch.no_grad():
        for batch, i in enumerate(train_dataloader):
                data, mask, targets = i.values()
                data = data.to(device)
                mask = mask.to(device)
                targets = targets.to(device)
                # print(data.dtype)        
                # print(data.shape)
                # task_logits, ae_output = model(data)
                output = model(data, mask)
                # t1_out, t2_out, t3_out, auto_output = model(data, t1, t2, t3)
                # loss = custom_loss(logits_task1, logits_task2, logits_task3, targets)
                # print("shape:", data.shape, targets.flatten().shape)
                # print("outputtype:", output.dtype, targets.flatten().dtype)
                # print(output)
                # targets = targets.float()
                loss = criterion(output, targets.flatten())
                # print(loss)
                samples_losses.append(loss.cpu().tolist())

    # Get indices of samples with highest losses
    sorted_indices = np.argsort(samples_losses)[::-1]  # Sort in descending order
    selected_indices = sorted_indices[:num_samples]

    return selected_indices

In [46]:
train_df.iloc[[1,2]]

Unnamed: 0,text,label
1,upon the track of the bear. After following it...,1
2,to pull him out with their hands--even had the...,1


In [47]:
def active_learning(epoch, model, dataloader, batch_size):
    # Perform active learning to select a subset of samples from the unlabeled pool set

    # Select subset of samples to label based on active learning strategy
    selected_indices = select_loss_contributing_samples(model, dataloader, num_samples=20000 - epoch*1500)
    selected_samples_df = train_df.iloc[selected_indices].copy()

    # Create a new training dataset and dataloader
    train_dataset = CustomDataset(tokenizer, selected_samples_df)

    # Create a new DataLoader for training on the labeled data
    train_dataloader = tqdm(DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=64))
    
    return train_dataloader


In [52]:
current_train_loss = []

def train(model: nn.Module) -> None:
    model.train()  # turn on train mode
    total_loss = 0.
    log_interval = 1
    start_time = time.time()
    num_batches = len(train_dataset) // batch_size
    for batch, i in enumerate(train_dataloader):
        data, mask, targets = i.values()
        data = data.to(device)
        mask = mask.to(device)
        targets = targets.to(device)
        # print(data.dtype)        
        # print(data.shape)
        # task_logits, ae_output = model(data)
        output = model(data, mask)
        # t1_out, t2_out, t3_out, auto_output = model(data, t1, t2, t3)
        # loss = custom_loss(logits_task1, logits_task2, logits_task3, targets)
        # print("shape:", data.shape, targets.flatten().shape)
        # print("outputtype:", output.dtype, targets.flatten().dtype)
        # print(output)
        # targets = targets.float()
        loss = criterion(output, targets.flatten())


        optimizer.zero_grad()
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        if batch % log_interval == 0 and batch > 0:
            lr = scheduler.get_last_lr()[0]
            ms_per_batch = (time.time() - start_time) * 1000 / log_interval
            # cur_loss = total_loss / log_interval
            # ppl = np.exp(cur_loss)
            # print(f'| epoch {epoch:3d} | {batch:5d}/{num_batches:5d} batches | '
            #         f'lr {lr:02.7f} | ms/batch {ms_per_batch:5.2f} | '
            #         f'loss {cur_loss:5.5f}')
            total_loss = 0
            start_time = time.time()
        
        # if batch == 100:
        #     break
    # current_train_loss.append(cur_loss)


In [53]:
def evaluate(model: nn.Module) -> float:
    model.eval()  # turn on evaluation mode
    total_loss = 0.
    # src_mask = generate_square_subsequent_mask(bptt).to(device)
    with torch.no_grad():
        predictions = []
        true_labels = []
        for batch, i in enumerate(val_dataloader):
            data, mask, targets = i.values()
            data = data.to(device)
            mask = mask.to(device)
            targets = targets.to(device)
            seq_len = data.size(1)
            # logits_task1, logits_task2, logits_task3, ae_output = model(data, mask)
            # task_logits, ae_output = model(data)
            # task_logits, ae_output = model(data, mask)
            output = model(data, mask)
            # t1_out, t2_out, t3_out, auto_output = model(data, t1, t2, t3)
            # loss = custom_loss(logits_task1, logits_task2, logits_task3, targets)
            # loss = custom_loss(logits_task1, logits_task2, logits_task3, ae_output, data, targets)
            loss = criterion(output, targets.flatten())

            total_loss += seq_len * loss.item()

            #get the labels for classification report
            pred_label, true_label = get_labels(output, targets)
            predictions.extend(pred_label)
            true_labels.extend(true_label)
            # if batch == 100:
            #     break

    # Compute overall classification report
    logging.info(f"\n Scores:")
    acc = accuracy_score(true_labels, predictions)
    logging.info(f"\n Accuracy:{acc}")
    logging.info(f"\n {classification_report(true_labels, predictions)}")
    return total_loss / (len(val_dataset) - 1), acc


In [54]:
logging.info(f"#"* 89)
logging.info(f"\n DESCRIPTION-> \n logic: roberta(finetune last layer) + linear_layer + loss_reweighting (epochs=50), model: {tokenizer.name_or_path}, lr:{learning_rate}, max_seq_length: {max_length}")
logging.info('#' * 89)

2023-07-16 17:07:19,293 - INFO - #########################################################################################
2023-07-16 17:07:19,295 - INFO - 
 DESCRIPTION-> 
 logic: roberta(finetune last layer) + linear_layer + loss_reweighting (epochs=50), model: roberta-base, lr:0.0002, max_seq_length: 500
2023-07-16 17:07:19,296 - INFO - #########################################################################################


In [55]:
best_val_acc = 0
current_val_loss = []   # for plotting graph of val_loss
epochs = 50
early_stop_thresh = 3

tempdir = '/data1/debajyoti/colie/.temp/'
best_model_params_path = os.path.join(tempdir, f"best_model_params_{time.asctime().replace(' ','_')}.pt")

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()

    # Generate a new DataLoader for the current epoch
    train_dataloader = active_learning(epoch, model, train_dataloader, batch_size)
    train(model)
    val_loss, accuracy = evaluate(model)
    current_val_loss.append(val_loss)
    # val_ppl = np.exp(val_loss)
    elapsed = time.time() - epoch_start_time
    logging.info('-' * 89)
    logging.info(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
        f'valid loss {val_loss:5.5f}')
    logging.info('-' * 89)

    if accuracy > best_val_acc:
        best_val_acc = accuracy
        best_epoch = epoch
        torch.save(model.state_dict(), best_model_params_path)
    elif epoch - best_epoch > early_stop_thresh:
        logging.info("Early stopped training at epoch %d" % epoch)
        break  # terminate the training loop

    scheduler.step()
model.load_state_dict(torch.load(best_model_params_path)) # load best model states


100%|██████████| 1/1 [00:12<00:00, 12.10s/it]
2023-07-16 17:12:29,024 - INFO - 
 Scores:
2023-07-16 17:12:29,056 - INFO - 
 Accuracy:0.4671649612488623
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
2023-07-16 17:12:29,099 - INFO - 
               precision    recall  f1-score   support

           0       0.00      0.00      0.00      1158
           1       0.47      1.00      0.64     16938
           2       0.00      0.00      0.00     14848
           3       0.00      0.00      0.00      1713
           4       0.00      0.00      0.00      1600

    accuracy                           0.47     36257
   macro avg       0.09      0.20      0.13     36257
weighted avg       0.22      0.47      0.30     36257

2023-07-16 17:12:29,101 - INFO - -----------------------------------------------------------------------------------------
2023-07-16 17:12:29,102 - INFO - | e

<All keys matched successfully>

In [56]:
test_df

Unnamed: 0,text
0,"""Alas, poor girl!"" said I, ""I fear that her ha..."
1,to divide her attention between the said garco...
2,"visitor's disposition to gallantry. However, s..."
3,"says Juvenal, ""'Mors sola fatetur Quantula sin..."
4,him out in that back passage; the outer door i...
...,...
143025,"be hard for anyone to do anything dignified, w..."
143026,"Wilson, and could not bring himself to think t..."
143027,"give them a chance, the same as everybody else..."
143028,political convention in which he declared that...


In [57]:
class CustomDataset_test(Dataset):
    def __init__(self, tokenizer, df):
        # Initialize the tokenizer
        self.tokenizer = tokenizer

        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        # Get the text and label from the dataframe
        text = self.df.iloc[index]['text']
        # label = self.df.iloc[index]['label']

        # Tokenize the text and convert it to input IDs
        inputs = self.tokenizer(
            text,
            None,
            add_special_tokens=False,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
        )


        # Return the input IDs and label as PyTorch tensors
        return {
            'input_ids': inputs['input_ids'][0],
            'attention_mask': inputs['attention_mask'][0],
            # 'token_type_ids': inputs['token_type_ids'][0],
            # 'label': torch.tensor(label, dtype=torch.int64),
        }

test_dataset = CustomDataset_test(tokenizer, test_df)

# DataLoader
test_dataloader = tqdm(DataLoader(test_dataset, batch_size=batch_size, shuffle=False))

  0%|          | 0/559 [00:00<?, ?it/s]

In [58]:
predictions = []

# Evaluate model on test data
model.eval()
with torch.no_grad():
    for batch_idx, batch in enumerate(test_dataloader):
        data, mask = batch.values()
        data = data.to(device)
        mask = mask.to(device)
        # targets = targets.to(device)
        seq_len = data.size(1)
        # logits_task1, logits_task2, logits_task3, ae_output = model(data, mask)
        # task_logits, ae_output = model(data)
        # task_logits, ae_output = model(data, mask)
        output = model(data, mask)
        #get the labels for classification report
        pred_label = torch.argmax(output, dim=1).flatten().tolist()
        predictions.extend(pred_label)
        
        # if batch == 400:
        #     break

# # Compute overall classification report
# logging.info(f"\n Scores:")
# logging.info(f"\n {classification_report(true_labels, predictions)}")
# return total_loss / (len(val_dataset) - 1)

100%|██████████| 559/559 [28:23<00:00,  3.05s/it]


In [59]:
test_labels["Epoch"] = predictions
# test_labels.to_csv('file_name.csv', index=False)

In [60]:
label_dic = {0:'Romanticism',
            1:'Viktorian',
            2:'Modernism',
            3:'PostModernism',
            4:'OurDays'}
test_labels['Epoch'] = test_labels['Epoch'].map(label_dic)
test_labels

Unnamed: 0,BOOK_id,Epoch
0,7616_1.txt,Viktorian
1,7616_2.txt,Viktorian
2,7616_3.txt,Viktorian
3,7616_4.txt,Viktorian
4,7616_5.txt,Viktorian
...,...,...
143025,5677_92.txt,Viktorian
143026,5677_93.txt,Viktorian
143027,5677_94.txt,Viktorian
143028,5677_95.txt,Viktorian


In [61]:
test_labels.to_csv('/data1/debajyoti/colie/submission/submission_4.csv', index=False)

In [None]:
# num_samples=100

# best_model_params_path = "/data1/debajyoti/colie/.temp/best_model_params_Thu_Jul_13_19:57:09_2023.pt"
# model.load_state_dict(torch.load(best_model_params_path)) # load best model states
# model.eval()
# predictions = []
# with torch.no_grad():
#     for batch_idx, batch in enumerate(val_dataloader):
#         data, mask, targets = batch.values()
#         data = data.to(device)
#         mask = mask.to(device)
#         targets = targets.to(device)
#         seq_len = data.size(1)
#         # logits_task1, logits_task2, logits_task3, ae_output = model(data, mask)
#         # task_logits, ae_output = model(data)
#         # task_logits, ae_output = model(data, mask)
#         output = model(data, mask)
#         #get the labels for classification report
#         # pred_label = torch.argmax(output, dim=1).flatten().tolist()
#         # predictions.extend(pred_label)
#         probabilities = torch.softmax(output, dim=1)
#         predictions.extend(probabilities.tolist())

#     # # Calculate uncertainty scores based on predictions
#     # uncertainty_scores = np.max(predictions, axis=1)

#     # # Get indices of samples with lowest uncertainty scores
#     # sorted_indices = np.argsort(uncertainty_scores)
#     # selected_indices = sorted_indices[:num_samples]
#     # print(selected_indices)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [None]:
# uncertainty_scores = np.max(predictions, axis=1)

# # Get indices of samples with lowest uncertainty scores
# sorted_indices = np.argsort(uncertainty_scores)
# sorted_indices

array([34547, 21129,  9801, ..., 20397, 20375, 20439])

In [None]:
# np.sort(uncertainty_scores)

array([0.23686005, 0.24098375, 0.24175839, ..., 0.99999774, 0.99999917,
       0.99999952])