In [1]:
from datasets import load_dataset

dataset = load_dataset("wikiann", "en",cache_dir="./cache_2")

In [24]:
import os
import argparse
import json
from tqdm import tqdm


import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# set TOKENIZERS_PARALLELISM so that it doesn't annoy us
os.environ["TOKENIZERS_PARALLELISM"] = "false"
def parse_arguments():
    parser = argparse.ArgumentParser(description='Train a model on the SNLI dataset')
    parser.add_argument('--model', type=str, default='bert-base-multilingual-cased', help='The model to use')
    parser.add_argument('--batch_size', type=int, default=32, help='The batch size')
    parser.add_argument('--epochs', type=int, default=3, help='The number of epochs to train')
    parser.add_argument('--lr', type=float, default=1e-4, help='The learning rate')
    parser.add_argument('--seed', type=int, default=42, help='The random seed')
    parser.add_argument('--warmup_steps', type=int, default=50, help='The number of warmup steps')
    parser.add_argument('--gradient_clipping', type=float, default=10.0, help='The gradient clipping value')
    return parser.parse_args([])

args = parse_arguments()

# set random seed
torch.manual_seed(args.seed)

# set device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    args.model,
    cache_dir="./cache"
)

cpu


In [25]:
from smart_open import open
import os
import pandas as pd
import numpy as np
from operator import itemgetter
import nltk 

nltk.download('punkt') 

from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

def find_substring_index(substring_list, string_list):
    indexes = []
    for substring in substring_list:
        try:
            index = next(i for i, string in enumerate(string_list) if substring in string)
            indexes.append(index)
        except StopIteration:
            indexes.append(-1)
    return indexes

def import_func(sub_string_list,current_dir):
    dir_path = current_dir
    
    # Construct the base path for the data directory
    dir_path = os.path.join(dir_path, "data")
    
    files = os.listdir(dir_path)

    list_paths = [os.path.join(dir_path, files[i]) for i in range(len(files))]
    match_on = find_substring_index(sub_string_list, list_paths)
    print(match_on)

    path_get = itemgetter(*match_on)
    get_paths = path_get(list_paths)
    return get_paths

def read_file_to_df(path_in_list):
    print(path_in_list)
    df_list = []
    for path_in in path_in_list: 
        with open(path_in, "rb") as f:
            df = pd.read_csv(f, sep="\t", header=0)
            df_list.append(df)
    return df_list
current_directory = os.getcwd()
paths_to_print=import_func(["train-en.tsv.gz","train-it.tsv.gz","train-gu.tsv.gz"],current_directory)
first_doc=read_file_to_df(paths_to_print)[0]

train_text_col = first_doc.iloc[:, 0].to_string(index=False)
train_text=' '.join(train_text_col.split())

sent_tokenizing=sent_tokenize(train_text)

train_lab_col = first_doc.iloc[:, 1].to_string(index=False)
train_labels=' '.join(train_lab_col.split())

train_text_list=train_text_col.split()
train_label_list=train_lab_col.split()




[nltk_data] Downloading package punkt to /fp/homes01/u01/ec-
[nltk_data]     eirikeg/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[78, 63, 70]
('/fp/homes01/u01/ec-eirikeg/mandatory_2/data/train-en.tsv.gz', '/fp/homes01/u01/ec-eirikeg/mandatory_2/data/train-it.tsv.gz', '/fp/homes01/u01/ec-eirikeg/mandatory_2/data/train-gu.tsv.gz')


In [26]:
print(train_text_list[1:30])
print(train_label_list[1:30])

['(', 'St.', 'Lawrence', 'River', ')', '(', '968', 'MW', ')', ';', "'", "''", 'Anders', 'Lindström', "''", "'", 'Karl', 'Ove', 'Knausgård', '(', 'born', '1968', ')', 'Atlantic', 'City', ',', 'New', 'Jersey', 'Her']
['O', 'B-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'O']


In [27]:
def tokenize_text_and_labels(text_list, labels_list):
    """
    Tokenize the input text into sentences and preserve the one-to-one correspondence
    between tokens and labels.

    Args:
    - text (str): The input text to tokenize.
    - labels (str): The corresponding labels for each word in the text.

    Returns:
    - tokenized_texts (list): The tokenized sentences.
    - tokenized_labels (list): The corresponding labels for each token.
    """

    #joined_text=" ".join(text)
    # Tokenize sentences
    #sentences = nltk.sent_tokenize(joined_text)

    # Initialize lists to store tokenized text and labels
    tokenize_nested_text=[]
    tokenize_nested_label=[]
    tokenized_texts = []
    tokenized_labels = [] 
    # Keep track of the current index in the tokenized text
    current_index = 0
    len_text_list=len(text_list)
    count_for=0
    # Iterate through sentences
    for i,word in enumerate(text_list):
        count_for+=1
        
            
        
        if current_index<=10:
            
            tokenized_texts.append(word)
            
            if ("," in word or "." in word) and len(word)==1:
               
            # Append the corresponding label to the tokenized labels
               tokenized_labels.append("O")
               current_index=current_index 
            else:
                tokenized_labels.append(labels_list[i])
                current_index+=1
            
        if current_index>10:
            current_index=0
            tokenized_texts = []
            tokenized_labels = [] 
        
        if current_index==10:
            tokenize_nested_text.append(tokenized_texts)
            tokenize_nested_label.append(tokenized_labels)
        
        if count_for==len_text_list:
           break
       
        
    return tokenize_nested_text, tokenize_nested_label


train_text_list=train_text_col.split()
train_label_list=train_lab_col.split()

text_sent,label_sent=tokenize_text_and_labels(train_text_list, train_label_list)



In [28]:
from sklearn.preprocessing import LabelEncoder
import itertools
all_labels = list(itertools.chain.from_iterable(label_sent))
label_encoder = LabelEncoder()
label_encoder.fit(all_labels)
# Apply LabelEncoder to each sublist
numerical_NER = [list(label_encoder.transform(sublist_list)) for sublist_list in label_sent]


In [29]:
from itertools import chain

numeric_flatt_unique=list(np.unique(np.array(list(chain.from_iterable(numerical_NER)))))
all_labels_flatt_unique=list(np.unique(np.array(all_labels)))
print(all_labels_flatt_unique)
print(numeric_flatt_unique)


['B-LOC', 'B-ORG', 'B-PER', 'I-LOC', 'I-ORG', 'I-PER', 'O']
[0, 1, 2, 3, 4, 5, 6]


In [30]:
list(np.unique(np.array(all_labels)))

['B-LOC', 'B-ORG', 'B-PER', 'I-LOC', 'I-ORG', 'I-PER', 'O']

In [35]:
def map_NE_to_GNE(all_labels_flatt_unique,unique_NER):
    unique_NER=[string[2:] for string in all_labels_flatt_unique if len(string)>2]
    
    unique_NER=set(unique_NER)
    unique_NER=list(unique_NER)
 
    index_NER_genral=[[numeric_flatt_unique[j]   
       for j, all_labels_per in enumerate(all_labels_flatt_unique) if unique_ner in all_labels_per] 
       for i,unique_ner in  enumerate(unique_NER) 
      ]
    index_NER_genral.append([numeric_flatt_unique[-1]])
    
    unique_NER=unique_NER+["O"]
    
    dict_map_NER={unique_ner:index_ner_genral for unique_ner,index_ner_genral in zip(unique_NER,index_NER_genral)}
    return dict_map_NER

map=map_NE_to_GNE(all_labels_flatt_unique,numeric_flatt_unique)

#create intervalls for target and prediction by using this to find indexes matching equal 
#label bought for target and predict vector to evaluate accuracy and F1.Reduce the len of evaluation vector
#classify binary target = np.ones and prdict 010101 depending on accuracy



In [32]:
# map=list(map.values())

In [33]:
from datasets import Dataset
from transformers import AutoTokenizer
import pandas as pd
# Example nested lists of tokenized texts and their corresponding labels


# Create a list of dictionaries where each dictionary represents an example


data = {"tokens": text_sent, "ner_tags": numerical_NER,}
    

# Create a dataset from the list of dictionaries
#df = pd.DataFrame(data)

# Create a dataset from the Pandas DataFrame
data = Dataset.from_dict(data)

# Instantiate tokenizer
#tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased",dir_cache="./cache_2")

# Function to tokenize and adjust labels
def tokenize_adjust_labels(all_samples_per_split):
  tokenized_samples = tokenizer.batch_encode_plus(all_samples_per_split["tokens"], is_split_into_words=True)
  #tokenized_samples is not a datasets object so this alone won't work with Trainer API, hence map is used 
  #so the new keys [input_ids, labels (after adjustment)]
  #can be added to the datasets dict for each train test validation split
  total_adjusted_labels = []
  print(len(tokenized_samples["input_ids"]))
  for k in range(0, len(tokenized_samples["input_ids"])):
    prev_wid = -1
    word_ids_list = tokenized_samples.word_ids(batch_index=k)
    existing_label_ids = all_samples_per_split["ner_tags"][k]
    #print(existing_label_ids)
      
    i = -1
    adjusted_label_ids = []
   
    for wid in word_ids_list:
      if(wid is None):
        adjusted_label_ids.append(-100)
      elif(wid!=prev_wid):
        i = i + 1
        adjusted_label_ids.append(existing_label_ids[i])
        prev_wid = wid
      else:
        label_name = all_labels[existing_label_ids[i]]
        adjusted_label_ids.append(existing_label_ids[i])
        
    total_adjusted_labels.append(adjusted_label_ids)
  tokenized_samples["labels"] = total_adjusted_labels
  return tokenized_samples


# Tokenize and adjust labels for the entire dataset
tokenized_data = data.map(tokenize_adjust_labels, batched=True)


Map:   0%|          | 0/14518 [00:00<?, ? examples/s]

1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
518


In [36]:
#tokenized_data[2]['labels']
#dataa=list(tokenized_data[2]['labels'])
#indices_to_remove=[0,len(tokenized_data[2]['labels'])-1]
from itertools import groupby
def convert_id_to_NE(all_samples):
    
    input_labels,input_tokens=all_samples['labels'],all_samples["tokens"]
    
    token_span=[]
    NER_span=[]
    for i in range(0, len(all_samples['labels'])):
        indices_to_remove=[0,len(input_labels)-1]
        new_list = [value for i, value in enumerate(input_labels[i]) if i not in indices_to_remove]
        
        result = [[key for key, values in map.items() if isinstance(values, list) and value in values ] 
                  for value in new_list]
        
        flatt_result=list(chain.from_iterable(result))
        
        grouped_tokens = [list(group) for key, group in groupby(zip(input_tokens[i], flatt_result), lambda x: x[1])]
        
        # Concatenate tokens for each group
        token_sequences = [' '.join([token for token, _ in group]) for group in grouped_tokens]
        new_sequence = [elem for i, elem in enumerate(flatt_result) if i == 0 or elem != flatt_result[i - 1]]
        
        token_span.append(token_sequences )
        NER_span.append(new_sequence)
    all_samples['NER_span']=token_span
    all_samples['token_span']=NER_span
    return all_samples

#convert_id_to_NE(tokenized_data)

tokenized_data_span = tokenized_data.map(convert_id_to_NE, batched=True)


Map:   0%|          | 0/14518 [00:00<?, ? examples/s]

In [None]:
import transformers
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
import accelerate
from accelerate import Accelerator
wandb.init(mode="disabled")
# Define the accelerator
accelerator = Accelerator()
model = AutoModelForSequenceClassification.from_pretrained(
    args.model,
    cache_dir="./cache",
    trust_remote_code=True,
    num_labels=len(all_labels_flatt_unique)
)

for name, param in model.named_parameters():
    if "classifier" not in name:  # Assuming "classifier" is the name of your classification head
        param.requires_grad = False

# Verify the trainable parameters
#for name, param in model.named_parameters():
    #print(name, param.requires_grad)

from transformers import DataCollatorForTokenClassification

training_args = TrainingArguments(
    output_dir="./fine_tune_bert_output",
    evaluation_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=100,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps = 1000,
    run_name = "ep_10_tokenized_11",
    save_strategy='no'
   
)

data_collator = DataCollatorForTokenClassification(tokenizer)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
  
)

trainer.train()



In [None]:
first_row_ner_tags = tokenized_data["ner_tags"][0]
all_equal = all(tag == first_row_ner_tags for tag in tokenized_data["ner_tags"])


In [None]:

len(tokenizer.convert_ids_to_tokens(tokenized_data[30]['input_ids']))
len(tokenized_data[30]['labels'])

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
import numpy as np
from datasets import load_metric
metric = load_metric("seqeval")
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [all_labels_flatt_unique[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [all_labels_flatt_unique[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    flattened_results = {
        "overall_precision": results["overall_precision"],
        "overall_recall": results["overall_recall"],
        "overall_f1": results["overall_f1"],
        "overall_accuracy": results["overall_accuracy"],
    }
    for k in results.keys():
      if(k not in flattened_results.keys()):
        flattened_results[k+"_f1"]=results[k]["f1"]

    return flattened_results


In [None]:

#print(all_labels)
print(f"{tokenized_data[3]['tokens']} \n {tokenized_data[3]['ner_tags']}")
tokenizer.decode(tokenized_data[3]['input_ids'])