In [190]:
# Blind test file; read by read_file_to_dict_test (blank-line-separated entries,
# first tab-separated field of each line is kept).
test_file='/content/drive/MyDrive/Disfluency Task/Training_Data_Split_50/marathi/marathi_test_blind_50.tsv'
# Destination of the tagged "word<TAB>label" output written in the last cell.
file_save_path='/content/sample_data/marathi_output.tsv'
# Checkpoint passed to from_pretrained for both the model and the tokenizer;
# also used to build the Trainer output directory name.
model_checkpoint = "/content/drive/MyDrive/Disfluency Task/Model/Marathi-Bert"
# Per-device train and eval batch size given to TrainingArguments.
batch_size = 16
# NOTE(review): not referenced anywhere else in the visible notebook — the
# tokenizer call does not pass a max length. Confirm whether this is intended.
max_sequence_length = 256
# Task name; only used as part of the Trainer output directory name.
task = "disfluency"
# Passed to TrainingArguments as num_train_epochs.
epochs = 5

## Installation

In [None]:
!pip install datasets
!pip install transformers torch
!pip install sentencepiece
!pip install seqeval
!pip install transformers torch
!pip install sentencepiece
!pip install accelerate -U
!pip install transformers[torch]
!pip install transformers[sentencepiece]
!pip install scikit-learn

## Labels

In [191]:
# Tag inventory: 'O' followed by the B-* tags and then the I-* tags.
# The position of a tag in this list is the class index used when predicted
# indices are decoded back into tag strings.
label_list = [
    'O',
    'B-Alteration', 'B-edit_R', 'B-false_R', 'B-filler_R',
    'B-pet_R', 'B-repair_R', 'B-repeat_R',
    'I-Alteration', 'I-edit_R', 'I-false_R', 'I-filler_R',
    'I-pet_R', 'I-repair_R', 'I-repeat_R',
]

# Index -> tag and tag -> index lookups derived from the list order.
id2label = dict(enumerate(label_list))
label2id = {tag: position for position, tag in id2label.items()}

## Load Data

In [192]:
from datasets import Dataset

def read_file_to_dict_test(file_path):
    """Read a UTF-8 file of blank-line-separated entries into a column dict.

    Every line is stripped of surrounding whitespace. A non-empty line
    contributes its first tab-separated field to the current entry; an empty
    line closes the current entry (consecutive empty lines are ignored).

    Args:
        file_path: Path of the file to read.

    Returns:
        A dict with the single key 'word' mapping to a list of entries, each
        entry being the list of first-column strings of its lines.
    """
    sentences = []
    pending = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if stripped:
                pending.append(stripped.split('\t')[0])
            elif pending:
                # Blank line: flush the entry collected so far.
                sentences.append(pending)
                pending = []
    # The file may end without a trailing blank line.
    if pending:
        sentences.append(pending)
    return {'word': sentences}


In [193]:
# Read the blind test file and wrap the resulting {'word': [...]} dict in a Dataset.
test_dataset = Dataset.from_dict(read_file_to_dict_test(test_file))

In [194]:
# Rename the 'word' column to 'tokens', the key that test_tokenize reads.
test_dataset = test_dataset.rename_column("word", "tokens")

In [195]:
test_dataset

Dataset({
    features: ['tokens'],
    num_rows: 191
})

## Model Loading, Tokenization and Inference

In [196]:
from transformers import pipeline, AutoModelForTokenClassification, TrainingArguments, AutoTokenizer

# Load the token-classification model and its tokenizer from the same checkpoint.
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [197]:
import os

# This notebook only calls trainer.predict(), so no evaluation schedule is
# configured. The original `evaluation_strategy="epoch"` was never backed by an
# eval dataset; newer transformers releases reject that combination when the
# Trainer is built, and have renamed the argument to `eval_strategy`.
#
# model_checkpoint is a filesystem path, so only its last component is used in
# the output directory name; interpolating the whole path produced a nested
# ".../disfluency-/content/drive/..." directory.
checkpoint_name = os.path.basename(os.path.normpath(model_checkpoint))

args = TrainingArguments(
    output_dir=f"/content/sample_data/{task}-{checkpoint_name}",
    # The training hyper-parameters below are kept from the original cell.
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.01,
)

In [198]:
from transformers import DataCollatorForTokenClassification, Trainer

# Collator built from the tokenizer; passed to the Trainer as its data_collator.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [199]:
# Trainer wrapping the loaded model. No train or eval dataset is attached; in
# the visible notebook it is only used for trainer.predict().
trainer = Trainer(
    model,
    args,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [200]:
def test_tokenize(examples):
  """Tokenize a batch of pre-split sentences.

  Reads the word lists from examples["tokens"] and passes them to the
  notebook-level tokenizer with is_split_into_words=True.

  NOTE(review): no truncation or max-length argument is passed, and
  max_sequence_length from the config cell is not used here — confirm that
  every test sentence fits the model's maximum input length.
  """
  word_lists = examples["tokens"]
  encoded = tokenizer(word_lists, is_split_into_words=True)
  return encoded

# Batched map: test_tokenize receives several rows per call.
tokenized_test = test_dataset.map(test_tokenize, batched=True)

Map:   0%|          | 0/191 [00:00<?, ? examples/s]

In [201]:
tokenized_test

Dataset({
    features: ['tokens', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 191
})

In [202]:
# trainer.predict returns a 3-tuple; the first element is used downstream as `predictions`.
# NOTE(review): the test set has no label column, so `labels` is presumably None — confirm.
predictions, labels, _ = trainer.predict(tokenized_test)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
predictions

## Prediction

In [204]:
import sklearn
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, confusion_matrix
import numpy as np
import pandas as pd

In [205]:
# Index of the highest-scoring class along axis 2.
# assumes predictions is (num_sentences, padded_seq_len, num_labels) — TODO confirm
ps = np.argmax(predictions, axis=2)

In [206]:
# Decode every predicted class index into its tag string via label_list,
# keeping one inner list per row of `ps`.
true_predictions = []
for sentence_label_ids in ps:
    true_predictions.append([label_list[label_id] for label_id in sentence_label_ids])

In [207]:
# Sub-token strings for every tokenized test sentence, in dataset order.
tokensArr = [
  tokenizer.convert_ids_to_tokens(sentence_ids)
  for sentence_ids in tokenized_test['input_ids']
]

In [212]:
def combine_tokens(predictions):
    """Merge "##"-prefixed continuation pieces back into whole words.

    Args:
        predictions: Iterable of (token, label) pairs.

    Returns:
        A list of "word<TAB>label" strings. A token starting with "##" is
        appended (without the prefix) to the word being built; any other token
        starts a new word. Each word carries the label of the token that
        started it.
    """
    merged = []
    word, tag = "", ""

    for piece, piece_tag in predictions:
        if piece.startswith("##"):
            # Continuation piece: extend the word, keep the label of its first piece.
            word += piece[2:]
            continue
        # A new word begins; emit the one collected so far, if any.
        if word:
            merged.append(f"{word}\t{tag}")
        word, tag = piece, piece_tag

    # Flush the trailing word.
    if word:
        merged.append(f"{word}\t{tag}")

    return merged


In [213]:
def remove_cls_sep(data):
    """Return the lines of `data` that contain neither "[CLS]" nor "[SEP]".

    The check is a substring test on the whole line, and the input order is
    preserved.
    """
    return [
        entry
        for entry in data
        if not ("[CLS]" in entry or "[SEP]" in entry)
    ]

In [214]:
# Tokens with tags
arr = []
for i in range(len(tokensArr)):
  for (t, tp) in zip(tokensArr[i], true_predictions[i]):
    arr.append((t, tp))

with open(file_save_path, 'w') as f:
  for line in remove_cls_sep(combine_tokens(arr)):
    f.write(line + '\n')
