In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import AdamW, get_scheduler
from tqdm.notebook import tqdm
import warnings

warnings.filterwarnings("ignore")

# Check if CUDA is available and set the device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [2]:
# Read the CSV file
df = pd.read_csv("normalization_assesment_dataset_10k.csv")
print("Dataset shape: ", df.shape)
df.head()

Dataset shape:  (10000, 2)


Unnamed: 0,raw_comp_writers_text,CLEAN_TEXT
0,Jordan Riley/Adam Argyle/Martin Brammer,Jordan Riley/Adam Argyle/Martin Brammer
1,Martin Hygård,Martin Hygård
2,Jesse Robinson/Greg Phillips/Kishaun Bailey/Ka...,Jesse Robinson/Greg Phillips/Kishaun Bailey/Ka...
3,Mendel Brikman,
4,Alvin Lee,Alvin Lee


In [3]:
df["CLEAN_TEXT"] = df["CLEAN_TEXT"].fillna("")

In [4]:
# def handle_na(df):
#     # Calculate null values for each column
#     null_counts = df.isnull().sum()
#     # Calculate percentage of null values
#     null_percentages = (null_counts / len(df)) * 100
#     print("null_percentages", null_percentages)
#     if all(null_percentages[column] for column in df.columns) and df.shape[0] >= 10000:
#         new_df = df.dropna()
#         print(new_df.shape)
#         return new_df
#     return df


# df = handle_na(df)
# df.head()

In [5]:
from sklearn.model_selection import train_test_split

# First split: separate test set
X_temp, X_test, Y_temp, Y_test = train_test_split(df["raw_comp_writers_text"].values, df["CLEAN_TEXT"].values, test_size=0.2, random_state=42)

# Second split: separate validation set from remaining data
X_train, X_val, Y_train, Y_val = train_test_split(X_temp, Y_temp, test_size=0.25, random_state=42)  # 0.25 * 0.8 = 0.2

In [6]:
print(f"\nTraining samples: {len(X_train)}")
print(f"Validation samples: {len(X_val)}")
print(f"Test samples: {len(X_test)}")


Training samples: 6000
Validation samples: 2000
Test samples: 2000


In [7]:
class TextNormalizationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        """
        dataset handler for text normalization
        """
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # we add 'normalize' as a task-specific prefix
        input_text = f"normalize: {self.texts[idx]}"

        # Tokenize input and target texts
        input_encoding = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        target_encoding = self.tokenizer(
            self.labels[idx],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        return {
            "input_ids": input_encoding["input_ids"].flatten(),
            "attention_mask": input_encoding["attention_mask"].flatten(),
            "labels": target_encoding["input_ids"].flatten(),
        }

Available T5 Model Variants
Original T5 Family

- t5-small (60M parameters)
- t5-base (220M parameters)
- t5-large (770M parameters)
- t5-3b (3B parameters)
- t5-11b (11B parameters)

Flan-T5 (Instruction-tuned)

- flan-t5-small
- flan-t5-base
- flan-t5-large
- flan-t5-xl (3B parameters)
- flan-t5-xxl (11B parameters)

mT5 (Multilingual)

- mt5-small
- mt5-base
- mt5-large
- mt5-xl
- mt5-xxl


In [8]:
model_used = "t5-small"
# Initialize tokenizer
print("\nInitializing tokenizer...")
tokenizer = T5Tokenizer.from_pretrained(model_used)  # legacy= False

# Create datasets and dataloaders
train_dataset = TextNormalizationDataset(X_train, Y_train, tokenizer, max_length=32)
val_dataset = TextNormalizationDataset(X_val, Y_val, tokenizer, max_length=32)
test_dataset=TextNormalizationDataset(X_test, Y_test, tokenizer, max_length=32)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader=DataLoader(test_dataset, batch_size=16, shuffle=True)


Initializing tokenizer...


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [9]:
class TextNormalizer:
    def __init__(
        self,
        model_type=model_used,
        device=device,
    ):
        """
        main model function with train,evaluate, plot functions
        """
        self.device = device
        print(f"Loading {model_type} model...")
        self.model = T5ForConditionalGeneration.from_pretrained(model_type).to(device)
        self.tokenizer = T5Tokenizer.from_pretrained(model_type)
        print("Model and tokenizer loaded successfully")

    def train(self, train_loader, val_loader, epochs=3, lr=3e-5, patience=3):
        """
        use trainloader for the data.
        training  with lr scheduler
        """
        optimizer = AdamW(self.model.parameters(), lr=lr)

        # Scheduler
        num_training_steps = epochs * len(train_loader)
        num_warmup_steps = int(0.1 * num_training_steps)  # 10%
        scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

        best_val_loss = float("inf")
        patience_counter = 0
        best_model_state = None

        for epoch in range(epochs):
            print(f"\nEpoch {epoch + 1}/{epochs}")
            # training loop
            self.model.train()
            train_loss = 0
            progress_bar = tqdm(train_loader, desc="Training")

            for batch in progress_bar:
                input_ids = batch["input_ids"].to(self.device)
                attention_mask = batch["attention_mask"].to(self.device)
                labels = batch["labels"].to(self.device)
                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                train_loss += loss.item()
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                scheduler.step()

                progress_bar.set_postfix({"loss": loss.item()})

            # epoch validation
            val_loss = self.evaluate(val_loader)

            # Print epoch statistics
            avg_train_loss = train_loss / len(train_loader)
            print(f"Average training loss: {avg_train_loss:.4f}")
            print(f"Average validation loss: {val_loss:.4f}")

            # apply early stopping with patience
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0
                best_model_state = self.model.state_dict().copy()
                torch.save(best_model_state, "best_model14.pt")
                print("Saved best model checkpoint")
            else:
                patience_counter += 1
                print(f"Validation loss didn't improve. Patience: {patience_counter}/{patience}")

            if patience_counter >= patience:
                print(f"\nEarly stopping triggered after epoch {epoch + 1}")
                print(f"Best validation loss: {best_val_loss:.4f}")
                # load best model
                self.model.load_state_dict(best_model_state)
                break

    def evaluate(self, val_loader):
        """
        evulation function used in model validation
        """
        self.model.eval()
        val_loss = 0

        with torch.no_grad():
            for batch in tqdm(val_loader, desc="Validation"):
                input_ids = batch["input_ids"].to(self.device)
                attention_mask = batch["attention_mask"].to(self.device)
                labels = batch["labels"].to(self.device)
                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                val_loss += outputs.loss.item()

        return val_loss / len(val_loader)

    def normalize_text(self, text):
        """
        Normalize a single text input
        """
        self.model.eval()

        # Prepare input
        input_text = f"normalize: {text}"
        inputs = self.tokenizer(
            input_text,
            return_tensors="pt",
            max_length=128,
            padding=True,
            truncation=True,
        ).to(self.device)

        # Generate output
        with torch.no_grad():
            outputs = self.model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=128,
                num_beams=4,
                early_stopping=True,
            )

        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

In [10]:
# Initialize the model
normalizer = TextNormalizer()

Loading t5-small model...
Model and tokenizer loaded successfully


In [11]:
normalizer.train(train_loader, val_loader, epochs=30, patience=2, lr=3e-5)  # 3e-5


Epoch 1/30


Training:   0%|          | 0/375 [00:00<?, ?it/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Validation:   0%|          | 0/125 [00:00<?, ?it/s]

Average training loss: 5.6452
Average validation loss: 0.6251
Saved best model checkpoint

Epoch 2/30


Training:   0%|          | 0/375 [00:00<?, ?it/s]

Validation:   0%|          | 0/125 [00:00<?, ?it/s]

Average training loss: 1.1589
Average validation loss: 0.5468
Saved best model checkpoint

Epoch 3/30


Training:   0%|          | 0/375 [00:00<?, ?it/s]

Validation:   0%|          | 0/125 [00:00<?, ?it/s]

Average training loss: 0.5265
Average validation loss: 0.2128
Saved best model checkpoint

Epoch 4/30


Training:   0%|          | 0/375 [00:00<?, ?it/s]

Validation:   0%|          | 0/125 [00:00<?, ?it/s]

Average training loss: 0.2201
Average validation loss: 0.1448
Saved best model checkpoint

Epoch 5/30


Training:   0%|          | 0/375 [00:00<?, ?it/s]

Validation:   0%|          | 0/125 [00:00<?, ?it/s]

Average training loss: 0.1699
Average validation loss: 0.1267
Saved best model checkpoint

Epoch 6/30


Training:   0%|          | 0/375 [00:00<?, ?it/s]

Validation:   0%|          | 0/125 [00:00<?, ?it/s]

Average training loss: 0.1486
Average validation loss: 0.1156
Saved best model checkpoint

Epoch 7/30


Training:   0%|          | 0/375 [00:00<?, ?it/s]

Validation:   0%|          | 0/125 [00:00<?, ?it/s]

Average training loss: 0.1365
Average validation loss: 0.1092
Saved best model checkpoint

Epoch 8/30


Training:   0%|          | 0/375 [00:00<?, ?it/s]

Validation:   0%|          | 0/125 [00:00<?, ?it/s]

Average training loss: 0.1290
Average validation loss: 0.1047
Saved best model checkpoint

Epoch 9/30


Training:   0%|          | 0/375 [00:00<?, ?it/s]

Validation:   0%|          | 0/125 [00:00<?, ?it/s]

Average training loss: 0.1205
Average validation loss: 0.1018
Saved best model checkpoint

Epoch 10/30


Training:   0%|          | 0/375 [00:00<?, ?it/s]

Validation:   0%|          | 0/125 [00:00<?, ?it/s]

Average training loss: 0.1163
Average validation loss: 0.0995
Saved best model checkpoint

Epoch 11/30


Training:   0%|          | 0/375 [00:00<?, ?it/s]

Validation:   0%|          | 0/125 [00:00<?, ?it/s]

Average training loss: 0.1131
Average validation loss: 0.0968
Saved best model checkpoint

Epoch 12/30


Training:   0%|          | 0/375 [00:00<?, ?it/s]

Validation:   0%|          | 0/125 [00:00<?, ?it/s]

Average training loss: 0.1079
Average validation loss: 0.0945
Saved best model checkpoint

Epoch 13/30


Training:   0%|          | 0/375 [00:00<?, ?it/s]

Validation:   0%|          | 0/125 [00:00<?, ?it/s]

Average training loss: 0.1049
Average validation loss: 0.0936
Saved best model checkpoint

Epoch 14/30


Training:   0%|          | 0/375 [00:00<?, ?it/s]

Validation:   0%|          | 0/125 [00:00<?, ?it/s]

Average training loss: 0.1039
Average validation loss: 0.0916
Saved best model checkpoint

Epoch 15/30


Training:   0%|          | 0/375 [00:00<?, ?it/s]

Validation:   0%|          | 0/125 [00:00<?, ?it/s]

Average training loss: 0.0994
Average validation loss: 0.0908
Saved best model checkpoint

Epoch 16/30


Training:   0%|          | 0/375 [00:00<?, ?it/s]

Validation:   0%|          | 0/125 [00:00<?, ?it/s]

Average training loss: 0.0979
Average validation loss: 0.0897
Saved best model checkpoint

Epoch 17/30


Training:   0%|          | 0/375 [00:00<?, ?it/s]

Validation:   0%|          | 0/125 [00:00<?, ?it/s]

Average training loss: 0.0956
Average validation loss: 0.0893
Saved best model checkpoint

Epoch 18/30


Training:   0%|          | 0/375 [00:00<?, ?it/s]

Validation:   0%|          | 0/125 [00:00<?, ?it/s]

Average training loss: 0.0925
Average validation loss: 0.0885
Saved best model checkpoint

Epoch 19/30


Training:   0%|          | 0/375 [00:00<?, ?it/s]

Validation:   0%|          | 0/125 [00:00<?, ?it/s]

Average training loss: 0.0916
Average validation loss: 0.0875
Saved best model checkpoint

Epoch 20/30


Training:   0%|          | 0/375 [00:00<?, ?it/s]

Validation:   0%|          | 0/125 [00:00<?, ?it/s]

Average training loss: 0.0904
Average validation loss: 0.0874
Saved best model checkpoint

Epoch 21/30


Training:   0%|          | 0/375 [00:00<?, ?it/s]

Validation:   0%|          | 0/125 [00:00<?, ?it/s]

Average training loss: 0.0891
Average validation loss: 0.0866
Saved best model checkpoint

Epoch 22/30


Training:   0%|          | 0/375 [00:00<?, ?it/s]

Validation:   0%|          | 0/125 [00:00<?, ?it/s]

Average training loss: 0.0886
Average validation loss: 0.0861
Saved best model checkpoint

Epoch 23/30


Training:   0%|          | 0/375 [00:00<?, ?it/s]

Validation:   0%|          | 0/125 [00:00<?, ?it/s]

Average training loss: 0.0868
Average validation loss: 0.0859
Saved best model checkpoint

Epoch 24/30


Training:   0%|          | 0/375 [00:00<?, ?it/s]

Validation:   0%|          | 0/125 [00:00<?, ?it/s]

Average training loss: 0.0857
Average validation loss: 0.0857
Saved best model checkpoint

Epoch 25/30


Training:   0%|          | 0/375 [00:00<?, ?it/s]

Validation:   0%|          | 0/125 [00:00<?, ?it/s]

Average training loss: 0.0840
Average validation loss: 0.0859
Validation loss didn't improve. Patience: 1/2

Epoch 26/30


Training:   0%|          | 0/375 [00:00<?, ?it/s]

Validation:   0%|          | 0/125 [00:00<?, ?it/s]

Average training loss: 0.0833
Average validation loss: 0.0858
Validation loss didn't improve. Patience: 2/2

Early stopping triggered after epoch 26
Best validation loss: 0.0857


In [12]:
# Test the model with some examples
test_examples = [
    "Mike Hoyer/JERRY CHESNUT/SONY/ATV MUSIC PUBLISHING (UK) LIMITED",
    "<Unknown>/Wright, Justyce Kaseem",
    "Pixouu/Abdou Gambetta/Copyright Control",
]

print("\nTesting the model with examples:")
for text in test_examples:
    normalized = normalizer.normalize_text(text)
    print(f"\nInput: {text}")
    print(f"Output: {normalized}")


Testing the model with examples:

Input: Mike Hoyer/JERRY CHESNUT/SONY/ATV MUSIC PUBLISHING (UK) LIMITED
Output: Mike Hoyer/JERRY CHESNUT/SONY

Input: <Unknown>/Wright, Justyce Kaseem
Output: Wright/Justyce Kaseem

Input: Pixouu/Abdou Gambetta/Copyright Control
Output: Pixouu/Abdou Gambetta


In [13]:
def calculate_accuracy(normalizer, test_loader):
    normalizer.model.eval()
    exact_matches = 0
    total = 0

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Calculating metrics"):
            input_ids = batch["input_ids"].to(normalizer.device)
            attention_mask = batch["attention_mask"].to(normalizer.device)
            labels = batch["labels"]

            # Generate predictions
            outputs = normalizer.model.generate(
                input_ids=input_ids, attention_mask=attention_mask, max_length=128, num_beams=4, early_stopping=True, temperature=1.0, top_k=20, top_p=8.0, repetition_penalty=1.0
            )

            # Decode predictions and labels
            predictions = [normalizer.tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
            true_labels = [normalizer.tokenizer.decode(label, skip_special_tokens=True) for label in labels]

            # Calculate metrics
            exact_matches += sum(1 for pred, true in zip(predictions, true_labels) if pred == true)
            total += len(predictions)

    accuracy = exact_matches / total
    print(f"\nExact match accuracy: {accuracy:.4f}")
    return accuracy


In [14]:
accuracy = calculate_accuracy(normalizer, test_loader)

Calculating metrics:   0%|          | 0/125 [00:00<?, ?it/s]


Exact match accuracy: 0.6520


In [15]:
def calculate_exact_matches(predictions, true_labels):
    """
    Calculate the percentage of predictions that exactly match their true labels
    """
    matches = sum(1 for pred, true in zip(predictions, true_labels) if pred.strip() == true.strip())
    return matches / len(predictions)

In [16]:
from sklearn.metrics import f1_score
from collections import Counter


def calculate_token_f1(predictions, true_labels):
    """
    Calculate F1 score based on shared tokens between prediction and true label
    """

    def tokenize(text):
        # Split on common delimiters and create a set of tokens
        return set(token.strip() for token in text.replace("/", " ").split())

    all_true_tokens = []
    all_pred_tokens = []

    for pred, true in zip(predictions, true_labels):
        true_tokens = tokenize(true)
        pred_tokens = tokenize(pred)

        # Convert to binary presence/absence for each token
        all_tokens = true_tokens.union(pred_tokens)
        all_true_tokens.extend(1 if token in true_tokens else 0 for token in all_tokens)
        all_pred_tokens.extend(1 if token in pred_tokens else 0 for token in all_tokens)

    return f1_score(all_true_tokens, all_pred_tokens)

In [17]:
def calculate_order_score(predictions, true_labels):
    """
    Calculate how well the model preserves the correct order of names
    """

    def get_ordered_names(text):
        return [name.strip() for name in text.split("/")]

    correct_order = 0
    total_pairs = 0

    for pred, true in zip(predictions, true_labels):
        pred_names = get_ordered_names(pred)
        true_names = get_ordered_names(true)

        # Check relative ordering of each pair of names
        for i in range(len(true_names)):
            for j in range(i + 1, len(true_names)):
                if i < len(pred_names) and j < len(pred_names):
                    if pred_names[i] in true_names and pred_names[j] in true_names:
                        if true_names.index(pred_names[i]) < true_names.index(pred_names[j]):
                            correct_order += 1
                    total_pairs += 1

    return correct_order / total_pairs if total_pairs > 0 else 0

In [18]:
def evaluate_model(normalizer, val_loader):
    """
    Comprehensive evaluation of the text normalization model
    """
    normalizer.model.eval()
    all_predictions = []
    all_true_labels = []

    print("Generating predictions...")
    with torch.no_grad():
        for batch in tqdm(val_loader):
            # Generate predictions
            outputs = normalizer.model.generate(
                input_ids=batch["input_ids"].to(normalizer.device),
                attention_mask=batch["attention_mask"].to(normalizer.device),
                max_length=32,
                num_beams=4,
                early_stopping=True,
                temperature=1.0,
                top_k=20,
                top_p=8.0,
                repetition_penalty=1.0,
            )

            # Decode predictions and true labels
            predictions = [normalizer.tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
            true_labels = [normalizer.tokenizer.decode(label, skip_special_tokens=True) for label in batch["labels"]]

            all_predictions.extend(predictions)
            all_true_labels.extend(true_labels)

    # Calculate all metrics
    exact_accuracy = calculate_exact_matches(all_predictions, all_true_labels)
    token_f1 = calculate_token_f1(all_predictions, all_true_labels)
    order_score = calculate_order_score(all_predictions, all_true_labels)

    print("\nEvaluation Results:")
    print(f"Exact Match Accuracy: {exact_accuracy:.4f}")
    print(f"Token F1 Score: {token_f1:.4f}")
    print(f"Order Preservation Score: {order_score:.4f}")

## small

- best_model.pt lr 1e-5 max_length 32, batch 64, accuracy 0.7679 0.9339 0.7889
- best_model1.pt lr 1e-6 max_length 32, batch 64, accuracy 0.6697 0.8814 0.8814
- best_model2.pt lr 3e-5 max_length 128, batch 32, accuracy 0.7939 0.9249 0.7639
- best_model3.pt lr 3e-5 max_length 32, batch 64, accuracy 0.7771 0.9311 0.7676
- best_model4.pt lr 1e-4 max_length 32, batch 128, accuracy 0.7771 0.9311 0.7676
- best_model5.pt lr 5e-5 max_length 128, batch 16, accuracy 0.7858 0.9235 0.7408
- best_model6.pt lr 5e-5 max_length 32, batch 16, accuracy 0.7794 0.9317 0.7864
- best_model7.pt lr 1e-5 max_length 256, batch 16, accuracy 0.7639 0.9183 0.7532
- best_model13.pt lr 1e-4 max_length 32, batch 64, accuracy 0.6
- best_model14.pt lr 3e-4 max_length 64, batch 16, accuracy 0.6

## base

- best_model_11.pt lr 3e-5 max_length 16, batch 32, 0.6288 0.8761 0.4818

## google/mt5-small

- best_model8.pt lr 3e-5 max_length 16, batch 32, accuracy 0
- best_model12.pt lr 3e-5 max_length 16, batch 8, accuracy 0.33

## google/mt5-base

cant use it

## google/flan-t5-small

- best_model9.pt lr 3e-5 max_length 16, batch 32, accuracy 0.6189
- best_model10.pt lr 3e-5 max_length 16, batch 32, accuracy 0.6074 0.8714 0.4613
- na dokimaso megalytero lr kai epochs


In [19]:
evaluate_model(normalizer, test_loader)

Generating predictions...


  0%|          | 0/125 [00:00<?, ?it/s]


Evaluation Results:
Exact Match Accuracy: 0.6520
Token F1 Score: 0.8494
Order Preservation Score: 0.7527


In [20]:
def load_trained_model(model_path=None, model_type=None, device=device):

    normalizer = TextNormalizer(model_type=model_type, device=device)
    state_dict = torch.load(model_path, map_location=device)
    normalizer.model.load_state_dict(state_dict)
    normalizer.model.eval()
    print("Model loaded successfully!")

    return normalizer

In [21]:
# Load the trained model
normalizer = load_trained_model(model_path="best_model14.pt", model_type=model_used)

# Test single examples
test_texts = [
    "Mike Hoyer/JERRY CHESNUT/SONY/ATV MUSIC PUBLISHING",
    "<Unknown>/Wright, Justyce Kaseem",
    "Pixouu/Abdou Gambetta/Copyright Control",
    "Martin Hygård",
    "MISIA/松井寛",
    "Trần Quang Lộc",
    "Александр Степанов (Alexandr Stepanov),Артём Иванов (Artyom Ivanov)",
    "Oliv/김홍중/Peperoni/LEEZ/Ollounder/송민기/EDEN",
    "栗林みな実/菊田大介",
    "タブゾンビ",
    "Afroto - عفروتو",
    "ابو بكر سالم بلفقيه",
    "กะลา/หนุ่ม กะลา/ธนา ชัยวรภัทร์",
]

print("Testing individual examples:")
for text in test_texts:
    normalized = normalizer.normalize_text(text)
    print(f"\nInput: {text}")
    print(f"Normalized: {normalized}")

Loading t5-small model...
Model and tokenizer loaded successfully
Model loaded successfully!
Testing individual examples:

Input: Mike Hoyer/JERRY CHESNUT/SONY/ATV MUSIC PUBLISHING
Normalized: Mike Hoyer/JERRY CHESNUT/SONY

Input: <Unknown>/Wright, Justyce Kaseem
Normalized: Wright/Justyce Kaseem

Input: Pixouu/Abdou Gambetta/Copyright Control
Normalized: Pixouu/Abdou Gambetta

Input: Martin Hygård
Normalized: Martin Hygrd

Input: MISIA/松井寛
Normalized: MISIA/

Input: Trần Quang Lộc
Normalized: Trn Quang Lc

Input: Александр Степанов (Alexandr Stepanov),Артём Иванов (Artyom Ivanov)
Normalized: Alexandr Stepanov/Artyom Ivanov

Input: Oliv/김홍중/Peperoni/LEEZ/Ollounder/송민기/EDEN
Normalized: Oliv/Peperoni/LEEZ/Ollounder

Input: 栗林みな実/菊田大介
Normalized: 

Input: タブゾンビ
Normalized: 

Input: Afroto - عفروتو
Normalized: Afroto

Input: ابو بكر سالم بلفقيه
Normalized: 

Input: กะลา/หนุ่ม กะลา/ธนา ชัยวรภัทร์
Normalized: 


# CROSS-APPROACH EVALUATION

In [22]:
import json
test_texts = [
    "Mike Hoyer/JERRY CHESNUT/SONY/ATV MUSIC PUBLISHING",
    "<Unknown>/Wright, Justyce Kaseem",
    "Pixouu/Abdou Gambetta/Copyright Control",
    "Martin Hygård",
    "MISIA/松井寛",
    "Trần Quang Lộc",
    "Александр Степанов (Alexandr Stepanov),Артём Иванов (Artyom Ivanov)",
    "Oliv/김홍중/Peperoni/LEEZ/Ollounder/송민기/EDEN",
    "栗林みな実/菊田大介",
    "タブゾンビ",
    "Afroto - عفروتو",
    "ابو بكر سالم بلفقيه",
    "กะลา/หนุ่ม กะลา/ธนา ชัยวรภัทร์",
]
with open("script_stats.json", "r") as f:
    script_stats = json.load(f)
with open("keywords.json", "r") as f:
    keywords = json.load(f)

In [23]:
from heuristic_approach_2 import get_script_name
def heuristic_clean_2(text):
    with open("script_stats.json", "r") as f:
        script_stats = json.load(f)
    modified_text=text
    for char in text:
        if char=='/':
            continue
        script = get_script_name(char)
        if script_stats[script]['percentance']<0.5:
            modified_text=modified_text.replace(char,'')
    return modified_text

In [24]:
import re
def heuristic_clean(raw_text, keywords):
   """Clean text using heuristic rules"""
   import string
   
   # Remove keywords
   clean_text = raw_text
   for keyword in keywords:
       clean_text = clean_text.replace(keyword, '')
   
   # Replace & and comma with /
   clean_text = clean_text.replace(' & ', '/').replace('&', '/').replace(',', '/')
   
   # Replace punctuation except / with space
   trans = str.maketrans({p: ' ' for p in string.punctuation if p != '/'})
   clean_text = clean_text.translate(trans)
   
   # Replace multiple spaces with single space
   clean_text = ' '.join(clean_text.split())
   
   # Remove spaces around /
   clean_text = re.sub(r'\s*/\s*', '/', clean_text)

   return clean_text

In [25]:
model_preds=[]
heuristic_preds=[]
heuristic_preds_2=[]
for text in Y_test:
    model_preds.append(normalizer.normalize_text(text))
    heuristic_preds.append(heuristic_clean(text, keywords) )
    heuristic_preds_2.append(heuristic_clean_2(text) )

In [31]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score


accuracy = [accuracy_score(Y_test, y_pred) for y_pred in [model_preds,heuristic_preds,heuristic_preds_2]]
f1 = [f1_score(Y_test, y_pred, average='weighted') for y_pred in [model_preds,heuristic_preds,heuristic_preds_2]]


print(f'Model accuracy: {accuracy[0]:.3f}')
print(f'Model F1 Score: {f1[0]:.3f}')
print(f'heuristic function 1 accuracy: {accuracy[1]:.3f}')
print(f'heuristic function 1 F1 Score: {f1[1]:.3f}')
print(f'heuristic function 2 accuracy: {accuracy[2]:.3f}')
print(f'heuristic function 2 F1 Score: {f1[2]:.3f}')

# some examples
print('\nExample predictions:')
for i in range(10):
   print(f'\nInput: {X_test[i]}')
   print(f'Expected: {Y_test[i]}')
   print(f'Predicted using the model: {model_preds[i]}')
   print(f'Predicted using heuristic function 1: {heuristic_preds[i]}')
   print(f'Predicted using heuristic function 2: {heuristic_preds_2[i]}')
   

Model accuracy: 0.728
Model F1 Score: 0.727
heuristic function 1 accuracy: 0.894
heuristic function 1 F1 Score: 0.894
heuristic function 2 accuracy: 1.000
heuristic function 2 F1 Score: 1.000

Example predictions:

Input: Endo Anaconda/Balts Nill
Expected: Endo Anaconda/Balts Nill
Predicted using the model: Endo Anaconda/Balts Nill
Predicted using heuristic function 1: Endo Anaconda/Balts Nill
Predicted using heuristic function 2: Endo Anaconda/Balts Nill

Input: Jessica Curry
Expected: Jessica Curry
Predicted using the model: Jessica Curry
Predicted using heuristic function 1: Jessica Curry
Predicted using heuristic function 2: Jessica Curry

Input: Peter Kelly/Andy Monaghan/Jill O'Sullivan
Expected: Peter Kelly/Andy Monaghan/Jill O'Sullivan
Predicted using the model: Peter Kelly/Andy Monaghan
Predicted using heuristic function 1: Peter Kelly/Andy Monaghan/Jill O Sullivan
Predicted using heuristic function 2: Peter Kelly/Andy Monaghan/Jill O'Sullivan

Input: Thomas Bergersen/Nick Phoe

In [27]:
print("model performance after applying heuristic function 1 to the inputs")
model_preds=[]
for text in heuristic_preds:
    model_preds.append(normalizer.normalize_text(text))

accuracy = accuracy_score(Y_test, model_preds)
f1 = f1_score(Y_test, model_preds, average='weighted') 


print(f'Model accuracy: {accuracy:.3f}')
print(f'Model F1 Score: {f1:.3f}')

model performance after applying heuristic function 1 to the inputs
Model accuracy: 0.728
Model F1 Score: 0.727


In [28]:
print("model performance after applying heuristic function 2 to the inputs")
model_preds=[]
for text in heuristic_preds_2:
    model_preds.append(normalizer.normalize_text(text))

accuracy = accuracy_score(Y_test, model_preds)
f1 = f1_score(Y_test, model_preds, average='weighted') 


print(f'Model accuracy: {accuracy:.3f}')
print(f'Model F1 Score: {f1:.3f}')

model performance after applying heuristic function 2 to the inputs
Model accuracy: 0.774
Model F1 Score: 0.773


In [29]:
print("model performance after applying both heuristic functions to the inputs")
model_preds=[]
for text in heuristic_preds:
    model_preds.append(normalizer.normalize_text(heuristic_clean_2(text) ))

accuracy = accuracy_score(Y_test, model_preds)
f1 = f1_score(Y_test, model_preds, average='weighted') 


print(f'Model accuracy: {accuracy:.3f}')
print(f'Model F1 Score: {f1:.3f}')

model performance after applying both heuristic functions to the inputs
Model accuracy: 0.728
Model F1 Score: 0.727
