### install important liberaries

In [3]:
# !pip install transformers datasets seqeval
# !pip install torch --upgrade

In [2]:
# mount drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# !pip install evaluate

## Import Liberaries

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset
import evaluate

# Igrnore warrnings
import warnings
warnings.filterwarnings("ignore")

## 1. Load and Parse Our Dataset

In [74]:
# Import necessary libraries
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
import numpy as np
from datasets import Dataset
import evaluate  # Corrected import for loading metrics

# Load the evaluation metric using the `evaluate` library
metric = evaluate.load("seqeval")

# Define the label mapping
label_map = {
    "O": 0,
    "B-PRODUCT": 1,
    "I-PRODUCT": 2,
    "B-PRICE": 3,
    "I-PRICE": 4,
    "B-LOC": 5,
    "I-LOC": 6,
    # Add other labels as necessary
}

your_number_of_labels = len(label_map)
id_to_label = {i: label for label, i in label_map.items()}

# Define a function to parse your CoNLL formatted data
def parse_conll(file_path):
    sentences = []
    labels = []
    with open(file_path, 'r', encoding='utf-8') as f:
        current_sentence = []
        current_labels = []
        for line in f:
            line = line.strip()  # Remove leading/trailing whitespace
            if line:  # If the line is not empty
                try:
                    token, label = line.split()  # Split token and label
                    current_sentence.append(token)
                    current_labels.append(label)
                except ValueError:
                    print(f"Skipping line due to unpacking error: {line}")  # Log the error line
            else:  # End of a sentence
                if current_sentence:  # Check if there's a sentence to add
                    sentences.append(current_sentence)
                    labels.append(current_labels)
                    current_sentence = []
                    current_labels = []
        # Add the last sentence if exists
        if current_sentence:
            sentences.append(current_sentence)
            labels.append(current_labels)

    # Convert to Dataset format
    return Dataset.from_dict({"tokens": sentences, "ner_tags": labels})

  ## 2. Convert to Dataset

In [75]:
# Load and preprocess the CoNLL dataset
file_path = "/content/cleaned_file.conll"
dataset = parse_conll(file_path)

In [76]:
dataset

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 22192
})

## 3. Tokenize and Align Labels

In [77]:
# Define a function to align labels with tokenized inputs
def tokenize_and_align_labels(examples, tokenizer, label_map):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        padding='max_length',        # Pad sequences to max_length
        truncation=True,             # Truncate sequences if they're too long
        max_length=128,              # Set a maximum length for padding
        is_split_into_words=True     # Since tokens are provided as word-level tokens
    )

    # Align labels with tokens
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word
        aligned_labels = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                aligned_labels.append(-100)  # Padding token should have a special label
            elif word_idx != previous_word_idx:
                aligned_labels.append(label_map[label[word_idx]])  # Map label to the correct index
            else:
                aligned_labels.append(-100)  # Subword tokens will be masked for loss calculation
            previous_word_idx = word_idx
        labels.append(aligned_labels)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

### Process dataset for training

In [78]:
# Process dataset for training
def process_dataset(dataset, tokenizer, label_map):
    # Map the tokenization and label alignment to the dataset
    return dataset.map(lambda examples: tokenize_and_align_labels(examples, tokenizer, label_map), batched=True)

## 4. Fine-tune Models

In [79]:
# Step 1: Split dataset into training and test sets
def process_and_split_dataset(dataset, tokenizer, label_map):
    # Tokenize and align labels for the dataset
    tokenized_datasets = dataset.map(lambda examples: tokenize_and_align_labels(examples, tokenizer, label_map), batched=True)

    # Split dataset into train and test sets (80% train, 20% test)
    split_dataset = tokenized_datasets.train_test_split(test_size=0.2)

    return split_dataset

In [81]:
# Generalized model training function
# Step 2: Generalized model training function with train/test split
def train_model(model_name, split_dataset, label_map, id_to_label):
    # Load the tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_map))

    # Define compute_metrics function for evaluation
    def compute_metrics(p):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)

        true_labels = [[id_to_label[l] for l in label if l != -100] for label in labels]
        pred_labels = [[id_to_label[p] for p, l in zip(prediction, label) if l != -100]
                       for prediction, label in zip(predictions, labels)]

        results = metric.compute(predictions=pred_labels, references=true_labels)
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=f'./results/{model_name}',  # Save results separately for each model
        evaluation_strategy="epoch",
        logging_strategy="steps",     # Log at regular intervals (steps)
        logging_steps=10,             # Log every 10 steps (adjust as needed)
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir='/content/logs',  # Directory for storing logs
        report_to="none"  # Disable reporting to external services like WandB
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=split_dataset['train'],  # Pass the train dataset split
        eval_dataset=split_dataset['test'],    # Pass the test dataset split
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    # Fine-tune the model
    trainer.train()

    # Evaluate the model
    evaluation_results = trainer.evaluate()
    # Save the model and tokenizer after evaluation
    model.save_pretrained(f'./models/{model_name}')
    tokenizer.save_pretrained(f'./models/{model_name}')

    print(f"{model_name} and tokenizer saved successfully!")
    return evaluation_results

In [82]:
# Step 3: Train multiple models with split datasets
def train_multiple_models(models_to_train, dataset, label_map, id_to_label):
    results = {}
    for model_name in models_to_train:
        print(f"\nTraining model: {model_name}\n")
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Process and split dataset into train and test
        split_dataset = process_and_split_dataset(dataset, tokenizer, label_map)

        # Train model on the split dataset
        result = train_model(model_name, split_dataset, label_map, id_to_label)
        results[model_name] = result
    return results

In [83]:
# Define models to train
models_to_train = ["xlm-roberta-base", "distilbert-base-uncased", "bert-base-multilingual-cased"]

# Train multiple models and compare results
results = train_multiple_models(models_to_train, dataset, label_map, id_to_label)

# Print results for each model
for model_name, result in results.items():
    print(f"\nResults for model: {model_name}\n")
    print(result)


Training model: xlm-roberta-base



Map:   0%|          | 0/22192 [00:00<?, ? examples/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0302,0.025671,0.833333,0.853454,0.843274,0.992279
2,0.0171,0.016467,0.904314,0.934943,0.919373,0.995727
3,0.0124,0.014441,0.92064,0.945339,0.932826,0.996574


xlm-roberta-base and tokenizer saved successfully!

Training model: distilbert-base-uncased



Map:   0%|          | 0/22192 [00:00<?, ? examples/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0592,0.068677,0.728959,0.49009,0.586121,0.980018
2,0.0478,0.059061,0.815118,0.572251,0.672427,0.983379
3,0.0456,0.055294,0.839495,0.595269,0.696596,0.984391


distilbert-base-uncased and tokenizer saved successfully!

Training model: bert-base-multilingual-cased



Map:   0%|          | 0/22192 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0633,0.067634,0.719307,0.47808,0.574394,0.980032
2,0.0461,0.055603,0.784788,0.58112,0.66777,0.983963
3,0.0404,0.050679,0.83871,0.624,0.715596,0.985604


bert-base-multilingual-cased and tokenizer saved successfully!

Results for model: xlm-roberta-base

{'eval_loss': 0.014440957456827164, 'eval_precision': 0.9206401045068583, 'eval_recall': 0.9453386988598256, 'eval_f1': 0.9328259430840502, 'eval_accuracy': 0.9965744183257376, 'eval_runtime': 36.3649, 'eval_samples_per_second': 122.068, 'eval_steps_per_second': 7.645, 'epoch': 3.0}

Results for model: distilbert-base-uncased

{'eval_loss': 0.055294107645750046, 'eval_precision': 0.8394950405770965, 'eval_recall': 0.5952685421994884, 'eval_f1': 0.6965955854844742, 'eval_accuracy': 0.9843908135461269, 'eval_runtime': 20.3437, 'eval_samples_per_second': 218.2, 'eval_steps_per_second': 13.665, 'epoch': 3.0}

Results for model: bert-base-multilingual-cased

{'eval_loss': 0.050678953528404236, 'eval_precision': 0.8387096774193549, 'eval_recall': 0.624, 'eval_f1': 0.7155963302752294, 'eval_accuracy': 0.9856042765483999, 'eval_runtime': 37.1127, 'eval_samples_per_second': 119.609, 'eval_steps_

### Discussion:
- **xlm-roberta-base**: achieves the highest accuracy (0.996574), showing it is the most accurate model overall in terms of correct predictions.
- **bert-base-multilingual-cased** (0.985604) slightly outperforms distilbert-base-uncased (0.984391), but both are significantly lower than xlm-roberta-base.


## Model Testing by External data from telegram

In [101]:
import numpy as np
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Define model path and name
model_name = "xlm-roberta-base"
model_path = f'./models/{model_name}'

# Load the model and tokenizer
model = AutoModelForTokenClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Define your label mapping (reverse the dictionary to map indices back to labels)
id_to_label = {
    0: "O",
    1: "B-PRODUCT",
    2: "I-PRODUCT",
    3: "B-PRICE",
    4: "I-PRICE",
    5: "B-LOC",
    6: "I-LOC",
    # Add other labels if necessary
}

# Sample test dataset
test_sentences = [
    "መገናኛ ዘፍመሽ ግራንድ ሞል 3 ተኛ ፎቅ ቁጥር 329 ቁጥር 2 :- ጀሞ 1 ከለላ ህንፃ ግራውንድ ለይ 2000 ብር G07"
]

# Function to predict entities
def predict_entities(model, tokenizer, sentences):
    inputs = tokenizer(sentences, padding=True, return_tensors="pt", truncation=True, is_split_into_words=False)

    with torch.no_grad():
        outputs = model(**inputs)

    # Get the predicted indices for each token
    predictions = np.argmax(outputs.logits.detach().cpu().numpy(), axis=2)

    # Convert the indices back to labels
    predicted_labels = []
    for i, sentence in enumerate(sentences):
        tokens = tokenizer.tokenize(sentence)
        sentence_labels = [id_to_label[p] for p in predictions[i][:len(tokens)]]  # Match tokens to predictions
        predicted_labels.append(sentence_labels)

    return predicted_labels

# Example of predictions
predictions = predict_entities(model, tokenizer, test_sentences)

# Display predictions
for sentence, preds in zip(test_sentences, predictions):
    print(f"Sentence: '{sentence}' \n Predicted Entities: {preds}")


Sentence: 'መገናኛ ዘፍመሽ ግራንድ ሞል 3 ተኛ ፎቅ ቁጥር 329 ቁጥር 2 :- ጀሞ 1 ከለላ ህንፃ ግራውንድ ለይ 2000 ብር G07' 
 Predicted Entities: ['O', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PRICE', 'I-PRICE', 'O']


### Conclusion
- The model successfully identified location entities (B-LOC, I-LOC) for parts of the input sentence and price entities (B-PRICE, I-PRICE) for the amount mentioned.
- Some parts of the sentence, such as numbers and other terms, were marked as non-entity words (O).
- The results indicate that the model is capable of distinguishing between different types of entities (locations and prices), although further evaluation with more test data may be needed to assess its accuracy fully.

## Task-5 Model interpritablity

In [1]:
# !pip install shap lime

In [85]:
import torch

def predict_entities(model, tokenizer, text):
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=2).numpy()
    return predictions[0], inputs['input_ids'][0]

In [86]:
from lime.lime_text import LimeTextExplainer

# Create an instance of the LIME text explainer
explainer = LimeTextExplainer(class_names=[id_to_label[label] for label in range(len(id_to_label))])

def lime_explanation(text, model, tokenizer):
    # Generate LIME explanations
    exp = explainer.explain_instance(text, lambda x: model.predict(x), num_features=10)
    return exp

In [87]:
import shap

def shap_explanation(text, model, tokenizer):
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt")
    # Get model predictions
    with torch.no_grad():
        output = model(**inputs)
    shap_values = shap.KernelExplainer(model.predict, inputs)(inputs)
    return shap_values

In [88]:
def analyze_difficult_cases(model, tokenizer, examples):
    difficult_cases = []
    for text in examples:
        predictions, _ = predict_entities(model, tokenizer, text)
        if not all(pred in [0, 1] for pred in predictions):  # Assuming 0 and 1 are your valid labels
            difficult_cases.append((text, predictions))
    return difficult_cases

In [89]:
def generate_report(difficult_cases, model, tokenizer):
    report = []
    for text, predictions in difficult_cases:
        lime_exp = lime_explanation(text, model, tokenizer)
        shap_values = shap_explanation(text, model, tokenizer)
        report.append({
            "text": text,
            "predictions": predictions,
            "lime_explanation": lime_exp,
            "shap_values": shap_values
        })
    return report

In [104]:
import numpy as np
import shap
from lime.lime_text import LimeTextExplainer
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Define model path and name
model_name = "xlm-roberta-base"
model_path = f'./models/{model_name}'

# Load the model and tokenizer
model = AutoModelForTokenClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Define a function for SHAP
def explain_with_shap(model, test_data, tokenizer):
    # Tokenize test data
    inputs = tokenizer(test_data, return_tensors="pt", padding=True, truncation=True)
    # Get predictions
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = np.argmax(outputs.logits.detach().numpy(), axis=2)

    # Create a SHAP explainer
    explainer = shap.Explainer(model, tokenizer)
    shap_values = explainer(inputs['input_ids'])

    # Visualize the SHAP values for the first instance
    shap.initjs()
    shap.summary_plot(shap_values, feature_names=tokenizer.convert_ids_to_tokens(inputs['input_ids'][0]))

# Define a function for LIME
def explain_with_lime(model, test_data, tokenizer):
    explainer = LimeTextExplainer(class_names=['O', 'Product', 'Other'])  # Replace with your actual labels

    def predict_proba(texts):
        inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
        return torch.softmax(outputs.logits, dim=1).numpy()

    # Select a text example to explain
    for text_example in test_data:  # Loop through test_data
        explanation = explainer.explain_instance(text_example, predict_proba, num_features=10)

        # Visualize the explanation
        explanation.show_in_notebook()

# Analyze difficult examples
def analyze_difficult_examples(model, test_dataset):
    difficult_examples = []  # Collect difficult examples based on your criteria
    for text in test_dataset:
        # Your logic to determine if an example is difficult
        if "ambiguous" in text or "overlapping" in text:  # Example condition
            difficult_examples.append(text)

    # Explain using SHAP and LIME
    for example in difficult_examples:
        print(f"Explaining example: {example}")
        explain_with_shap(model, example, tokenizer)
        explain_with_lime(model, example, tokenizer)

### ***Sorry! I have finished my limmited GPU and I can't run the model interpretiblity.***

## End