In [1]:
labels = ["O", "I-BRAND", "B-BRAND", "B-PRODUCT","I-PRODUCT", "I-PRICE","B-PRICE", "B-LOC", "I-LOC"]  
num_labels = len(labels)

In [2]:
import shap
import torch
from transformers import XLMRobertaTokenizer, XLMRobertaForTokenClassification


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the pre-trained model and tokenizer
model_name = 'xlm-roberta-base'  # Change this if you're using a different model
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
model = XLMRobertaForTokenClassification.from_pretrained(model_name, num_labels=num_labels)  # Set number_of_labels accordingly

# Move the model to the appropriate device (CPU or GPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


XLMRobertaForTokenClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768

In [4]:
import pandas as pd
labled_data=pd.read_csv("./data/datalabeled_messages_output.csv")
labled_data.head()

Unnamed: 0,Formatted_Labeled_Message
0,dell 0 I-BRAND\ng15 ...
1,asus 0 I-BRAND\nzeyph...
2,from 0 O\nneva ...
3,may 0 O\n2017 ...
4,from 0 O\nneva ...


In [5]:
def process_labeled_message(message):
    
    if not isinstance(message, str):
        return pd.DataFrame(columns=['Token', 'Position', 'Label'])
    
   
    lines = message.split('\n')
    
   
    tokens = []
    positions = []
    labels = []
    
    for line in lines:
      
        parts = line.split()
        if len(parts) == 3:  
            token = parts[0]
            position = parts[1]
            label = parts[2]
            
            tokens.append(token)
            positions.append(position)
            labels.append(label)
    
    return pd.DataFrame({'Token': tokens, 'Position': positions, 'Label': labels})


processed_dfs = labled_data['Formatted_Labeled_Message'].apply(process_labeled_message)


result_df = pd.concat(processed_dfs.values, ignore_index=True)


result_df.head()

Unnamed: 0,Token,Position,Label
0,dell,0,I-BRAND
1,g15,5,O
2,156,1,O
3,full,5,O
4,hd,10,O


In [6]:
import pandas as pd

# Assuming result_df contains individual tokens and their labels
# Create a new DataFrame to hold sentences and their labels
sentence_data = []

# Iterate through each row in result_df
for index, row in result_df.iterrows():
    # Append the sentence and its corresponding labels
    sentence_data.append({
        'Sentence': ' '.join(row['Token']),  # Join tokens to form a sentence
        'Labels': ' '.join(row['Label'])  # Join corresponding labels
    })

# Create a DataFrame from the collected sentences
sentence_df = pd.DataFrame(sentence_data)

# Print the resulting DataFrame
print(sentence_df)
print(len(sentence_df))  # Length of the DataFrame, i.e., number of sentences


                     Sentence             Labels
0                     d e l l      I - B R A N D
1                       g 1 5                  O
2                       1 5 6                  O
3                     f u l l                  O
4                         h d                  O
...                       ...                ...
145758            l a p t o p  B - P R O D U C T
145759                  a n d                  O
145760  a c c e s s o r i e s                  O
145761                c a l l                  O
145762    0 9 1 2 7 5 9 9 0 0                  O

[145763 rows x 2 columns]
145763


In [7]:
from datasets import Dataset

# Convert the sentence DataFrame to a Hugging Face Dataset
huggingface_dataset = Dataset.from_pandas(sentence_df)

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples['Sentence'], truncation=True, padding='max_length', return_tensors="pt")

    # Initialize labels for tokenized inputs
    labels = []
    
    for i, label_sequence in enumerate(examples['Labels']):
        label_ids = []
        label_sequence = label_sequence.split()
        
        # Create a mapping from token to label
        label_ids_map = {word: idx for idx, word in enumerate(label_sequence)}

        # Align labels with tokens
        for token in tokenizer.convert_ids_to_tokens(tokenized_inputs['input_ids'][0].tolist()):
            if token in label_ids_map:
                label_ids.append(label_ids_map[token])  
            else:
                label_ids.append(-100)  # Use -100 for tokens that don't have a label
            
        labels.append(label_ids)

    tokenized_inputs['labels'] = labels
    return tokenized_inputs

# Apply the tokenization function to the dataset
tokenized_dataset = huggingface_dataset.map(tokenize_and_align_labels, batched=True)

# Check the tokenized dataset
print(tokenized_dataset)


Map: 100%|██████████| 145763/145763 [01:29<00:00, 1637.28 examples/s]

Dataset({
    features: ['Sentence', 'Labels', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 145763
})





: 

In [8]:
from transformers import XLMRobertaForTokenClassification, Trainer, TrainingArguments

# Load the pre-trained model
model = XLMRobertaForTokenClassification.from_pretrained('xlm-roberta-base', num_labels=len(set(result_df['Label'])))

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,  # For initial testing, use the same dataset
)

# Train the model
trainer.train()





Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/27333 [00:00<?, ?it/s]

In [None]:
import shap

# Define a function to predict NER labels
def predict_proba(texts):
    # Tokenize the input texts
    tokenized_inputs = tokenizer(texts, truncation=True, padding=True, return_tensors='pt')
    inputs = tokenized_inputs.to(device)  # Ensure it's on the right device (CPU or GPU)

    # Make predictions using the model
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Softmax to get probabilities for each class
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
    
    return predictions.cpu().numpy()  # Convert back to numpy for SHAP

# Create a SHAP explainer
explainer = shap.Explainer(predict_proba, tokenizer)

# Select sample texts for explanation (choose some difficult cases or samples)
sample_texts = ['This is a challenging sentence where John and Google are mentioned.']
shap_values = explainer(sample_texts)

# Plot the SHAP values for each sample text
for i in range(len(sample_texts)):
    shap.plots.text(shap_values[i])
