In [1]:
labels = ["O", "I-BRAND", "B-BRAND", "B-PRODUCT","I-PRODUCT", "I-PRICE","B-PRICE", "B-LOC", "I-LOC"]  
num_labels = len(labels)



In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_name = "xlm-roberta-base"  
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=num_labels)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
import pandas as pd
labled_data=pd.read_csv("./data/datalabeled_messages_output.csv")
labled_data.head()

Unnamed: 0,Formatted_Labeled_Message
0,dell 0 I-BRAND\ng15 ...
1,asus 0 I-BRAND\nzeyph...
2,from 0 O\nneva ...
3,may 0 O\n2017 ...
4,from 0 O\nneva ...


In [4]:
def process_labeled_message(message):
    
    if not isinstance(message, str):
        return pd.DataFrame(columns=['Token', 'Position', 'Label'])
    
   
    lines = message.split('\n')
    
   
    tokens = []
    positions = []
    labels = []
    
    for line in lines:
      
        parts = line.split()
        if len(parts) == 3:  
            token = parts[0]
            position = parts[1]
            label = parts[2]
            
            tokens.append(token)
            positions.append(position)
            labels.append(label)
    
    return pd.DataFrame({'Token': tokens, 'Position': positions, 'Label': labels})


processed_dfs = labled_data['Formatted_Labeled_Message'].apply(process_labeled_message)


result_df = pd.concat(processed_dfs.values, ignore_index=True)


result_df.head()

Unnamed: 0,Token,Position,Label
0,dell,0,I-BRAND
1,g15,5,O
2,156,1,O
3,full,5,O
4,hd,10,O


In [5]:

sentence = ' '.join(result_df['Token'].tolist())
labels = ' '.join(result_df['Label'].tolist())


sentence_df = pd.DataFrame({'Sentence': [sentence], 'Labels': [labels]})

print(sentence_df)


                                            Sentence  \
0  dell g15 156 full hd 165hz core i7 11th genera...   

                                              Labels  
0  I-BRAND O O O O O O O O O O O O O O O O O O O ...  


In [19]:
label_list = result_df['Label'].explode().unique().tolist()  
label_to_id = {label: i for i, label in enumerate(label_list)}  
id_to_label = {i: label for label, i in label_to_id.items()}  
print(label_to_id)

{'I-BRAND': 0, 'O': 1, 'I-PRICE': 2, 'I-LOC': 3, 'B-PRODUCT': 4}


In [6]:
from transformers import XLMRobertaTokenizer
from datasets import Dataset

# Load the tokenizer
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

# Convert the sentence DataFrame to a Hugging Face Dataset
huggingface_dataset = Dataset.from_pandas(sentence_df)

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples['Sentence'], truncation=True, padding='max_length', return_tensors="pt")

    # Initialize labels for tokenized inputs
    labels = []
    
    for i, label_sequence in enumerate(examples['Labels']):
        label_ids = []
        label_sequence = label_sequence.split()
        
        # Create a mapping from token to label
        label_ids_map = {word: idx for idx, word in enumerate(label_sequence)}

        # Align labels with tokens
        for token in tokenizer.convert_ids_to_tokens(tokenized_inputs['input_ids'][0].tolist()):
            if token in label_ids_map:
                label_ids.append(label_ids_map[token])  
            else:
                label_ids.append(-100)  
            
        labels.append(label_ids)

    tokenized_inputs['labels'] = labels
    return tokenized_inputs

# Apply the tokenization function to the dataset
tokenized_dataset = huggingface_dataset.map(tokenize_and_align_labels, batched=True)

# Check the tokenized dataset
print(tokenized_dataset)


Map: 100%|██████████| 1/1 [00:02<00:00,  2.25s/ examples]

Dataset({
    features: ['Sentence', 'Labels', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1
})





In [16]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # Output directory
    evaluation_strategy="epoch",     # Evaluation strategy
     learning_rate=1e-5,              # Learning rate
    per_device_train_batch_size=16,  # Batch size for training
    per_device_eval_batch_size=64,   # Batch size for evaluation
    num_train_epochs=3,              # Number of training epochs
    weight_decay=0.01,               # Strength of weight decay
    logging_dir='./logs',            # Directory for logs
    logging_steps=10,                # Log every 10 steps
)




In [17]:
from transformers import XLMRobertaForTokenClassification, Trainer

# Load the pre-trained model
model = XLMRobertaForTokenClassification.from_pretrained('xlm-roberta-base', num_labels=len(set(result_df['Label'])))

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset, 
)

# Train the model
trainer.train()


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                              
 33%|███▎      | 1/3 [05:39<10:12, 306.29s/it]

{'eval_loss': nan, 'eval_runtime': 27.3174, 'eval_samples_per_second': 0.037, 'eval_steps_per_second': 0.037, 'epoch': 1.0}


 67%|██████▋   | 2/3 [07:36<03:34, 214.39s/it]
 67%|██████▋   | 2/3 [07:43<03:34, 214.39s/it]

{'eval_loss': nan, 'eval_runtime': 5.0113, 'eval_samples_per_second': 0.2, 'eval_steps_per_second': 0.2, 'epoch': 2.0}


100%|██████████| 3/3 [09:03<00:00, 156.29s/it]
100%|██████████| 3/3 [14:33<00:00, 156.29s/it]

{'eval_loss': nan, 'eval_runtime': 9.2132, 'eval_samples_per_second': 0.109, 'eval_steps_per_second': 0.109, 'epoch': 3.0}
{'train_runtime': 873.7076, 'train_samples_per_second': 0.003, 'train_steps_per_second': 0.003, 'train_loss': 0.0, 'epoch': 3.0}


100%|██████████| 3/3 [14:34<00:00, 291.58s/it]


TrainOutput(global_step=3, training_loss=0.0, metrics={'train_runtime': 873.7076, 'train_samples_per_second': 0.003, 'train_steps_per_second': 0.003, 'total_flos': 783911531520.0, 'train_loss': 0.0, 'epoch': 3.0})

In [18]:
evaluation_results = trainer.evaluate()

# Print the evaluation results
print("Evaluation Results:")
for key, value in evaluation_results.items():
    print(f"{key}: {value}")

100%|██████████| 1/1 [00:01<00:00,  1.56s/it]

Evaluation Results:
eval_loss: nan
eval_runtime: 1.4424
eval_samples_per_second: 0.693
eval_steps_per_second: 0.693
epoch: 3.0





In [15]:
# Save the model
model.save_pretrained('./fine_tuned_model')
tokenizer.save_pretrained('./fine_tuned_model')


('./fine_tuned_model\\tokenizer_config.json',
 './fine_tuned_model\\special_tokens_map.json',
 './fine_tuned_model\\sentencepiece.bpe.model',
 './fine_tuned_model\\added_tokens.json')