In [20]:

!pip install transformers datasets seqeval evaluate -q
!pip install huggingface_hub --upgrade -q


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [23]:
!pip install --upgrade transformers


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Downloading transformers-4.51.3-py3-none-any.whl (10.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m72.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.51.1
    Uninstalling transformers-4.51.1:
      Successfully uninstalled transformers-4.51.1
Successfully installed transformers-4.51.3


data exemple ( mistral text beacuse i used it in the project ) 

In [28]:
import transformers
print(transformers.__version__)

4.51.1


In [None]:

import json


example_data = [
    {
        "text": "Toxic batteries must be disposed of in hazardous waste bins.",
        "entities": [
            {"start": 0, "end": 13, "label": "SAFETY"},
            {"start": 28, "end": 54, "label": "DISPOSAL"}
        ]
    },
    {
        "text": "Plastic containers should be recycled when possible.",
        "entities": [
            {"start": 28, "end": 36, "label": "RECYCLING"}
        ]
    },
    {
        "text": "Carbon emissions from landfills affect the environment badly.",
        "entities": [
            {"start": 0, "end": 23, "label": "ENVIRONMENTAL"},
            {"start": 47, "end": 68, "label": "ENVIRONMENTAL"}
        ]
    },
    {
        "text": "Always wear gloves when handling electronic waste.",
        "entities": [
            {"start": 7, "end": 20, "label": "SAFETY"},
            {"start": 26, "end": 48, "label": "HANDLING"}
        ]
    }
]

with open("ner_env_dataset.jsonl", "w") as f:
    for entry in example_data:
        json.dump(entry, f)
        f.write("\n")


In [None]:

from datasets import load_dataset

dataset = load_dataset("json", data_files="ner_env_dataset.jsonl", split="train")

# Inspect
dataset[0]


Generating train split: 0 examples [00:00, ? examples/s]

{'text': 'Toxic batteries must be disposed of in hazardous waste bins.',
 'entities': [{'start': 0, 'end': 13, 'label': 'SAFETY'},
  {'start': 28, 'end': 54, 'label': 'DISPOSAL'}]}

In [None]:

from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

label_list = ["O", "B-ENVIRONMENTAL", "B-DISPOSAL", "B-RECYCLING", "B-SAFETY", "B-HANDLING",
              "I-ENVIRONMENTAL", "I-DISPOSAL", "I-RECYCLING", "I-SAFETY", "I-HANDLING"]

label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}


def get_labels_for_tokens(offset_mapping, entities, seq_len):
    labels = ["O"] * seq_len
    for ent in entities:
        ent_start = ent["start"]
        ent_end = ent["end"]
        ent_label = ent["label"]
        found_first = False
        for i, (start, end) in enumerate(offset_mapping):
            if start is None or end is None or start == end:
                continue
            if start >= ent_start and end <= ent_end:
                if not found_first:
                    labels[i] = "B-" + ent_label
                    found_first = True
                else:
                    labels[i] = "I-" + ent_label
    return [label2id[label] for label in labels]


def tokenize_and_align_labels(example):
    tokenized = tokenizer(example["text"],
                          truncation=True,
                          padding="max_length",
                          max_length=128,
                          return_offsets_mapping=True)

    offset_mapping = tokenized.pop("offset_mapping")
    labels = get_labels_for_tokens(offset_mapping, example["entities"], len(tokenized["input_ids"]))
    tokenized["labels"] = labels + [-100] * (128 - len(labels))  # Pad to max length

    return tokenized


tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=False)


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [None]:

from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(label_list), id2label=id2label, label2id=label2id)


2025-05-10 22:48:57.004830: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746917337.235401      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746917337.303099      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

from sklearn.model_selection import train_test_split
from transformers import TrainingArguments, Trainer


train_test = tokenized_dataset.train_test_split(test_size=0.2)

args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    do_eval=True,
    eval_steps=500,
    save_steps=500,
    logging_steps=500,
    disable_tqdm=False,  
    report_to="tensorboard",  
)


trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_test["train"],
    eval_dataset=train_test["test"],
    tokenizer=tokenizer
)


trainer.train()


  trainer = Trainer(


Step,Training Loss


TrainOutput(global_step=10, training_loss=0.6226906776428223, metrics={'train_runtime': 39.6154, 'train_samples_per_second': 0.757, 'train_steps_per_second': 0.252, 'total_flos': 1959885135360.0, 'train_loss': 0.6226906776428223, 'epoch': 10.0})

In [None]:
from sklearn.model_selection import train_test_split
from transformers import TrainingArguments, Trainer


train_test = tokenized_dataset.train_test_split(test_size=0.2)


args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=100,
    weight_decay=0.01,
    do_eval=True,
    eval_steps=500,  
    save_steps=500,  
    logging_steps=500,  
    disable_tqdm=False,  
    report_to="tensorboard",  
    fp16=True,  
    gradient_accumulation_steps=2,  
    dataloader_num_workers=4,  
    warmup_steps=500,
    load_best_model_at_end=True, 
    save_total_limit=3,  
    save_strategy="steps",  
    eval_strategy="steps",  
)


trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_test["train"],
    eval_dataset=train_test["test"],
    tokenizer=tokenizer
)


trainer.train()


  trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avo

Step,Training Loss,Validation Loss


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

TrainOutput(global_step=100, training_loss=0.07706146240234375, metrics={'train_runtime': 350.0625, 'train_samples_per_second': 0.857, 'train_steps_per_second': 0.286, 'total_flos': 19598851353600.0, 'train_loss': 0.07706146240234375, 'epoch': 100.0})

In [None]:
from transformers import BertForTokenClassification, BertTokenizer
import torch


model = BertForTokenClassification.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

)
text = """
1. Environmental Impact Analysis
Plastics have a significant environmental impact due to their persistence in the environment. They can take hundreds of years to decompose, leading to long-term pollution. Plastics often end up in oceans, harming marine life through ingestion and entanglement. They also contribute to microplastic pollution, which can enter the food chain and pose health risks to humans and wildlife.

2. Proper Disposal Methods
Landfilling: While common, landfilling plastics is not ideal due to their slow decomposition and potential to leach harmful chemicals.
Incineration: This method can generate energy but may release toxic emissions if not properly controlled.
Recycling: Preferred method to reduce environmental impact. Plastics should be cleaned and sorted according to their resin identification codes.
"""


inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)


with torch.no_grad():  
    outputs = model(**inputs)


predictions = torch.argmax(outputs.logits, dim=-1)


predicted_labels = predictions[0].cpu().numpy()


tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
for token, label in zip(tokens, predicted_labels):
    print(f"{token}: {label}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[CLS]: 1
1: 1
.: 1
environmental: 1
impact: 1
analysis: 0
plastics: 1
have: 1
a: 1
significant: 1
environmental: 1
impact: 1
due: 1
to: 1
their: 1
persistence: 1
in: 0
the: 1
environment: 1
.: 1
they: 1
can: 1
take: 0
hundreds: 1
of: 0
years: 1
to: 0
deco: 1
##mp: 1
##ose: 1
,: 0
leading: 1
to: 1
long: 1
-: 1
term: 1
pollution: 1
.: 1
plastics: 1
often: 1
end: 1
up: 1
in: 0
oceans: 1
,: 0
harm: 0
##ing: 0
marine: 1
life: 0
through: 0
ing: 1
##est: 1
##ion: 0
and: 1
en: 1
##tangle: 1
##ment: 1
.: 1
they: 1
also: 0
contribute: 1
to: 1
micro: 1
##pl: 1
##astic: 1
pollution: 0
,: 0
which: 0
can: 0
enter: 0
the: 0
food: 1
chain: 0
and: 0
pose: 1
health: 1
risks: 1
to: 0
humans: 1
and: 1
wildlife: 0
.: 1
2: 1
.: 1
proper: 1
disposal: 1
methods: 1
landfill: 1
##ing: 0
:: 0
while: 1
common: 1
,: 0
landfill: 1
##ing: 0
plastics: 1
is: 1
not: 0
ideal: 1
due: 1
to: 1
their: 1
slow: 1
decomposition: 0
and: 1
potential: 1
to: 0
leach: 0
harmful: 1
chemicals: 0
.: 1
inc: 1
##iner: 0
##ation: 1
:: 0


In [None]:
from transformers import BertTokenizer, BertForTokenClassification
import torch


model_name = 'bert-base-uncased'  
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForTokenClassification.from_pretrained(model_name)


text = """
1. Environmental Impact Analysis
Plastics have a significant environmental impact due to their persistence in the environment. 
They can take hundreds of years to decompose, leading to long-term pollution. Plastics often end up in oceans, 
harming marine life through ingestion and entanglement. They also contribute to microplastic pollution, 
which can enter the food chain and pose health risks to humans and wildlife.
"""


inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)


with torch.no_grad():
    outputs = model(**inputs)

logits = outputs.logits
predictions = torch.argmax(logits, dim=-1)

tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].squeeze().tolist())

label_map = model.config.id2label

#  results
result = []


for token, label_id in zip(tokens, predictions.squeeze().tolist()):
    
    if token.startswith("##"):
        
        result[-1][0] += token[2:]
        result[-1][1] = label_map.get(label_id, 'Unknown')  
    else:
        result.append([token, label_map.get(label_id, 'Unknown')])


for token, label in result:
    print(f"{token}: {label}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[CLS]: LABEL_1
1: LABEL_0
.: LABEL_0
environmental: LABEL_1
impact: LABEL_1
analysis: LABEL_1
plastics: LABEL_1
have: LABEL_0
a: LABEL_0
significant: LABEL_1
environmental: LABEL_0
impact: LABEL_0
due: LABEL_0
to: LABEL_0
their: LABEL_0
persistence: LABEL_0
in: LABEL_0
the: LABEL_0
environment: LABEL_0
.: LABEL_1
they: LABEL_0
can: LABEL_0
take: LABEL_0
hundreds: LABEL_0
of: LABEL_0
years: LABEL_0
to: LABEL_0
decompose: LABEL_0
,: LABEL_1
leading: LABEL_0
to: LABEL_0
long: LABEL_0
-: LABEL_0
term: LABEL_0
pollution: LABEL_1
.: LABEL_1
plastics: LABEL_1
often: LABEL_0
end: LABEL_0
up: LABEL_0
in: LABEL_0
oceans: LABEL_0
,: LABEL_0
harming: LABEL_0
marine: LABEL_0
life: LABEL_0
through: LABEL_0
ingestion: LABEL_1
and: LABEL_0
entanglement: LABEL_1
.: LABEL_1
they: LABEL_0
also: LABEL_0
contribute: LABEL_0
to: LABEL_0
microplastic: LABEL_0
pollution: LABEL_1
,: LABEL_1
which: LABEL_0
can: LABEL_0
enter: LABEL_0
the: LABEL_0
food: LABEL_0
chain: LABEL_0
and: LABEL_0
pose: LABEL_0
health: L

In [46]:
import torch
model.save_pretrained("/kaggle/working/bert_model")
