# El siguiente código contiene los pasos para:

### 1. Descargar los datos de entrenamiento, validación y test
### 2. Cargar los datos modificados para identificar datos de fecha
### 3. Asignar la nueva estructura de datos con las nuevas etiquetas a los datasets modificados
### 4. Entrenar modelo con estos datasets modificados. El código de entrenamiento se toma del tutorial:
     ### https://github.com/laxmimerit/NLP-Tutorials-with-HuggingFace/tree/main

## Código de entrenamiento del modelo "distilbert-tuned-4labels"


In [1]:
import pandas as pd
from datasets import load_dataset

## Se carga el dataset conllpp

In [2]:
# Load the CoNLL++ dataset twice: `data` is reassigned by later cells, while
# `data3` is the copy whose NER label schema is extended further down.
data = load_dataset('conllpp',trust_remote_code=True)
data3= load_dataset('conllpp',trust_remote_code=True)
data


DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

## Se descargan los datasets en formato csv

In [9]:
# Load each CoNLL++ split separately and export it to its own CSV file
# (train.csv, test.csv, validation.csv) in the current working directory.
_t = load_dataset('conllpp',trust_remote_code=True,split='train')
_t.to_csv('train.csv')
_tt = load_dataset('conllpp',trust_remote_code=True,split='test')
_tt.to_csv('test.csv')
_v = load_dataset('conllpp',trust_remote_code=True,split='validation')
_v.to_csv('validation.csv')


Creating CSV from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

848380

## Se descarga a disco el dataset completo

In [3]:
data.save_to_disk("C:/User/NW/Documents")
# This stores the data in Arrow format (not used in this project).

Saving the dataset (0/1 shards):   0%|          | 0/14041 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3250 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3453 [00:00<?, ? examples/s]

## Se cargan los datasets modificados, identificando los datos de fecha

In [4]:

# Load the modified CSV splits in which date tokens were re-labelled.
# NOTE(review): these relative paths look like they lost their path separators
# (something like '../csv/train_mod.csv') — confirm before running.
data_train = pd.read_csv('..csv_train_mod.csv')
data_test = pd.read_csv('..csv_test_mod.csv')
data_val = pd.read_csv('..csv_validation_mod.csv')

In [5]:
from datasets import Dataset, concatenate_datasets
from datasets import load_dataset, DatasetDict, ClassLabel, Features, Sequence, Value
# Group the three freshly read DataFrames under split names to inspect them.
# NOTE(review): the values here are pandas DataFrames (from pd.read_csv), not
# `datasets.Dataset` objects; `data` is rebuilt from real Dataset objects in a
# later cell.
data = DatasetDict({
    'train': data_train,
    'validation': data_val,
    'test': data_test
})
print(data)

DatasetDict({
    train:           id                                             tokens  \
    0          0  ['EU' 'rejects' 'German' 'call' 'to' 'boycott'...   
    1          1                              ['Peter' 'Blackburn']   
    2          2                          ['BRUSSELS' '1996-08-22']   
    3          3  ['The' 'European' 'Commission' 'said' 'on' 'Th...   
    4          4  ['Germany' "'s" 'representative' 'to' 'the' 'E...   
    ...      ...                                                ...   
    14036  14036                                ['on' 'Friday' ':']   
    14037  14037                                 ['Division' 'two']   
    14038  14038                     ['Plymouth' '2' 'Preston' '1']   
    14039  14039                               ['Division' 'three']   
    14040  14040                      ['Swansea' '1' 'Lincoln' '2']   
    
                                                    pos_tags  \
    0          ['22' '42' '16' '21' '35' '37' '16' '21' '7

## Agregar etiquetas B-DATE, I-DATE
### se agrega en Features del dataset conllpp ya cargado

In [6]:
from datasets import load_dataset, DatasetDict, ClassLabel, Features, Sequence, Value
# ClassLabel feature that describes the NER tags of the original dataset.
original_labels = data3['train'].features['ner_tags'].feature

original_label_list = original_labels.names


# Append the new DATE labels after the original ones, so the original label
# ids keep their positions.
new_label_list = original_label_list + ['B-DATE', 'I-DATE']


# Define the new ClassLabel with the extended label list.
new_class_label = ClassLabel(names=new_label_list)

# Feature schema in which ner_tags uses the extended label set.
new_features = Features({
    'id': Value(dtype='string'),
    'tokens': Sequence(Value(dtype='string')),
    'pos_tags': Sequence(Value(dtype='string')),
    'chunk_tags': Sequence(Value(dtype='string')),
    'ner_tags': Sequence(new_class_label),  # <-------- only ner_tags gets the new label set
})

# Se genera un  nuevo dataset con las características actualizadas
def update_ner_tags(example, class_label):
    """Re-encode the ``ner_tags`` entry of ``example`` with ``class_label``.

    Each element of ``example['ner_tags']`` is passed through
    ``class_label.str2int`` and the resulting list replaces the original
    value. ``example`` is modified in place and is also returned.
    """
    example['ner_tags'] = list(map(class_label.str2int, example['ner_tags']))
    return example
    
# Run update_ner_tags over every split; batched=True means each call receives
# a batch of examples rather than a single one.
data3 = data3.map(lambda x: update_ner_tags(x, new_class_label), batched=True)

# Apply the extended feature schema to the loaded conllpp dataset.
data3 = data3.cast(new_features)

# Show the updated dataset.
print(data3)
# Show the new label set.
print(data3['train'].features['ner_tags'].feature)

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})
ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC', 'B-DATE', 'I-DATE'], id=None)


## transformación de archivos modificados en dataset asociados a feature

In [21]:
import pandas as pd
import re
from datasets import Dataset, Features, Sequence, Value, ClassLabel
import pandas as pd
import ast
from datasets import Dataset, Features, Sequence, Value, ClassLabel

# Reuse the feature schema of data3 (which already has the DATE labels) for
# the datasets built from the modified CSV files.
carac=data3['train'].features
print(carac)
#dataset = dataset.cast(carac)

features = Features(carac)
print(features)
#---------------------------------------------
from json import JSONEncoder
import codecs, json 
from array import array

# The three modified datasets loaded as data_train, data_test and data_val
# are turned into lists of per-row dictionaries.
lista = data_train.to_dict(orient='records') 
lista2 = data_test.to_dict(orient='records') 
lista3 = data_val.to_dict(orient='records') 

print('dict train [0]')
print(lista[0])

# The row dictionaries are turned back into DataFrames.
df_train = pd.DataFrame(lista)
df_test = pd.DataFrame(lista2)
df_val = pd.DataFrame(lista3)

def convertir_ner_tags(tags_str):
    """Parse a stringified tag array such as ``'[3 0 7]'`` into a list of ints.

    Every character that is neither a digit nor whitespace (brackets, quotes,
    commas, ...) is removed first; the remaining whitespace-separated chunks
    are then converted with ``int``.
    """
    digits_and_spaces = re.sub(r'[^\d\s]', '', tags_str)
    return [int(chunk) for chunk in digits_and_spaces.split()]

def convertir_tokens(tokens_str):
    """Parse a stringified token array into a list of token strings.

    The input is the text form of a string array, e.g.
    ``['EU' 'rejects' "'s"]``: tokens are separated by whitespace and each one
    is wrapped in quotes. The surrounding brackets are stripped, the text is
    split on whitespace, and each piece is unquoted on its own, so apostrophes
    that belong to a token (``"'s"`` -> ``'s``) are preserved instead of being
    deleted.

    Args:
        tokens_str: Stringified array of tokens.

    Returns:
        List of tokens in their original order; ``[]`` for an empty array.
    """
    tokens = []
    for piece in tokens_str.strip("[]").split():
        is_quoted = len(piece) >= 2 and piece[0] == piece[-1] and piece[0] in "'\""
        if is_quoted:
            try:
                # literal_eval also resolves escapes such as '\\' or '\''.
                parsed = ast.literal_eval(piece)
            except (ValueError, SyntaxError):
                parsed = None
            # Fall back to dropping the outer quotes if the piece is not a
            # well-formed string literal.
            piece = parsed if isinstance(parsed, str) else piece[1:-1]
        tokens.append(piece)
    return tokens
# Aplicar la función a la columna

# Convertir las columnas de listas de strings a listas reales
def convertir(df):
    """Convert the stringified list columns of ``df`` into real Python lists.

    ``tokens`` is parsed with ``convertir_tokens``; ``pos_tags``,
    ``chunk_tags`` and ``ner_tags`` are parsed with ``convertir_ner_tags``.
    The DataFrame is modified in place and also returned.
    """
    column_parsers = (
        ('tokens', convertir_tokens),
        ('pos_tags', convertir_ner_tags),
        ('chunk_tags', convertir_ner_tags),
        ('ner_tags', convertir_ner_tags),
    )
    for column, parser in column_parsers:
        df[column] = df[column].apply(parser)
    return df


print('df_train[0]')
print(df_train)
# THE DATA MUST GO FROM THIS STRUCTURE (whole list stored as one string):

#tokens:     ['EU' 'rejects' 'German' 'call' 'to' 'boycott'...
#pos_tags': "['22' '42' '16' '21' '35' '37' '16' '21' '7']"
#ner_tags': '[3 0 7 0 0 0 7 0 0]'
# TO THIS STRUCTURE (real lists with separate elements):

#tokens:     ['EU', 'rejects', 'German', 'call', 'to', 'boycott'...
#pos_tags': "['22', '42', '16', '21', '35', '37','16', '21', '7']"
#ner_tags': '[3 0 7 0 0 0 7 0 0]'

# convertir() modifies the DataFrame it receives and returns that same object,
# so d_train/d_test/d_val refer to df_train/df_test/df_val.
d_train=convertir(df_train)
d_test=convertir(df_test)
d_val=convertir(df_val)

print('df_train tokens----------------------')
print(type(df_train['tokens'][0]))
print(d_train['tokens'][0])

# Convert the train, test and validation DataFrames into Dataset objects.
dataset_train = Dataset.from_pandas(d_train)
print(dataset_train)
dataset_test = Dataset.from_pandas(d_test)
print(dataset_test)
dataset_val = Dataset.from_pandas(d_val)
print(dataset_val)



# Apply the feature schema taken from the already-extended original dataset
# (the one with the DATE labels) to these new datasets.
dataset_train = dataset_train.cast(features) 
dataset_test = dataset_test.cast(features) 
dataset_val = dataset_val.cast(features) 

# Quick look at the first converted example.
dataset_train['tokens'][0]
dataset_train['ner_tags'][0]

{'id': Value(dtype='string', id=None), 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'pos_tags': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'chunk_tags': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC', 'B-DATE', 'I-DATE'], id=None), length=-1, id=None)}
{'id': Value(dtype='string', id=None), 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'pos_tags': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'chunk_tags': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC', 'B-DATE', 'I-DATE'], id=None), length=-1, id=None)}
dict train [0]
{'id': 0, 'tokens': "['EU' 'rejects' 'German' 'call' 'to' '

Casting the dataset:   0%|          | 0/14041 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/3453 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/3250 [00:00<?, ? examples/s]

[3, 0, 7, 0, 0, 0, 7, 0, 0]

## combinar dataset
### se integran los datasets modificados de train, test y validación en un Dataset integrado

In [22]:
from datasets import Dataset, concatenate_datasets
# Combine the three converted splits into one DatasetDict; this replaces the
# earlier value of `data`.
data = DatasetDict({
    'train': dataset_train,
    'validation': dataset_val,
    'test': dataset_test
})
data
print(data)

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})


In [23]:
pd.DataFrame(data['train'][:])[['tokens', 'ner_tags']].iloc[0]

tokens      [EU, rejects, German, call, to, boycott, Briti...
ner_tags                          [3, 0, 7, 0, 0, 0, 7, 0, 0]
Name: 0, dtype: object

In [24]:
data['train']['tokens'][0]

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [28]:
# ClassLabel feature holding the NER tag vocabulary of the training split.
tags = data['train'].features['ner_tags'].feature

# Lookup tables between integer ids and tag names, in both directions.
index2tag = dict(enumerate(tags.names))
tag2index = {name: position for position, name in index2tag.items()}

In [29]:
print(index2tag) # se verifica que existan las nuevas etiquetas B-DATE y I-DATE

{0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC', 9: 'B-DATE', 10: 'I-DATE'}


In [31]:
def create_tag_names(batch):
  """Return a ``ner_tags_str`` column with the tag name of every tag id.

  Each element of ``batch['ner_tags']`` is converted with the module-level
  ``tags.int2str``.
  """
  names = [tags.int2str(tag_id) for tag_id in batch['ner_tags']]
  return {'ner_tags_str': names}

In [32]:
data = data.map(create_tag_names)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [33]:
data

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'ner_tags_str'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'ner_tags_str'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'ner_tags_str'],
        num_rows: 3453
    })
})

In [34]:
pd.DataFrame(data['train'][:])[['tokens', 'ner_tags', 'ner_tags_str']].iloc[0]

tokens          [EU, rejects, German, call, to, boycott, Briti...
ner_tags                              [3, 0, 7, 0, 0, 0, 7, 0, 0]
ner_tags_str            [B-ORG, O, B-MISC, O, O, O, B-MISC, O, O]
Name: 0, dtype: object

## Construcción del modelo

### Tokenization

In [36]:
from transformers import AutoTokenizer

# Base checkpoint name; reused later when the classification model is loaded.
modelo = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(modelo)

In [133]:
tokenizer.is_fast

True

In [134]:
data['train'][1]

{'id': '1',
 'tokens': ['LONDON', '1996-08-30'],
 'pos_tags': ['22', '11'],
 'chunk_tags': ['11', '12'],
 'ner_tags': [5, 9],
 'ner_tags_str': ['B-LOC', 'B-DATE']}

In [135]:
# Tokenize the first training sentence; it is already split into words, hence
# is_split_into_words=True. `inputs` is first the word list, then the encoding.
inputs = data['train'][0]['tokens']
inputs = tokenizer(inputs, is_split_into_words=True)
print(inputs.tokens())

['[CLS]', 'CR', '##IC', '##KE', '##T', '-', 'L', '##EI', '##CE', '##ST', '##ER', '##S', '##H', '##IR', '##E', 'T', '##A', '##KE', 'O', '##VE', '##R', 'AT', 'TO', '##P', 'A', '##FT', '##ER', 'IN', '##NI', '##NG', '##S', 'VI', '##CT', '##OR', '##Y', '.', '[SEP]']


In [136]:
print(data['train'][0]['tokens'])
print(data['train'][0]['ner_tags_str'])


['CRICKET', '-', 'LEICESTERSHIRE', 'TAKE', 'OVER', 'AT', 'TOP', 'AFTER', 'INNINGS', 'VICTORY', '.']
['O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [138]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per sub-word token.

    ``word_ids`` gives, for every token, the index of the word it came from,
    or ``None`` for special tokens. Special tokens receive ``-100``. The first
    token of a word receives that word's label. Further tokens of the same
    word receive the same label, except that an odd label id is bumped to the
    next id (in this label scheme B-XXX ids are odd and the matching I-XXX id
    follows them).
    """
    aligned = []
    previous_word = None
    for word_index in word_ids:
        if word_index is None:
            # Special token: emit the ignore value and reset the word tracker.
            previous_word = None
            aligned.append(-100)
            continue

        tag = labels[word_index]
        if word_index != previous_word:
            # First token of a new word keeps the word's own label.
            previous_word = word_index
        elif tag % 2 == 1:
            # Continuation token of a word labelled with an odd id.
            tag = tag + 1
        aligned.append(tag)

    return aligned


In [139]:
# Word-level labels of the first training sentence next to the token-to-word
# mapping of its encoding computed above.
labels = data['train'][0]['ner_tags']
word_ids = inputs.word_ids()
print(labels, word_ids)


[0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0] [None, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 6, 6, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, None]


In [140]:
align_labels_with_tokens(labels, word_ids)

[-100,
 0,
 0,
 0,
 0,
 0,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 -100]

In [141]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach aligned labels.

    ``examples['tokens']`` is tokenized (with truncation) by the module-level
    ``tokenizer``. For every sentence in the batch, its ``ner_tags`` are
    expanded to token level with ``align_labels_with_tokens`` and the result
    is stored under ``'labels'`` in the returned encoding.
    """
    encoded = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)

    encoded['labels'] = [
        align_labels_with_tokens(sentence_tags, encoded.word_ids(row))
        for row, sentence_tags in enumerate(examples['ner_tags'])
    ]
    return encoded


In [142]:
# Tokenize every split in batches; all original columns are removed so only
# the tokenizer outputs and the aligned `labels` remain.
tokenized_datasets = data.map(tokenize_and_align_labels, batched=True, remove_columns=data['train'].column_names)

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [143]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

In [144]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

### Cotejo de datos y métricas

In [37]:
from transformers import DataCollatorForTokenClassification

# Collator that turns a list of tokenized examples into one padded batch.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [146]:
# Collate the first two training examples to inspect the padded batch.
batch = data_collator([tokenized_datasets['train'][i] for i in range(2)])
batch

{'input_ids': tensor([[  101, 15531,  9741, 22441,  1942,   118,   149, 27514, 10954,  9272,
          9637,  1708,  3048, 18172,  2036,   157,  1592, 22441,   152, 17145,
          2069, 13020, 16972,  2101,   138, 26321,  9637, 15969, 27451, 11780,
          1708,  7118, 16647,  9565,  3663,   119,   102],
        [  101,   149, 11414,  2137, 11414,  1820,   118,  4775,   118,  1476,
           102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'labels': tensor([[-100,    0,    0,    0,    0,    0,    3,    4,    4,    4,    4,    4,
            4,    4, 

### Metricas

In [147]:
# Uncomment to install the metric dependencies:
#!pip install seqeval
#!pip install evaluate

import evaluate
# seqeval metric object, used by compute_metrics below.
metric = evaluate.load('seqeval')

In [148]:
# Feature definition of the ner_tags column of the training split.
ner_feature = data['train'].features['ner_tags']
ner_feature

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC', 'B-DATE', 'I-DATE'], id=None), length=-1, id=None)

In [149]:
# Tag names in id order; label_names[i] is the name of label id i.
label_names = ner_feature.feature.names
label_names

['O',
 'B-PER',
 'I-PER',
 'B-ORG',
 'I-ORG',
 'B-LOC',
 'I-LOC',
 'B-MISC',
 'I-MISC',
 'B-DATE',
 'I-DATE']

In [150]:
# Gold tag names of the first training sentence.
labels = data['train'][0]['ner_tags']
labels = [label_names[i] for i in labels]
labels

['O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

In [151]:
# Sanity check of the metric: copy the gold labels, replace the label at
# index 2 with "O", and score that against the gold labels.
predictions = labels.copy()
predictions[2] = "O"

metric.compute(predictions=[predictions], references=[labels])

  _warn_prf(average, modifier, msg_start, len(result))


{'ORG': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1},
 'overall_precision': 0.0,
 'overall_recall': 0.0,
 'overall_f1': 0.0,
 'overall_accuracy': 0.9090909090909091}

In [152]:
import numpy as np

def compute_metrics(eval_preds):
    """Compute overall precision, recall, F1 and accuracy for an evaluation.

    ``eval_preds`` unpacks into ``(logits, labels)``. Predicted ids are the
    argmax of the logits over the last axis. Positions whose gold label is
    ``-100`` are dropped from both the references and the predictions before
    ids are mapped to names through ``label_names`` and scored with the
    module-level ``metric``.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Gold tag names, skipping the ignored (-100) positions.
    true_labels = []
    for gold_row in labels:
        true_labels.append([label_names[tag] for tag in gold_row if tag != -100])

    # Predicted tag names at exactly the positions that were kept above.
    true_predictions = []
    for predicted_row, gold_row in zip(predictions, labels):
        kept = [label_names[guess]
                for guess, tag in zip(predicted_row, gold_row)
                if tag != -100]
        true_predictions.append(kept)

    scores = metric.compute(predictions=true_predictions, references=true_labels)

    return {
        "precision": scores['overall_precision'],
        "recall": scores['overall_recall'],
        "f1": scores['overall_f1'],
        "accuracy": scores['overall_accuracy'],
    }

### Entrenamiento del modelo

In [153]:
# Mappings between label ids and label names, passed to the model config.
id2label = dict(enumerate(label_names))
label2id = {name: position for position, name in id2label.items()}

In [154]:
print(id2label)
print(label2id)

{0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC', 9: 'B-DATE', 10: 'I-DATE'}
{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8, 'B-DATE': 9, 'I-DATE': 10}


In [158]:
from transformers import AutoModelForTokenClassification

# Load the base checkpoint with a token-classification head; the label
# mappings are passed so the model config carries the 11 tag names.
model = AutoModelForTokenClassification.from_pretrained(
                                                    modelo,
                                                    id2label=id2label,
                                                    label2id=label2id)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [207]:
model.config.id2label

{0: 'O',
 1: 'B-PER',
 2: 'I-PER',
 3: 'B-ORG',
 4: 'I-ORG',
 5: 'B-LOC',
 6: 'I-LOC',
 7: 'B-MISC',
 8: 'I-MISC',
 9: 'B-DATE',
 10: 'I-DATE'}

In [204]:
from transformers import TrainingArguments
from transformers import TrainingArguments, get_linear_schedule_with_warmup
# Training configuration; "distilbert-finetuned-ner" is the output directory.
# Only the last group of keyword arguments is active. The two commented-out
# groups above it appear to be earlier hyperparameter experiments.
args = TrainingArguments("distilbert-finetuned-ner",
                         #evaluation_strategy = "epoch",
                         #save_strategy="epoch",
                         #learning_rate = 2e-5,
                         #num_train_epochs=10,
                         #weight_decay=0.01
                         
                        #evaluation_strategy="steps",
                        #save_strategy="steps",
                        #learning_rate=5e-5,  # Try a higher learning rate
                        #num_train_epochs=15,  # Consider increasing the number of epochs
                        #weight_decay=0.01,  # Tune the weight decay
                        #logging_dir='./logs',
                        #logging_steps=500,
                        #per_device_train_batch_size=16,  # Increase the batch size
                        #per_device_eval_batch_size=16,
                        #warmup_steps=500,  # Tune the warmup steps
                        #dropout=0.3,  # Consider tuning the dropout rate
                        #load_best_model_at_end=True,
                        #metric_for_best_model="f1",
                        #early_stopping_patience=3  # Implement early stopping
                         
                        evaluation_strategy="steps",
                        save_strategy="steps",
                        learning_rate=1e-5,  # Try a lower learning rate
                        num_train_epochs=10,  # Adjust as needed
                        weight_decay=0.01,  # Review the weight decay value
                        logging_dir='./logs',
                        logging_steps=500,
                        per_device_train_batch_size=16,  # Adjust the batch size as needed
                        per_device_eval_batch_size=16,
                        warmup_steps=500,  # Tune the warmup steps
                        load_best_model_at_end=True,
                        metric_for_best_model="f1",  # "f1" is one of the keys returned by compute_metrics
                           
 )  

In [198]:
from transformers import Trainer
# Wire together the model, training arguments, tokenized train/validation
# splits, collator and metric function, then run fine-tuning.
trainer = Trainer(model=model,
                  args=args,
                  train_dataset = tokenized_datasets['train'],
                  eval_dataset = tokenized_datasets['validation'],
                  data_collator=data_collator,
                  compute_metrics=compute_metrics,
                  tokenizer=tokenizer)

trainer.train()

Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
500,0.0011,4.008243,0.022533,0.108051,0.03729,0.533103
1000,0.0007,4.165921,0.022421,0.10452,0.036922,0.542314
1500,0.0005,4.323434,0.022608,0.109463,0.037476,0.523563
2000,0.0006,4.347162,0.022072,0.108051,0.036655,0.523344


TrainOutput(global_step=2040, training_loss=0.0007622575789105659, metrics={'train_runtime': 97.1459, 'train_samples_per_second': 334.548, 'train_steps_per_second': 20.999, 'total_flos': 451256342778408.0, 'train_loss': 0.0007622575789105659, 'epoch': 10.0})

In [None]:
#Pérdida (Loss)
#Pérdida de Entrenamiento: La pérdida de entrenamiento está disminuyendo lentamente, lo que es una señal de que el modelo está aprendiendo. Sin embargo, la tasa de disminución parece ser muy baja.
#Pérdida de Validación: La pérdida de validación está subiendo o se está manteniendo constante mientras que la pérdida de entrenamiento disminuye. Esto puede ser un signo de sobreajuste (overfitting), donde el modelo está aprendiendo bien los datos de entrenamiento pero no generaliza bien a los datos de validación.
#Métricas (Precision, Recall, F1, Accuracy)
#Precision: La precisión (precision) muestra valores bajos en todos los puntos de control, lo que sugiere que el modelo está identificando correctamente un pequeño porcentaje de las etiquetas positivas en comparación con el total de etiquetas positivas predichas.
#Recall: El recall también es bajo, indicando que el modelo está capturando solo un pequeño porcentaje de las etiquetas positivas reales.
#F1 Score: El F1 Score, que es la media armónica de precisión y recall, sigue un patrón similar al recall, y también es bajo.
#Accuracy: La exactitud (accuracy) es una medida general de la proporción de predicciones correctas sobre el total de predicciones. En este caso, la exactitud es moderadamente más alta en comparación con la precisión y el recall, pero aún no es ideal.

## Prueba del nuevo modelo entrenado cuyo nombre es distilbert-tuned-4labels

In [210]:
# `pipeline` is imported here so this cell also works when the notebook is run
# from top to bottom; its only other import is in a later cell.
from transformers import pipeline

# Build a token-classification pipeline from the "distilbert-tuned-4labels"
# checkpoint (the directory name used by the save_pretrained calls in this
# notebook). aggregation_strategy="simple" groups tokens into entity spans.
checkpoint = "distilbert-tuned-4labels"
token_classifier = pipeline(
    "token-classification", model=checkpoint, aggregation_strategy="simple"
)

token_classifier("My name is James Cameron. I work at Microsoft  and live in Berlin since April 21 1999")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'entity_group': 'PER',
  'score': 0.99999475,
  'word': 'James Cameron',
  'start': 11,
  'end': 24},
 {'entity_group': 'ORG',
  'score': 0.99989915,
  'word': 'Microsoft',
  'start': 36,
  'end': 45},
 {'entity_group': 'LOC',
  'score': 0.9999927,
  'word': 'Berlin',
  'start': 59,
  'end': 65},
 {'entity_group': 'DATE',
  'score': 0.7863904,
  'word': 'April 21',
  'start': 72,
  'end': 80}]

## se almacena el modelo entrenado localmente

In [215]:
# Save the model and the tokenizer into the same local directory so it can be
# loaded later by name as a checkpoint.
model.save_pretrained('distilbert-tuned-4labels')
tokenizer.save_pretrained('distilbert-tuned-4labels')

('distilbert-tuned-4labels\\tokenizer_config.json',
 'distilbert-tuned-4labels\\special_tokens_map.json',
 'distilbert-tuned-4labels\\vocab.txt',
 'distilbert-tuned-4labels\\added_tokens.json',
 'distilbert-tuned-4labels\\tokenizer.json')

## prueba pipeline con nuevo modelo local distilbert-tuned-4labels

In [8]:
from transformers import pipeline
# Load the locally saved checkpoint into a token-classification pipeline and
# run it on a sample sentence containing a person, organisation, place and date.
checkpoint = "distilbert-tuned-4labels"
token_classifier = pipeline(
    "token-classification", model=checkpoint, aggregation_strategy="simple"
)

token_classifier("My name is James Cameron. I work at Microsoft and live in Berlin since April 21 1999")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'entity_group': 'PER',
  'score': 0.99999475,
  'word': 'James Cameron',
  'start': 11,
  'end': 24},
 {'entity_group': 'ORG',
  'score': 0.99989915,
  'word': 'Microsoft',
  'start': 36,
  'end': 45},
 {'entity_group': 'LOC',
  'score': 0.9999927,
  'word': 'Berlin',
  'start': 58,
  'end': 64},
 {'entity_group': 'DATE',
  'score': 0.7863904,
  'word': 'April 21',
  'start': 71,
  'end': 79}]

In [39]:
import pandas as pd
from json import JSONEncoder
def obtener_dataframe(data):
    """Build a DataFrame with one row per item of ``data``.

    Every item is flattened with ``flatten_json`` first, so nested keys become
    individual columns.
    """
    flat_rows = list(map(flatten_json, data))
    return pd.DataFrame(flat_rows)
    ###
    ### funcion "flatten_json" tomada de https://levelup.gitconnected.com/a-deep-dive-into-nested-json-to-data-frame-with-python-69bdabb41938 
    ### Renu Khandelwal Jul 23, 2023
def flatten_json(y):
    """Flatten nested dicts and lists into a single-level dict.

    Each leaf value is stored under a key made of the path that leads to it,
    joined with ``_``; list elements contribute their position. Example::

        {'a': {'b': 1}, 'c': [5, {'d': 6}]}
        -> {'a_b': 1, 'c_0': 5, 'c_1_d': 6}

    Args:
        y: A dict, list or scalar, typically the result of ``json.load``.

    Returns:
        A new flat dict. Empty dicts and empty lists contribute no entry. A
        scalar ``y`` is returned under the empty key ``''``.

    Dict keys are converted with ``str`` so non-string keys no longer fail
    during key concatenation. Errors are not caught here: the previous
    handlers printed a message and returned ``None``, which only moved the
    failure into the caller.
    """
    out = {}

    def flatten(node, prefix=''):
        if isinstance(node, dict):
            for key, value in node.items():
                flatten(value, prefix + str(key) + '_')
        elif isinstance(node, list):
            for position, item in enumerate(node):
                flatten(item, prefix + str(position) + '_')
        else:
            # Drop the trailing '_' that was added for the last path element.
            out[prefix[:-1]] = node

    flatten(y)
    return out

In [10]:
from transformers import pipeline
# Re-create the token-classification pipeline from the locally saved
# checkpoint before running it on the longer sample text below.
checkpoint = "distilbert-tuned-4labels"
token_classifier = pipeline(
    "token-classification", model=checkpoint, aggregation_strategy="simple"
)

text="""So, if you're a NASA scientist, you should be able to tell me the whole story about the Face On Mars, which obviously is evidence that there is life on Mars, and that the face was created by aliens, correct?" No, twenty five years ago, our Viking 1 spacecraft was circling the planet, snapping photos, when it spotted the shadowy likeness of a human face. Us scientists figured out that it was just another Martian mesa, common around Cydonia, only this one had shadows that made it look like an Egyption Pharaoh. Very few days later, we revealed the image for all to see, and we made sure to note that it was a huge rock formation that just resembled a human head and face, but all of it was formed by shadows. We only announced it because we thought it would be a good way to engage the public with NASA's findings, and atrract attention to Mars-- and it did.

The face on Mars soon became a pop icon; shot in movies, appeared in books, magazines, radio talk shows, and haunted grocery store checkout lines for 25 years. Some people thought the natural landform was evidence of life on Mars, and that us scientists wanted to hide it, but really, the defenders of the NASA budget wish there was ancient civilization on Mars. We decided to take another shot just to make sure we weren't wrong, on April 5, 1998. Michael Malin and his Mars Orbiter camera team took a picture that was ten times sharper than the original Viking photos, revealing a natural landform, which meant no alien monument. "But that picture wasn't very clear at all, which could mean alien markings were hidden by haze" Well no, yes that rumor started, but to prove them wrong on April 8, 2001 we decided to take another picture, making sure it was a cloudless summer day. Malin's team captured an amazing photo using the camera's absolute maximum revolution. With this camera you can discern things in a digital image, 3 times bigger than the pixel size which means if there were any signs of life, you could easily see what they were. What the picture showed was the butte or mesa, which are landforms common around the American West."""

token_classifier(text)



Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'entity_group': 'ORG',
  'score': 0.9631609,
  'word': 'NASA',
  'start': 16,
  'end': 20},
 {'entity_group': 'MISC',
  'score': 0.9983225,
  'word': 'Face On Mars',
  'start': 88,
  'end': 100},
 {'entity_group': 'LOC',
  'score': 0.81088793,
  'word': 'Mars',
  'start': 152,
  'end': 156},
 {'entity_group': 'MISC',
  'score': 0.9847041,
  'word': 'Viking 1',
  'start': 240,
  'end': 248},
 {'entity_group': 'MISC',
  'score': 0.9998406,
  'word': 'Martian',
  'start': 407,
  'end': 414},
 {'entity_group': 'LOC',
  'score': 0.7726369,
  'word': 'Cydonia',
  'start': 435,
  'end': 442},
 {'entity_group': 'MISC',
  'score': 0.8863019,
  'word': 'Egyption Phara',
  'start': 496,
  'end': 510},
 {'entity_group': 'PER',
  'score': 0.7494665,
  'word': '##oh',
  'start': 510,
  'end': 512},
 {'entity_group': 'ORG',
  'score': 0.88716674,
  'word': 'NASA',
  'start': 801,
  'end': 805},
 {'entity_group': 'LOC',
  'score': 0.9616925,
  'word': 'Mars',
  'start': 843,
  'end': 847},
 {'entity

In [40]:
# Group the output of the newly trained model.
import json
# NOTE: `data` is rebound here to the parsed JSON content; it no longer refers
# to the DatasetDict used earlier in the notebook.
with open("json_entrenado.json", encoding='utf-8') as f:
    data = json.load(f)
aux=obtener_dataframe(data)
aux
# Number of detected entities per entity group, then per (group, word) pair.
print(aux.groupby(['entity_group']).size())
aux.groupby(['entity_group', 'word']).size()

entity_group
DATE    4
LOC     6
MISC    9
ORG     3
PER     2
dtype: int64


entity_group  word          
DATE          1998              1
              2001              1
              April 5           1
              April 8           1
LOC           Cydonia           1
              Mars              5
MISC          American          1
              Egyption Phara    1
              Face On Mars      1
              Malin             1
              Mars Orbiter      1
              Martian           1
              Viking            1
              Viking 1          1
              West              1
ORG           NASA              3
PER           ##oh              1
              Michael Malin     1
dtype: int64

# Prueba del modelo local distilbert-tuned-4labels


In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch


# Load the tokenizer and model from the locally saved checkpoint directory.
model_name = "distilbert-tuned-4labels" 
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

text = "My name is James Cameron. I work at Microsoft and live in Berlin since April 21 1999"

# Tokenize the text and return PyTorch tensors.
inputs = tokenizer(text, return_tensors="pt")
# Run the model without tracking gradients (inference only).
with torch.no_grad():
    outputs = model(**inputs)
# Raw per-token scores for every label.
logits = outputs.logits

# Highest-scoring label id for every token.
predictions = torch.argmax(logits, dim=2)

# Map the label ids of the single input sentence to label names.
label_ids = predictions[0].tolist()
labels = [model.config.id2label[label_id] for label_id in label_ids]

# Print every token next to its predicted label (special tokens included).
tokens = tokenizer.convert_ids_to_tokens(inputs.input_ids[0])
for token, label in zip(tokens, labels):
    print(f"Token: {token}, Label: {label}")    

Token: [CLS], Label: O
Token: My, Label: O
Token: name, Label: O
Token: is, Label: O
Token: James, Label: B-PER
Token: Cameron, Label: I-PER
Token: ., Label: O
Token: I, Label: O
Token: work, Label: O
Token: at, Label: O
Token: Microsoft, Label: B-ORG
Token: and, Label: O
Token: live, Label: O
Token: in, Label: O
Token: Berlin, Label: B-LOC
Token: since, Label: O
Token: April, Label: B-DATE
Token: 21, Label: I-DATE
Token: 1999, Label: O
Token: [SEP], Label: O
