# Getting NER tags from camembert-ner

https://huggingface.co/Jean-Baptiste/camembert-ner

In [None]:
! pip install transformers datasets # HuggingFace 🤗
! pip install sentencepiece # Required for Camembert-ner (slow tokenizer)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m71.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m56.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m


In [None]:
# Upload the input file(s) from the local machine into the Colab session
from google.colab import files

uploaded = files.upload()

# Report the name and size of every uploaded file
for name, content in uploaded.items():
    message = 'User uploaded file "{name}" with length {length} bytes'.format(
        name=name, length=len(content))
    print(message)

## Libraries

In [None]:
import re

import pandas as pd
import torch
import transformers
from datasets import Dataset
from transformers import (AutoModelForTokenClassification,
                          AutoTokenizer,
                          pipeline)

## Functions

In [None]:
def df_to_iob(df, file):
    '''
    Convert camembert-ner predictions stored in a DataFrame (df)
    into the IOB format and save them as a txt (file).

    Parameters
    ----------
    df : pandas.DataFrame
        One row per text line. Columns read: 'line' (the raw text) and
        'entity_group', 'start', 'end' (parallel sequences giving, for each
        predicted entity, its label and its character offsets in 'line').
    file : str or path-like
        Destination path; it is overwritten if it already exists.

    Output
    ------
    One "<word> <tag>" pair per line, where the words are the
    whitespace-separated tokens of 'line'. The first word overlapping an
    entity is tagged 'B-<label>', the following overlapping words
    'I-<label>', every other word 'O'. A blank line follows each row.
    '''
    # Explicit UTF-8 so accented lyrics are written identically on every platform
    with open(file, 'w', encoding='utf-8') as f:
        for _, row in df.iterrows():
            text = row['line']

            # Real character span of each whitespace-delimited word. Taking the
            # offsets from the text itself (instead of rebuilding them from word
            # lengths) keeps them correct when the line contains repeated or
            # leading whitespace.
            spans = [(m.start(), m.end()) for m in re.finditer(r'\S+', text)]
            words = [text[s:e] for s, e in spans]
            tags = ['O'] * len(words)

            # Tag every word whose span overlaps the entity's [start, end) range
            for entity, start, end in zip(row['entity_group'], row['start'], row['end']):
                first = True
                for k, (word_start, word_end) in enumerate(spans):
                    if word_start >= end:
                        # Spans are sorted: no later word can overlap the entity
                        break
                    if word_end <= start:
                        # Word lies entirely before the entity
                        continue
                    tags[k] = ('B-' if first else 'I-') + entity
                    first = False

            # Pair a word and its tag and split each pair into a new line
            words_tags = [f"{word} {tag}\n" for word, tag in zip(words, tags)]
            f.write(''.join(words_tags) + '\n')

## Get NER tag predictions and create IOB files

In [None]:
# Load lyrics in a DataFrame (the first CSV column becomes the index)
LYRICS_CSV = '/content/jul_lyrics.csv'
df = pd.read_csv(LYRICS_CSV, index_col=0, header=0)

# Checkpoint shared by the model and its tokenizer
MODEL_NAME = "Jean-Baptiste/camembert-ner"

# Load model
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Use the first GPU when one is available, otherwise fall back to CPU (-1)
# so the notebook also runs on a runtime without an accelerator
device = 0 if torch.cuda.is_available() else -1

# Create pipeline; aggregated predictions carry 'entity_group', 'word', 'start' and 'end'
nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple", device=device)

In [None]:
def _predict_batch(batch):
    """Run the camembert-ner pipeline on one batch of lines and return the new column."""
    return {"predictions": nlp(batch['line'])}

# Wrap the DataFrame in a HuggingFace dataset and tag it batch by batch
dataset = Dataset.from_pandas(df)
dataset_preds = dataset.map(_predict_batch, batched=True, batch_size=128)

# Back to a DataFrame for easier processing
df = dataset_preds.to_pandas()

Map:   0%|          | 0/54493 [00:00<?, ? examples/s]



In [None]:
# Remove the previous index column "__index_level_0__" if it is present.
# errors="ignore" keeps the cell re-runnable once the column is gone.
df = df.drop(columns="__index_level_0__", errors="ignore")

# Extract the information inside the predictions column into new columns:
# each new column holds, per line, the list of that field over all predicted entities.
# Building one column at a time avoids creating a pd.Series for every row.
for field in ('entity_group', 'word', 'start', 'end'):
    df[field] = pd.Series(
        [[pred[field] for pred in preds] for preds in df['predictions']],
        index=df.index, dtype=object)
df

Unnamed: 0,line,predictions,entity_group,word,start,end
0,#FreestyleCassageDeNuques,"[{'end': 25, 'entity_group': 'MISC', 'score': ...",[MISC],[#FreestyleCassageDeNuques],[0],[25]
1,T'as vu l'instru comme j'la nique,[],[],[],[],[]
2,"Quand j'rappe, j'les mets en panique",[],[],[],[],[]
3,J'suis d'humeur volcanique,[],[],[],[],[]
4,"J'ai ma sacoche, faut pas qu'tu t'approches, j...",[],[],[],[],[]
...,...,...,...,...,...,...
54488,"La zone en personne, la zone en personne","[{'end': 7, 'entity_group': 'LOC', 'score': 0....","[LOC, MISC]","[La zone, la zone]","[0, 20]","[7, 28]"
54489,"Top chrono, à deux sur la moto",[],[],[],[],[]
54490,Asalto avec un bon poto,"[{'end': 6, 'entity_group': 'PER', 'score': 0....",[PER],[Asalto],[0],[6]
54491,"On veut le magot, Plata O Plomo","[{'end': 31, 'entity_group': 'LOC', 'score': 0...",[LOC],[Plata O Plomo],[17],[31]


In [None]:
# Split data in file order: first 60% train, next 20% test, last 20% dev
n_rows = len(df)
train_end = int(n_rows * 0.6)
test_end = int(n_rows * 0.8)

df_train = df.iloc[:train_end]
df_test = df.iloc[train_end:test_end]
df_dev = df.iloc[test_end:]

In [None]:
# Write each split to its own IOB-formatted text file
for split_df, out_path in ((df_train, '/content/train.txt'),
                           (df_test, '/content/test.txt'),
                           (df_dev, '/content/dev.txt')):
    df_to_iob(split_df, out_path)