# Installations

In [None]:
# Install the Hugging Face `datasets` package; the Imports cell pulls Dataset/DatasetDict from it.
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
# Install Hugging Face `transformers` (tokenizer, TFBertModel and the sentiment pipeline used below).
!pip install transformers



# Imports

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import os
import transformers

from transformers import TFAutoModelForTokenClassification, TFBertModel, AutoTokenizer
from transformers import pipeline

from datasets import Dataset, DatasetDict

from google.colab import drive

ModuleNotFoundError: No module named 'datasets'

# NER

## Datasets

In [None]:
# Install/upgrade the AWS CLI; it is used in the next cells to list and copy the MultiCoNER files from S3.
!pip install awscli --upgrade

Collecting awscli
  Downloading awscli-1.36.18-py3-none-any.whl.metadata (11 kB)
Collecting botocore==1.35.77 (from awscli)
  Downloading botocore-1.35.77-py3-none-any.whl.metadata (5.7 kB)
Collecting docutils<0.17,>=0.10 (from awscli)
  Downloading docutils-0.16-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting s3transfer<0.11.0,>=0.10.0 (from awscli)
  Downloading s3transfer-0.10.4-py3-none-any.whl.metadata (1.7 kB)
Collecting colorama<0.4.7,>=0.2.5 (from awscli)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting rsa<4.8,>=3.1.2 (from awscli)
  Downloading rsa-4.7.2-py3-none-any.whl.metadata (3.6 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from botocore==1.35.77->awscli)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Downloading awscli-1.36.18-py3-none-any.whl (4.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m58.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading botocore-1.35.77-py3-none-any.whl (13.2 MB)


In [None]:
# List the per-language folders under the MultiCoNER 2022 prefix (unsigned request, see --no-sign-request).
!aws s3 ls --no-sign-request s3://multiconer/multiconer2022/

                           PRE BN-Bangla/
                           PRE DE-German/
                           PRE EN-English/
                           PRE ES-Spanish/
                           PRE FA-Farsi/
                           PRE HI-Hindi/
                           PRE KO-Korean/
                           PRE MIX_Code_mixed/
                           PRE MULTI_Multilingual/
                           PRE NL-Dutch/
                           PRE RU-Russian/
                           PRE TR-Turkish/
                           PRE ZH-Chinese/


In [None]:
# Recursively copy the English files into ./multiconer2022/EN-English, the directory read by the parsing cell.
!aws s3 cp --no-sign-request s3://multiconer/multiconer2022/EN-English/ ./multiconer2022/EN-English --recursive

download: s3://multiconer/multiconer2022/EN-English/en_dev.conll to multiconer2022/EN-English/en_dev.conll
download: s3://multiconer/multiconer2022/EN-English/en_train.conll to multiconer2022/EN-English/en_train.conll
download: s3://multiconer/multiconer2022/EN-English/en_test.conll to multiconer2022/EN-English/en_test.conll


In [None]:
def parse_conll_file(filepath):
    """Read a CoNLL-style file into parallel token and label sequences.

    Every non-blank row is split on whitespace; its first field is used as the
    token and its last field as the label. A blank row terminates the current
    sentence. Rows whose first field is '#' are treated as metadata and
    skipped regardless of how many fields they contain.

    NOTE(review): a genuine '#' token row is dropped as well, which matches the
    previous behaviour of this function.

    Args:
        filepath: Path of a UTF-8 encoded CoNLL file.

    Returns:
        A tuple ``(sentences, labels)`` of two equally long lists; entry ``k``
        of each holds the tokens / labels of sentence ``k``.

    Raises:
        ValueError: If a non-metadata row has fewer than two fields.
    """
    sentences, labels = [], []
    sentence, label_seq = [], []

    with open(filepath, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            parts = line.split()

            if not parts:
                # Blank row: close the sentence collected so far (if any).
                if sentence:
                    sentences.append(sentence)
                    labels.append(label_seq)
                    sentence, label_seq = [], []
                continue

            # Check for metadata before touching the other fields, so a
            # metadata row never fails on its column count.
            if parts[0] == '#':
                continue

            if len(parts) < 2:
                raise ValueError(
                    f"{filepath}:{line_no}: expected at least a token and a label, got {line!r}"
                )

            sentence.append(parts[0])
            label_seq.append(parts[-1])

    # The file may end without a trailing blank row.
    if sentence:
        sentences.append(sentence)
        labels.append(label_seq)

    return sentences, labels

# Parse the three English splits downloaded above; each call returns
# (list of token lists, list of label lists).
X_train, y_train = parse_conll_file('./multiconer2022/EN-English/en_train.conll')
X_dev, y_dev = parse_conll_file('./multiconer2022/EN-English/en_dev.conll')
X_test, y_test = parse_conll_file('./multiconer2022/EN-English/en_test.conll')

In [None]:
# Number of parsed sentences per split.
print(f"Train sentences count: {len(X_train)}")
print(f"Dev sentences count: {len(X_dev)}")
print(f"Test sentences count: {len(X_test)}")

Train sentences count: 15300
Dev sentences count: 800
Test sentences count: 217818


In [None]:
# Longest sentence per split, counted in whitespace tokens (not BERT sub-tokens).
print(f"Longest train sentence: {max([len(sentence) for sentence in X_train])}")
print(f"Longest dev sentence: {max([len(sentence) for sentence in X_dev])}")
print(f"Longest test sentence: {max([len(sentence) for sentence in X_test])}")

Longest train sentence: 41
Longest dev sentence: 39
Longest test sentence: 49


## Create tf dataset

In [None]:
# Tag inventory used for training; a tag's position in this list is its class id.
_NER_TAGS = [
    'B-CORP', 'B-CW', 'B-GRP', 'B-LOC', 'B-PER', 'B-PROD',
    'I-CORP', 'I-CW', 'I-GRP', 'I-LOC', 'I-PER', 'I-PROD',
    'O',
]

# Forward (tag -> id) and reverse (id -> tag) lookups.
label_to_id = {tag: class_id for class_id, tag in enumerate(_NER_TAGS)}
id_to_label = dict(enumerate(_NER_TAGS))

In [None]:
def tokenize_and_align_labels(examples, tokenizer, max_length=64, label_map=None):
    """Tokenize pre-split sentences and align word-level labels to sub-tokens.

    Only the first sub-token of each word receives the word's label id; every
    other position (special tokens, padding, continuation sub-tokens) is set
    to -100.

    Args:
        examples: Batch mapping with "sentences" (list of word lists) and
            "labels" (list of label-string lists, parallel to the words).
        tokenizer: Callable tokenizer whose result exposes
            ``word_ids(batch_index=...)`` and supports item assignment.
        max_length: Length every sequence is padded / truncated to.
        label_map: Optional mapping from label string to integer id. Defaults
            to the notebook-level ``label_to_id`` so existing calls keep
            working, while allowing the function to be used without that
            global.

    Returns:
        The tokenizer output with an added "labels" entry holding one list of
        aligned label ids per sentence.
    """
    if label_map is None:
        label_map = label_to_id

    tokenized_inputs = tokenizer(
        examples["sentences"],
        truncation=True,
        padding="max_length",
        is_split_into_words=True,
        max_length=max_length,
    )

    labels = []
    for i, label_seq in enumerate(examples["labels"]):
        aligned_labels = []
        previous_word_idx = None
        for word_idx in tokenized_inputs.word_ids(batch_index=i):
            if word_idx is None or word_idx == previous_word_idx:
                # Special/padding token, or a continuation sub-token.
                aligned_labels.append(-100)
            else:
                # First sub-token of a new word carries the word's label.
                aligned_labels.append(label_map[label_seq[word_idx]])
            previous_word_idx = word_idx
        labels.append(aligned_labels)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
# Pre-trained tokenizer matching the BERT checkpoint used for the model.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def _align_batch(batch):
    """Tokenize one batch and attach its sub-token-aligned label ids."""
    return tokenize_and_align_labels(batch, tokenizer)


# Raw splits as column dictionaries.
train_data = {"sentences": X_train, "labels": y_train}
dev_data = {"sentences": X_dev, "labels": y_dev}
test_data = {"sentences": X_test, "labels": y_test}

# Wrap each split in a Dataset object.
train_dataset, dev_dataset, test_dataset = (
    Dataset.from_dict(split) for split in (train_data, dev_data, test_data)
)

# Tokenize and align labels, batch by batch.
tokenized_train, tokenized_dev, tokenized_test = (
    split.map(_align_batch, batched=True)
    for split in (train_dataset, dev_dataset, test_dataset)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/15300 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/217818 [00:00<?, ? examples/s]

In [None]:
def to_tf_dataset(dataset, batch_size=16):
    """Turn a tokenized dataset into a batched ``tf.data.Dataset``.

    Each element is ``(features, labels)`` where ``features`` is a dict with
    the "input_ids" and "attention_mask" columns. "token_type_ids" is
    deliberately not included. No shuffling is applied here.

    Args:
        dataset: Object indexable by column name ("input_ids",
            "attention_mask", "labels").
        batch_size: Number of examples per batch.

    Returns:
        The batched ``tf.data.Dataset``.
    """
    features = {
        "input_ids": np.array(dataset["input_ids"]),
        "attention_mask": np.array(dataset["attention_mask"]),
    }
    targets = np.array(dataset["labels"])
    slices = tf.data.Dataset.from_tensor_slices((features, targets))
    return slices.batch(batch_size)

# One batched tf.data.Dataset per split, all with the same batch size.
batch_size = 16
tf_train_dataset, tf_dev_dataset, tf_test_dataset = (
    to_tf_dataset(split, batch_size)
    for split in (tokenized_train, tokenized_dev, tokenized_test)
)

## Model

In [None]:
# Show every physical device TensorFlow can see (useful to check that a GPU is attached).
print("Devices available:", tf.config.list_physical_devices())

Devices available: [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [None]:
def masked_sparse_categorical_crossentropy(y_true, y_pred):
    """Sparse categorical cross-entropy averaged over labelled tokens only.

    Positions whose label is -100 contribute nothing to the loss. When a batch
    contains no labelled position at all the result is 0 instead of NaN, in
    line with ``masked_accuracy``, which also guards its division.

    Args:
        y_true: Integer label ids, with -100 marking positions to ignore.
        y_pred: Per-class probabilities for every position (the loss is
            computed with the Keras default ``from_logits=False``).

    Returns:
        Scalar tensor: summed loss over labelled positions divided by their
        count.
    """
    # True where the position carries a real label.
    mask = tf.not_equal(y_true, -100)
    # -100 is not a valid class index; swap in 0 so the loss op accepts it.
    # Those positions are zeroed out again right below.
    safe_true = tf.where(mask, y_true, 0)
    loss = tf.keras.losses.sparse_categorical_crossentropy(safe_true, y_pred)
    loss = tf.where(mask, loss, 0)

    valid_tokens = tf.reduce_sum(tf.cast(mask, loss.dtype))
    # divide_no_nan returns 0 when there is no labelled token (0 / 0).
    return tf.math.divide_no_nan(tf.reduce_sum(loss), valid_tokens)


def masked_accuracy(y_true, y_pred):
    """Accuracy over the positions whose label is not -100.

    Args:
        y_true: Integer label ids, with -100 marking positions to ignore.
        y_pred: Per-class scores; the predicted class is the argmax over the
            last axis.

    Returns:
        Scalar tensor: correctly predicted labelled positions divided by the
        number of labelled positions (plus 1e-8 to avoid division by zero).
    """
    is_labelled = tf.not_equal(y_true, -100)

    true_ids = tf.cast(y_true, tf.int64)
    predicted_ids = tf.cast(tf.argmax(y_pred, axis=-1), tf.int64)

    # A position counts as correct only if it matches AND is labelled.
    correct = tf.logical_and(tf.equal(true_ids, predicted_ids), is_labelled)

    n_correct = tf.reduce_sum(tf.cast(correct, tf.float32))
    n_labelled = tf.reduce_sum(tf.cast(is_labelled, tf.float32))
    return n_correct / (n_labelled + 1e-8)


In [None]:
def create_model(bert_model, max_len = 64, num_labels=13):
    """Wrap a BERT encoder in a compiled Keras token-classification model.

    The model takes "input_ids" and "attention_mask" (both of length
    ``max_len``), applies dropout to the encoder's last hidden state and maps
    every position to ``num_labels`` softmax probabilities.

    Args:
        bert_model: Encoder exposing ``.bert.pooler`` and returning a mapping
            with "last_hidden_state" when called with ``return_dict=True``.
        max_len: Fixed sequence length of both inputs.
        num_labels: Number of output classes per token.

    Returns:
        The compiled ``tf.keras`` model (Adam, lr 1e-5, gradient value
        clipping at 1.0, masked loss and masked accuracy).
    """
    # The pooled output is not used by this model, so its layer is frozen.
    bert_model.bert.pooler.trainable = False

    ids_in = tf.keras.Input(shape=(max_len,), dtype='int32', name='input_ids')
    mask_in = tf.keras.Input(shape=(max_len,), dtype='int32', name='attention_mask')

    encoded = bert_model(ids_in, attention_mask=mask_in, return_dict=True)
    hidden = tf.keras.layers.Dropout(0.3)(encoded["last_hidden_state"])
    token_probs = tf.keras.layers.Dense(num_labels, activation='softmax')(hidden)

    model = tf.keras.models.Model(inputs=[ids_in, mask_in], outputs=[token_probs])
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5, clipvalue=1.0)
    model.compile(
        optimizer=optimizer,
        loss=masked_sparse_categorical_crossentropy,
        metrics=[masked_accuracy],
    )
    return model

# Create the encoder and the Keras model inside the MirroredStrategy scope.
strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
    # NOTE(review): num_labels is passed to the bare encoder here although the
    # classification layer is built in create_model — confirm it is needed.
    bert_model = TFBertModel.from_pretrained('bert-base-uncased', num_labels=13)

    model = create_model(bert_model, num_labels=13)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [None]:
# Fine-tune for 5 epochs, validating on the dev split.
# NOTE(review): to_tf_dataset only batches the data, so the training examples
# are not shuffled — confirm this is intended.
history_ner = model.fit(tf_train_dataset, validation_data=tf_dev_dataset, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
def ne_recognition(model, sentence):
  """Predict one NER tag per whitespace-separated word of ``sentence``.

  The sentence is split on whitespace, encoded with the notebook-level
  ``tokenizer`` (padded/truncated to 64 sub-tokens) and passed to ``model``.
  For each word, the prediction of its first sub-token is used.

  Args:
    model: Object whose ``predict`` accepts a dict of arrays and returns
      per-token class scores.
    sentence: Raw input text.

  Returns:
    List of tag strings, one per word that received at least one sub-token
    within the 64-token window, in sentence order.
  """
  # Class id -> tag; list position is the class id.
  id_to_label = dict(enumerate([
    'B-CORP', 'B-CW', 'B-GRP', 'B-LOC', 'B-PER', 'B-PROD',
    'I-CORP', 'I-CW', 'I-GRP', 'I-LOC', 'I-PER', 'I-PROD', 'O'
  ]))

  encoding = tokenizer(
    sentence.split(),
    truncation=True,
    padding="max_length",
    is_split_into_words=True,
    max_length=64,
    return_tensors="tf"
  )

  # The model only takes input_ids and attention_mask.
  if "token_type_ids" in encoding:
    del encoding["token_type_ids"]
  model_inputs = {name: np.array(tensor) for name, tensor in encoding.items()}

  scores = model.predict(model_inputs)
  class_ids = tf.argmax(scores, axis=-1).numpy()[0]

  predicted_labels = []
  last_word = None
  for class_id, word_id in zip(class_ids, encoding.word_ids(batch_index=0)):
    # Skip special/padding tokens and continuation sub-tokens of a word.
    if word_id is None or word_id == last_word:
      continue
    predicted_labels.append(id_to_label[class_id])
    last_word = word_id

  return predicted_labels

def print_labels(sentence, predicted_labels):
  """Print the sentence, its words, the labels, and one "(word = label)" line per pair.

  Words come from splitting ``sentence`` on whitespace; pairing stops at the
  shorter of the two sequences.
  """
  words = sentence.split()
  print("Original Sentence:", sentence)
  print("Words:", words)
  print("Predicted Labels:", predicted_labels)
  for word, label in zip(words, predicted_labels):
    print(f'({word} = {label})')

In [None]:
# Quick check of the freshly trained model on a hand-written sentence.
sentence = "I love steve jobs, but when he created the iphone 15 was the worst phone ever"

predicted_labels = ne_recognition(model, sentence)
print_labels(sentence, predicted_labels)

Original Sentence: I love steve jobs, but when he created the iphone 15 was the worst phone ever
Words: ['I', 'love', 'steve', 'jobs,', 'but', 'when', 'he', 'created', 'the', 'iphone', '15', 'was', 'the', 'worst', 'phone', 'ever']
Predicted Labels: ['O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'B-PROD', 'I-PROD', 'O', 'O', 'O', 'O', 'O']
(I = O)
(love = O)
(steve = B-PER)
(jobs, = I-PER)
(but = O)
(when = O)
(he = O)
(created = O)
(the = O)
(iphone = B-PROD)
(15 = I-PROD)
(was = O)
(the = O)
(worst = O)
(phone = O)
(ever = O)


In [None]:
# Save the model weights only (not the architecture); loading them back
# requires rebuilding the model with create_model first.
model_weights_path = "./NER_BERT_weights_09_12_2024/saved_weights"
model.save_weights(model_weights_path)

# Save the tokenizer files next to the weights so both can be restored together.
tokenizer_save_path = "./tokenizer_NER_09_12_2024"
tokenizer.save_pretrained(tokenizer_save_path)

('./tokenizer_NER_09_12_2024/tokenizer_config.json',
 './tokenizer_NER_09_12_2024/special_tokens_map.json',
 './tokenizer_NER_09_12_2024/vocab.txt',
 './tokenizer_NER_09_12_2024/added_tokens.json',
 './tokenizer_NER_09_12_2024/tokenizer.json')

In [None]:
# Mount Google Drive (Colab only).
drive.mount('/content/drive')

# Copy the saved weights and tokenizer directories into MyDrive/NER_model.
!mkdir -p /content/drive/MyDrive/NER_model
!cp -r ./NER_BERT_weights_09_12_2024 /content/drive/MyDrive/NER_model
!cp -r ./tokenizer_NER_09_12_2024 /content/drive/MyDrive/NER_model

Mounted at /content/drive


# Sentiment model

In [None]:
def entities_positions(sentence, predicted_labels):
  """Group tagged words into entity strings and record where each one starts.

  Words come from splitting ``sentence`` on whitespace and are paired with
  ``predicted_labels`` in order. A label starting with 'B' opens a new
  entity, a label starting with 'I' extends the open entity, and any other
  label closes it.

  Two irregular tag sequences are handled explicitly:
    * an 'I' label with no open entity starts a new entity instead of
      failing on a missing prefix;
    * a 'B' label directly after another entity closes and keeps that
      entity instead of overwriting it.

  Args:
    sentence: Raw text that was tagged.
    predicted_labels: One tag string per word of ``sentence``.

  Returns:
    Tuple ``(entities, positions)``: the entity strings (words joined by a
    single space) and the word index at which each entity begins.
  """
  entities = []
  positions = []
  current_words = []  # words of the entity currently being assembled
  start = None        # word index where that entity began

  for i, (word, label) in enumerate(zip(sentence.split(), predicted_labels)):
      if label.startswith('I') and current_words:
          current_words.append(word)
          continue

      # Any other case ends the entity that was open, if there was one.
      if current_words:
          entities.append(' '.join(current_words))
          positions.append(start)
          current_words = []

      # 'B' always opens an entity; an orphan 'I' is treated the same way.
      if label.startswith('B') or label.startswith('I'):
          current_words = [word]
          start = i

  # The sentence may end while an entity is still open.
  if current_words:
      entities.append(' '.join(current_words))
      positions.append(start)

  return entities, positions


def get_context(entities, positions, sentence=None, window=5):
  """Build one "Entity: ... Context: ..." prompt per entity.

  The context is the slice of sentence words from ``position - window``
  (clamped at 0) up to, but not including, ``position + window`` (clamped at
  the sentence length).

  Args:
    entities: Entity strings.
    positions: Start word index of each entity, parallel to ``entities``.
    sentence: Text the positions refer to. When omitted, the notebook-level
      ``sentence`` variable is used, which keeps existing two-argument calls
      working.
    window: Number of words taken on each side of the start position.

  Returns:
    List of prompt strings, one per (entity, position) pair.

  Raises:
    NameError: If ``sentence`` is omitted and no global ``sentence`` exists.
  """
  if sentence is None:
      # Backward-compatible fallback to the global the original code read.
      try:
          sentence = globals()["sentence"]
      except KeyError:
          raise NameError("name 'sentence' is not defined") from None

  # Split once instead of once per entity.
  words = sentence.split()

  sentiment_inputs = []
  for entity, position in zip(entities, positions):
      lo = max(0, position - window)
      hi = min(len(words), position + window)
      context = ' '.join(words[lo:hi])
      sentiment_inputs.append(f"Entity: {entity}. Context: {context}")

  return sentiment_inputs


def predict_sentiment(sentiment_inputs, entities=None, sentiment_analyzer=None):
  """Classify the sentiment of each prompt and pair it with its entity.

  Args:
    sentiment_inputs: Prompt strings, one per entity.
    entities: Entity strings parallel to ``sentiment_inputs``. When omitted,
      the notebook-level ``entities`` variable is used, which keeps existing
      one-argument calls working.
    sentiment_analyzer: Optional callable taking a string and returning a
      list whose first item is the sentiment result. When omitted, a
      ``transformers`` sentiment pipeline is created on first use and reused
      by later calls.

  Returns:
    List of ``{"entity": ..., "sentiment": ...}`` dicts; pairing stops at the
    shorter of ``entities`` and ``sentiment_inputs``.

  Raises:
    NameError: If ``entities`` is omitted and no global ``entities`` exists.
  """
  if entities is None:
      # Backward-compatible fallback to the global the original code read.
      try:
          entities = globals()["entities"]
      except KeyError:
          raise NameError("name 'entities' is not defined") from None

  sentiment_inputs = list(sentiment_inputs)
  if not sentiment_inputs:
      # Nothing to classify: avoid loading the model for no reason.
      return []

  if sentiment_analyzer is None:
      sentiment_analyzer = getattr(predict_sentiment, "_default_analyzer", None)
      if sentiment_analyzer is None:
          # Name the checkpoint explicitly (it is the one the pipeline
          # reported as its default) and keep the pipeline around so it is
          # not rebuilt on every call.
          sentiment_analyzer = pipeline(
              "sentiment-analysis",
              model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
          )
          predict_sentiment._default_analyzer = sentiment_analyzer

  # Perform sentiment analysis
  entity_sentiments = [sentiment_analyzer(text)[0] for text in sentiment_inputs]

  # Combine entities with their sentiment
  return [
      {"entity": entity, "sentiment": sentiment}
      for entity, sentiment in zip(entities, entity_sentiments)
  ]

## Sentiment tests

In [None]:
# Rebuild the NER model and restore the weights and tokenizer that were
# copied to Google Drive. Drive must already be mounted at /content/drive.
path = '/content/drive/MyDrive'

strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
    # Same construction as at training time, so the saved weights line up
    # with the layers created here.
    bert_model = TFBertModel.from_pretrained('bert-base-uncased', num_labels=13)

    model = create_model(bert_model, num_labels=13)

# Load the saved weights
model_weights_path = f"{path}/NER_model/NER_BERT_weights_09_12_2024/saved_weights"
model.load_weights(model_weights_path)

# Load the tokenizer
tokenizer_save_path = f"{path}/NER_model/tokenizer_NER_09_12_2024"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_path)

print("Model and tokenizer successfully loaded!")

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Model and tokenizer successfully loaded!


In [None]:
# Mixed-sentiment example: tag the sentence with the reloaded model.
sentence = "I love steve jobs, but when he created the iphone 15, it was the worst phone ever"

predicted_labels = ne_recognition(model, sentence)
print_labels(sentence, predicted_labels)

Original Sentence: I love steve jobs, but when he created the iphone 15, it was the worst phone ever
Words: ['I', 'love', 'steve', 'jobs,', 'but', 'when', 'he', 'created', 'the', 'iphone', '15,', 'it', 'was', 'the', 'worst', 'phone', 'ever']
Predicted Labels: ['O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'B-PROD', 'I-PROD', 'O', 'O', 'O', 'O', 'O', 'O']
(I = O)
(love = O)
(steve = B-PER)
(jobs, = I-PER)
(but = O)
(when = O)
(he = O)
(created = O)
(the = O)
(iphone = B-PROD)
(15, = I-PROD)
(it = O)
(was = O)
(the = O)
(worst = O)
(phone = O)
(ever = O)


In [None]:
# Entity extraction -> context window per entity -> sentiment per entity.
# Uses the `sentence` and `predicted_labels` set in the NER cell just above.
entities, positions = entities_positions(sentence, predicted_labels)
sentiment_inputs = get_context(entities, positions)
entity_sentiments_combined = predict_sentiment(sentiment_inputs)

entity_sentiments_combined

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'entity': 'steve jobs,',
  'sentiment': {'label': 'POSITIVE', 'score': 0.9965392351150513}},
 {'entity': 'iphone 15,',
  'sentiment': {'label': 'NEGATIVE', 'score': 0.969535231590271}}]

In [None]:
# Purely positive example containing a person and a company.
sentence = "Elon Musk is lovely and I enjoy Tesla company very much"

predicted_labels = ne_recognition(model, sentence)
print_labels(sentence, predicted_labels)

Original Sentence: Elon Musk is lovely and I enjoy Tesla company very much
Words: ['Elon', 'Musk', 'is', 'lovely', 'and', 'I', 'enjoy', 'Tesla', 'company', 'very', 'much']
Predicted Labels: ['B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'B-CORP', 'I-CORP', 'O', 'O']
(Elon = B-PER)
(Musk = I-PER)
(is = O)
(lovely = O)
(and = O)
(I = O)
(enjoy = O)
(Tesla = B-CORP)
(company = I-CORP)
(very = O)
(much = O)


In [None]:
# Per-entity sentiment for the sentence tagged in the cell just above.
entities, positions = entities_positions(sentence, predicted_labels)
sentiment_inputs = get_context(entities, positions)
entity_sentiments_combined = predict_sentiment(sentiment_inputs)

entity_sentiments_combined

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'entity': 'Elon Musk',
  'sentiment': {'label': 'POSITIVE', 'score': 0.9998142123222351}},
 {'entity': 'Tesla company',
  'sentiment': {'label': 'POSITIVE', 'score': 0.9998756647109985}}]

In [None]:
# Example with a single one-word entity.
sentence = "I hate it when Carrefour discounts all items"

predicted_labels = ne_recognition(model, sentence)

print_labels(sentence, predicted_labels)

Original Sentence: I hate it when Carrefour discounts all items
Words: ['I', 'hate', 'it', 'when', 'Carrefour', 'discounts', 'all', 'items']
Predicted Labels: ['O', 'O', 'O', 'O', 'B-CORP', 'O', 'O', 'O']
(I = O)
(hate = O)
(it = O)
(when = O)
(Carrefour = B-CORP)
(discounts = O)
(all = O)
(items = O)


In [None]:
# Per-entity sentiment for the sentence tagged in the cell just above.
entities, positions = entities_positions(sentence, predicted_labels)
sentiment_inputs = get_context(entities, positions)
entity_sentiments_combined = predict_sentiment(sentiment_inputs)

entity_sentiments_combined

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'entity': 'Carrefour',
  'sentiment': {'label': 'NEGATIVE', 'score': 0.998363196849823}}]

In [None]:
# Longer example with two entities that appear in clauses of opposite sentiment.
sentence = "I absolutely loved the main character, Buzz Astral, in Toy Story, but the ending of the movie was terribly disappointing"

predicted_labels = ne_recognition(model, sentence)

print_labels(sentence, predicted_labels)

Original Sentence: I absolutely loved the main character, Buzz Astral, in Toy Story, but the ending of the movie was terribly disappointing
Words: ['I', 'absolutely', 'loved', 'the', 'main', 'character,', 'Buzz', 'Astral,', 'in', 'Toy', 'Story,', 'but', 'the', 'ending', 'of', 'the', 'movie', 'was', 'terribly', 'disappointing']
Predicted Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'O', 'B-CW', 'I-CW', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
(I = O)
(absolutely = O)
(loved = O)
(the = O)
(main = O)
(character, = O)
(Buzz = B-PER)
(Astral, = I-PER)
(in = O)
(Toy = B-CW)
(Story, = I-CW)
(but = O)
(the = O)
(ending = O)
(of = O)
(the = O)
(movie = O)
(was = O)
(terribly = O)
(disappointing = O)


In [None]:
# Per-entity sentiment for the sentence tagged in the cell just above.
entities, positions = entities_positions(sentence, predicted_labels)
sentiment_inputs = get_context(entities, positions)
entity_sentiments_combined = predict_sentiment(sentiment_inputs)

entity_sentiments_combined

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'entity': 'Buzz Astral,',
  'sentiment': {'label': 'POSITIVE', 'score': 0.9997194409370422}},
 {'entity': 'Toy Story,',
  'sentiment': {'label': 'NEGATIVE', 'score': 0.9420816898345947}}]

In [None]:
# Example where the entity carries a possessive suffix ("Tesla's"); words are
# split on whitespace only, so the suffix stays attached to the word.
sentence = "Tesla's recent quality control issues have left many customers disappointed and questioning the company's commitment to excellence."

predicted_labels = ne_recognition(model, sentence)

print_labels(sentence, predicted_labels)

Original Sentence: Tesla's recent quality control issues have left many customers disappointed and questioning the company's commitment to excellence.
Words: ["Tesla's", 'recent', 'quality', 'control', 'issues', 'have', 'left', 'many', 'customers', 'disappointed', 'and', 'questioning', 'the', "company's", 'commitment', 'to', 'excellence.']
Predicted Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
(Tesla's = O)
(recent = O)
(quality = O)
(control = O)
(issues = O)
(have = O)
(left = O)
(many = O)
(customers = O)
(disappointed = O)
(and = O)
(questioning = O)
(the = O)
(company's = O)
(commitment = O)
(to = O)
(excellence. = O)


In [None]:
# Per-entity sentiment for the sentence tagged in the cell just above.
# If no word was tagged as an entity, the result is an empty list.
entities, positions = entities_positions(sentence, predicted_labels)
sentiment_inputs = get_context(entities, positions)
entity_sentiments_combined = predict_sentiment(sentiment_inputs)

entity_sentiments_combined

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[]

In [None]:
# Example mentioning a company and one of its products in the same clause.
sentence = "While Google's new Pixel phone boasts impressive camera features, many users have expressed frustration with its high price."

predicted_labels = ne_recognition(model, sentence)

print_labels(sentence, predicted_labels)

Original Sentence: While Google's new Pixel phone boasts impressive camera features, many users have expressed frustration with its high price.
Words: ['While', "Google's", 'new', 'Pixel', 'phone', 'boasts', 'impressive', 'camera', 'features,', 'many', 'users', 'have', 'expressed', 'frustration', 'with', 'its', 'high', 'price.']
Predicted Labels: ['O', 'B-CORP', 'O', 'I-PROD', 'I-PROD', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
(While = O)
(Google's = B-CORP)
(new = O)
(Pixel = I-PROD)
(phone = I-PROD)
(boasts = O)
(impressive = O)
(camera = O)
(features, = O)
(many = O)
(users = O)
(have = O)
(expressed = O)
(frustration = O)
(with = O)
(its = O)
(high = O)
(price. = O)


In [None]:
# sentence = "the great steve jobs died of ligma in 1997, after releasing the iphone 15, which suffered from many problems"