## Sentiment Analysis

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline
import torch

In [None]:
task_ = "sentiment-analysis"


def get_sentiment(text, model, tokenizer, task=task_):
    """Run a Hugging Face pipeline over ``text`` and return its raw output.

    Args:
        text: Input handed straight to the pipeline callable.
        model: Model object forwarded to ``pipeline``.
        tokenizer: Tokenizer object forwarded to ``pipeline``.
        task: Pipeline task name; defaults to the value ``task_`` held when
            this function was defined.

    Returns:
        Whatever the pipeline callable returns for ``text``.
    """
    # A new pipeline is built on every call from the supplied model/tokenizer.
    classifier = pipeline(task=task, model=model, tokenizer=tokenizer)
    return classifier(text)

In [None]:
# Checkpoint identifier passed to both from_pretrained calls below.
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
# Load the tokenizer and the sequence-classification model for that checkpoint.
# These module-level names are what the sentiment call further down passes in.
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)

In [None]:
# Sample inputs for the sentiment demo: four English sentences and one Persian
# sentence, passed to get_sentiment as a single list.
text = ['Your tone in written communication can be fair formal.',
        'You often fail to follow up with customers as promised.',
        'Your tone in written communication is too offensive.',
        'I hate your production',
        'من این فیلم رو دوست نداشتم']

In [None]:
# Classify the sample sentences with the model and tokenizer loaded above.
# As the last expression in the cell, the return value is shown as the cell output.
get_sentiment(text, model, tokenizer)

[{'label': '4 stars', 'score': 0.37983793020248413},
 {'label': '4 stars', 'score': 0.3468973636627197},
 {'label': '2 stars', 'score': 0.47493189573287964},
 {'label': '1 star', 'score': 0.8062731027603149},
 {'label': '1 star', 'score': 0.42411860823631287}]

## Named Entity Recognition

In [None]:
from transformers import BertForTokenClassification
# Pipeline task identifier for this section. This rebinds the module-level
# `task_` name, which the sentiment section set to "sentiment-analysis".
task_ = 'ner'

In [None]:
def get_ner_results(text, model, tokenizer, task='ner'):
    """Run a token-classification pipeline on ``text`` and attach character spans.

    The ``start``/``end`` of every returned entity is computed here by searching
    for the entity's word in ``text`` from left to right, each search beginning
    where the previous match ended.

    Args:
        text: The string to analyse; also the string the words are located in.
        model: Model object forwarded to ``pipeline``.
        tokenizer: Tokenizer object forwarded to ``pipeline``.
        task: Pipeline task name. Defaults to the literal ``'ner'`` so the
            default does not depend on which notebook cell last assigned the
            shared ``task_`` global.

    Returns:
        A list of dicts, one per entity whose word was found in ``text``. Each
        is a copy of the pipeline's result with ``start`` and ``end`` set to the
        located character span. Entities whose word cannot be found (or is empty
        once the subword prefix is removed) are omitted.
    """
    nlp = pipeline(task, model=model, tokenizer=tokenizer)
    ner_results = nlp(text)

    subword_prefix = "##"
    offset = 0  # Search position in `text`; moves past each located word
    updated_results = []
    for result in ner_results:
        word = result['word']
        # Remove exactly one "##" prefix. str.lstrip("##") treats its argument
        # as a character set and would strip every leading '#', turning a
        # literal "#" token into an empty string.
        if word.startswith(subword_prefix):
            word = word[len(subword_prefix):]
        if not word:
            # Nothing to locate; an empty match would yield a zero-width span.
            continue

        start = text.find(word, offset)
        if start == -1:
            # Word does not occur at or after `offset`; skip this entity.
            continue
        end = start + len(word)

        # Copy so the dicts returned by the pipeline are left untouched.
        updated_results.append({**result, 'start': start, 'end': end})
        offset = end
    return updated_results

def highlight_entities(text, ner_results):
    """Return ``text`` with every entity span wrapped in square brackets.

    Args:
        text: The original string the spans refer to.
        ner_results: Iterable of dicts carrying integer ``start`` and ``end``
            character positions into ``text``.

    Returns:
        A new string with "[" inserted at each ``start`` and "]" at each ``end``.
    """
    def _insert(string, position, marker):
        # Splice `marker` into `string` immediately before index `position`.
        return string[:position] + marker + string[position:]

    # Work from the right-most span to the left so that earlier insertions
    # never shift the positions of spans still waiting to be processed.
    spans = list(ner_results)
    spans.sort(key=lambda span: span['start'], reverse=True)

    marked = text
    for span in spans:
        span_start = span['start']
        span_end = span['end']
        # Close first, then open: the opening bracket sits at or before the
        # closing one, so adding "]" first leaves `span_start` valid.
        marked = _insert(marked, span_end, "]")
        marked = _insert(marked, span_start, "[")
    return marked

In [None]:
# Checkpoint identifier for the NER section, passed to both from_pretrained calls below.
model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
# Rebinds `tokenizer` and `model`: from here on they refer to the
# token-classification checkpoint, not the sentiment one loaded earlier.
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForTokenClassification.from_pretrained(model_name)


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Sample sentence for the NER demo (rebinds `text`, previously the sentiment sample list).
text = "Barack Obama was born in Hawaii, 1955, worked at Google. He served as the president of the United States."
# Get the entities together with their character spans in `text`.
ner_result = get_ner_results(text, model, tokenizer)
# Last expression of the cell: the bracket-annotated sentence is shown as the cell output.
highlight_entities(text, ner_result)

'[Barack] [Obama] was born in [Hawaii], 1955, worked at [Google]. He served as the president of the [United] [States].'

## Text Summarization

In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration

In [None]:
def get_summary(text, model, tokenizer, max_len, min_len):
    """Summarize ``text`` with a 'summarization' pipeline and return the summary string.

    Args:
        text: Input handed to the summarization pipeline.
        model: Model object forwarded to ``pipeline``.
        tokenizer: Tokenizer object forwarded to ``pipeline``.
        max_len: Passed to the pipeline call as ``max_length``.
        min_len: Passed to the pipeline call as ``min_length``.

    Returns:
        The ``'summary_text'`` value of the first item the pipeline returns.
    """
    summarize = pipeline(task='summarization', model=model, tokenizer=tokenizer)
    length_limits = {"max_length": max_len, "min_length": min_len}
    outputs = summarize(text, **length_limits)
    first_output = outputs[0]
    return first_output['summary_text']



In [None]:
# Checkpoint identifier for the summarization section, passed to both from_pretrained calls below.
model_name = "facebook/bart-large-cnn"
# Rebinds `tokenizer` and `model` again: they now refer to the BART objects,
# replacing the NER tokenizer/model from the previous section.
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

In [None]:
# Passage to summarize (rebinds `text`, previously the NER sample sentence).
text = "The Gutenberg Bible, also known as the 42-line Bible, the Mazarin Bible or the B42, was the earliest major book printed in Europe using mass-produced metal movable type. It marked the start of the \"Gutenberg Revolution\" and the age of printed books in the West. The book is valued and revered for its high aesthetic and artistic qualities[1] and its historical significance. The Gutenberg Bible is an edition of the Latin Vulgate printed in the 1450s by Johannes Gutenberg in Mainz, in present-day Germany. Forty-nine copies (or substantial portions of copies) have survived. They are thought to be among the world's most valuable books, although no complete copy has been sold since 1978.[2][3] In March 1455, the future Pope Pius II wrote that he had seen pages from the Gutenberg Bible displayed in Frankfurt to promote the edition, and that either 158 or 180 copies had been printed. The 36-line Bible, said to be the second printed Bible, is also sometimes referred to as a Gutenberg Bible, but may be the work of another printer."
# get_summary forwards max_len/min_len to the pipeline as max_length/min_length.
# NOTE(review): with max_len=10 the recorded summary stops mid-sentence
# ("... is an edition of"); presumably a larger max_len is wanted — confirm.
summary = get_summary(text, model, tokenizer, max_len=10, min_len=2)
print(summary)
# Character counts of the input passage and of the summary, for comparison.
print(len(text))
print(len(summary))

The Gutenberg Bible is an edition of
1035
36
