### https://huggingface.co/transformers has pipelines, pretrained models, tokenizers etc for several tasks. 

In [1]:
import torch
from transformers import pipeline

#is GPU available?
gpu = torch.cuda.is_available()

#defining device where to to the computation
#device = torch.device(0) if gpu else torch.device('cpu')
device="cpu"

### Sentiment analysis: recall we used it in our introductory lecture

In [2]:
classifier = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english", device=device)

In [3]:
result = classifier("I hate washing dishes")[0]
print("Sentiment: {} Score: {}".format(result['label'],result['score']))

result = classifier("I love visiting Paris")[0]
print("Sentiment: {} Score: {}".format(result['label'],result['score']))

Sentiment: NEGATIVE Score: 0.9992689490318298
Sentiment: POSITIVE Score: 0.999553382396698


### You can also fine tune any pretrained model. For instance, let's fine tune a model for sentiment analysis

### Load a dataset for fine-tuning. You can get imdbs.csv from
### https://drive.google.com/uc?id=11_M4ootuT7I1G0RlihcC0cA3Elqotlc-

In [4]:
import datasets
from datasets import load_dataset

dataset = load_dataset('csv', data_files='./imdbs.csv', split='train')

In [5]:
# split dataset into train and test

dataset   = dataset.train_test_split(test_size=0.1)
train_set = dataset['train']
test_set  = dataset['test']

### Load the tokenizer and preprocess the training and test sets with the tokenizer -- it already converts tokens into ids and sets attention masks

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# preprocess the dataset 

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

train_set = train_set.map(tokenize_function, batched=True)
test_set  = test_set.map(tokenize_function, batched=True) 

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

### Load the model for sequence classification

In [7]:
from transformers import AutoModelForSequenceClassification

checkpoint = "bert-base-cased"

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Since we want to report the accuracy of the model, we can add the following function.

In [8]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

  metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


### Now set training parameters and arguments, and train the model

In [9]:
from transformers import TrainingArguments, Trainer

In [10]:
# set training parameters and arguments

batch_size = 8
epochs     = 20
warmup_steps = 100
weight_decay = 0.01

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    eval_strategy="epoch",
    logging_dir='./logs',    
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [11]:
# define Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=test_set,
    compute_metrics=compute_metrics
)

In [12]:
# Train

trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.704127,0.4
2,No log,0.671948,0.6
3,No log,0.648823,0.7
4,No log,0.630535,0.7
5,No log,0.611535,0.7
6,No log,0.55816,0.7
7,No log,0.487023,0.7
8,No log,0.575655,0.7
9,No log,0.666871,0.8
10,No log,1.22814,0.7




TrainOutput(global_step=120, training_loss=0.23227124214172362, metrics={'train_runtime': 63.1974, 'train_samples_per_second': 28.482, 'train_steps_per_second': 1.899, 'total_flos': 473599899648000.0, 'train_loss': 0.23227124214172362, 'epoch': 20.0})

In [13]:
# evaluate 

trainer.evaluate()



{'eval_loss': 0.005834068171679974,
 'eval_accuracy': 1.0,
 'eval_runtime': 0.1345,
 'eval_samples_per_second': 74.332,
 'eval_steps_per_second': 7.433,
 'epoch': 20.0}

In [14]:
# test

inputs  = tokenizer('High tech companies are growing up', return_tensors="pt").to(device=0)
labels  = torch.tensor([1]).unsqueeze(0).to(device=0)
outputs = model(**inputs, labels=labels)
loss    = outputs.loss
logits  = outputs.logits
answer  = torch.argmax(logits)
if (answer == 0):
    print("Sentiment: NEGATIVE")
else:
    print("Sentiment: POSITIVE")

Sentiment: POSITIVE


### Extractive Question Answering: the task of extracting an answer from a text given a question

In [15]:
question_answerer = pipeline("question-answering", model="distilbert/distilbert-base-cased-distilled-squad", device=device)

In [16]:
context = "The immune system is a system of many biological structures and processes \
within an organism that protects against diseases. To function properly the immune system \
must detect a wide variety of agents, called pathogens."

result = question_answerer(question="What are pathogens?", context=context)
print("Answer: {}".format(result['answer']))
print("Score: {}".format(round(result['score'], 4)))
print("Start: {} End: {}".format(result['start'],result['end']))

result = question_answerer(question="How does the immune system work?", context=context)
print("Answer: {}".format(result['answer']))
print("Score: {}".format(round(result['score'], 4)))
print("Start: {} End: {}".format(result['start'],result['end']))

result = question_answerer(question="What is the immune system?", context=context)
print("Answer: {}".format(result['answer']))
print("Score: {}".format(round(result['score'], 4)))
print("Start: {} End: {}".format(result['start'],result['end']))

Answer: a wide variety of agents
Score: 0.4355
Start: 176 End: 200
Answer: must detect a wide variety of agents
Score: 0.058
Start: 164 End: 200
Answer: a system of many biological structures and processes within an organism that protects against diseases
Score: 0.4359
Start: 21 End: 123


### You may also use pretrained models already fine-tuned in some dataset (e.g., SQUAD -- Stanford Question-Answering Dataset).

In [17]:
from transformers import BertForQuestionAnswering, BertTokenizer
import torch

In [18]:
# load the fine-tuned model

model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
# load its fine-tuned tokenizer

tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

In [20]:
# add BERT tags to process question and context

question = "[CLS]" + "What are pathogens?" + "[SEP]"
context  = context + "[SEP]"

In [21]:
# get their tokens, combine and convert them into input_ids
question_tokens = tokenizer.tokenize(question)
context_tokens  = tokenizer.tokenize(context)
all_tokens      = question_tokens + context_tokens
input_ids       = tokenizer.convert_tokens_to_ids(all_tokens)

In [22]:
# define segment_ids with zeroes for question tokens and ones for context tokens

segment_ids = [0] * len(question_tokens)
segment_ids = segment_ids + [1] * len(context_tokens)

In [23]:
# convert input and segment ids to tensors and feed them into the model 
# to obtain the start and end scores

input_ids              = torch.tensor([input_ids])
segment_ids            = torch.tensor([segment_ids])

In [24]:
result = model(input_ids, token_type_ids = segment_ids)

In [25]:
#tokens with highest start and end scores
answer_start = torch.argmax(result.start_logits)
answer_end   = torch.argmax(result.end_logits)
if answer_end >= answer_start:
    answer = " ".join(all_tokens[answer_start:answer_end+1])
    print("\nQuestion:{}".format(question[5:-5]))
    print("\nAnswer: {}.".format(answer))
else:
    print("I could not find an answer to your question.")


Question:What are pathogens?

Answer: agents.


### Text Generation

In [35]:
text_generator = pipeline("text-generation", model="microsoft/Phi-3-mini-4k-instruct", device=device)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [36]:
text = text_generator("It is a strong idea", truncation=True, max_length=35, do_sample=False)
print(text)

[{'generated_text': 'It is a strong idea that the government should not interfere with the free market.\n\n# Answer\nThe statement "It is a strong idea that the government should'}]


### Named Entity Recognition

In [29]:
ner_pipe = pipeline("ner", model="dslim/bert-base-NER", device=device)

config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [37]:
text = "IBM introduces Eagle in USA -- the first processor to surpass 100 qubits."
result = ner_pipe(text)
for entity in result:
    print(entity)

{'entity': 'B-ORG', 'score': 0.9988896, 'index': 1, 'word': 'IBM', 'start': 0, 'end': 3}
{'entity': 'B-MISC', 'score': 0.81814283, 'index': 3, 'word': 'Eagle', 'start': 15, 'end': 20}
{'entity': 'B-LOC', 'score': 0.99930835, 'index': 5, 'word': 'USA', 'start': 24, 'end': 27}


### Text Summarization

In [38]:
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [39]:
text = "IBM has unveiled an advanced quantum processor that is part of an effort \
to build super-fast computers. These machines could revolutionise computing, harnessing \
the strange world of quantum physics to solve problems beyond reach for even the most \
advanced classical ones. But the hurdles in building practical, large-scale versions \
have kept quantum computers confined to the lab. The new chip has 127 qubits, \
twice as many as the previous IBM processor. Qubits (quantum bits) are the most basic \
units of information in a quantum computer. The company called its new Eagle processor \
a key milestone on the path towards practical quantum computation."

In [40]:
result = summarizer(text, max_length=50, min_length=30, do_sample=False)

In [41]:
print(result[0]['summary_text'])

IBM has unveiled an advanced quantum processor that is part of an effort to build super-fast computers. These machines could revolutionise computing, harnessing the strange world of quantum physics to solve problems beyond reach for even the most advanced classical


### Translation: For example, from English to French

In [44]:
translator = pipeline("translation_en_to_fr", model = "Faizyhugging/English_to_French", device=device)

config.json:   0%|          | 0.00/1.51k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.7k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

In [45]:
result = translator("The book is on the table.")
print(result[0]['translation_text'])

Le livre est sur la table.
