In [32]:
import numpy as np
from transformers import pipeline, set_seed

# Sentiment classification

In [2]:
# Pin the checkpoint and revision explicitly instead of relying on the task
# default: the library warns that an unpinned pipeline is not recommended, and
# the default may change between releases. These are the values the library
# reported falling back to when no model was supplied.
sent_cls = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    revision="af0f99b",
)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [22]:
# Five variations on one sentence, used to probe how small wording changes
# move the predicted sentiment.
(
    sent_text1,
    sent_text2,
    sent_text3,
    sent_text4,
    sent_text5,
) = (
    "I've been waiting for a HuggingFace course my whole life.",
    "I've been waiting for a HuggingFace course for several years.",
    "I've been waiting for a HuggingFace course for several years, and I'm glad it's finally here.",
    "I've been waiting for a HuggingFace course, and now there is one.",
    "I've taken a HuggingFace course.",
)

## One-at-a-time

In [23]:
# Classify each sentence with its own pipeline call (one input per call),
# then print one result list per line.
res1, res2, res3, res4, res5 = (
    sent_cls(sentence)
    for sentence in (sent_text1, sent_text2, sent_text3, sent_text4, sent_text5)
)
print(res1, res2, res3, res4, res5, sep="\n")

[{'label': 'POSITIVE', 'score': 0.9598049521446228}]
[{'label': 'NEGATIVE', 'score': 0.989154040813446}]
[{'label': 'POSITIVE', 'score': 0.9995457530021667}]
[{'label': 'NEGATIVE', 'score': 0.8055378198623657}]
[{'label': 'POSITIVE', 'score': 0.7428491711616516}]


## As a list

In [24]:
# Same five sentences, passed to the pipeline as a single list in one call.
res_ls = sent_cls(
    [
        sent_text1,
        sent_text2,
        sent_text3,
        sent_text4,
        sent_text5,
    ]
)

In [25]:
# One prediction dict per line.
print("\n".join(map(str, res_ls)))

{'label': 'POSITIVE', 'score': 0.9598049521446228}
{'label': 'NEGATIVE', 'score': 0.989154040813446}
{'label': 'POSITIVE', 'score': 0.9995457530021667}
{'label': 'NEGATIVE', 'score': 0.8055378198623657}
{'label': 'POSITIVE', 'score': 0.7428491711616516}


# Zero-shot classification

In [44]:
# Pin the checkpoint and revision explicitly instead of relying on the task
# default, which the library warns against and which may change between
# releases. These are the values the library reported falling back to when no
# model was supplied.
zeroshot_cls = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    revision="c626438",
)

No model was supplied, defaulted to facebook/bart-large-mnli and revision c626438 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [100]:
# Three passages to classify, plus the candidate labels offered to the
# zero-shot classifier.
cls_text1 = "This course covers content on the Transformers in HuggingFace."
cls_text2 = (
    "The BA 3.21 course covers management of mergers and acquisitions. "
    "It includes detailed analysis of recent business events, for example, "
    "the merger of Arizona Tea with Pepsi Cola in 2015, "
    "as reported on by the Harvard Business Review and Bloomberg."
)
cls_text3 = (
    "The song, I Love to Dance, was released by Run Around, "
    "a popular South African electronic duo."
)
label_options = [
    "education",
    "politics",
    "business",
    "south african electronic dance music",
]

In [101]:
# Score every passage against the same candidate labels, one call per passage.
res1, res2, res3 = (
    zeroshot_cls(passage, candidate_labels=label_options)
    for passage in (cls_text1, cls_text2, cls_text3)
)

In [102]:
# One result dict per line.
print(res1, res2, res3, sep="\n")

{'sequence': 'This course covers content on the Transformers in HuggingFace.', 'labels': ['education', 'business', 'south african electronic dance music', 'politics'], 'scores': [0.8080251216888428, 0.09419836103916168, 0.05079915374517441, 0.046977367252111435]}
{'sequence': 'The BA 3.21 course covers management of mergers and acquisitions. It includes detailed analysis of recent business events, for example, the merger of Arizona Tea with Pepsi Cola in 2015, as reported on by the Harvard Business Review and Bloomberg.', 'labels': ['business', 'education', 'politics', 'south african electronic dance music'], 'scores': [0.9588165283203125, 0.01732027344405651, 0.013051297515630722, 0.010811956599354744]}
{'sequence': 'The song, I Love to Dance, was released by Run Around, a popular South African electronic duo.', 'labels': ['south african electronic dance music', 'business', 'education', 'politics'], 'scores': [0.9960295557975769, 0.0017729271203279495, 0.0012527249054983258, 0.0009448

# Text generation

In [112]:
# Pin the checkpoint and revision explicitly instead of relying on the task
# default, which the library warns against and which may change between
# releases. These are the values the library reported falling back to when no
# model was supplied.
text_gen1 = pipeline(
    "text-generation",
    model="gpt2",
    revision="6c0e608",
)

No model was supplied, defaulted to gpt2 and revision 6c0e608 (https://huggingface.co/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [113]:
# Prompt that both text-generation pipelines below are asked to continue.
gen_text1: str = "In this course, we will teach you how to"

In [118]:
# Returning several distinct continuations relies on sampling, so the output
# changes on every run unless the random number generators are seeded first.
# Seeding here keeps this cell reproducible under Restart & Run All.
set_seed(42)
res1 = text_gen1(gen_text1, max_new_tokens=50, num_return_sequences=3)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [119]:
# One generated sequence (as a dict) per line.
print("\n".join(map(str, res1)))

{'generated_text': 'In this course, we will teach you how to write JavaScript functions with jQuery and ES6. This course will expand on our previous ES6 course, which covers how to write JavaScript functions, such as JavaScript with jQuery and ES6.\n\nThis is a 3 part series.\n\nYou will'}
{'generated_text': 'In this course, we will teach you how to perform successful jobs on video on the web, and show you how to use the tools and APIs behind it to create the best results for all of you.\n\nLearn: How to Create A Job With A Video Platform and an App\n\n\nThis'}
{'generated_text': 'In this course, we will teach you how to manage your own social media strategies.\n\nWhat if I toldSSL to send an email that was a fake?\n\nSending an email with a fake message will cause your Social Media accounts to disappear and will prevent your social media accounts from being'}


## Specific model

In [116]:
# Same task as above, but with an explicitly chosen checkpoint.
text_gen2 = pipeline(
    task="text-generation",
    model="distilgpt2",
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [121]:
# Returning several distinct continuations relies on sampling, so the output
# changes on every run unless the random number generators are seeded first.
# Seeding here keeps this cell reproducible under Restart & Run All.
set_seed(42)
res1 = text_gen2(gen_text1, max_new_tokens=50, num_return_sequences=3)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [122]:
# One generated sequence (as a dict) per line.
print("\n".join(map(str, res1)))

{'generated_text': 'In this course, we will teach you how to properly test and understand the fundamentals of human psychology — the key to understanding the mind, how to recognize the emotional, psychological, emotional, and psychological responses to everyday life. Learn how to use it to improve your results over time, and learn how to'}
{'generated_text': 'In this course, we will teach you how to use a language that integrates both traditional and high level languages.\n\n\n\n\nTo read more related material on this course, click here: In the next section, I look at some of the most important points from this course:\n1. Introduction'}
{'generated_text': 'In this course, we will teach you how to build a powerful system in its entirety that lets you build powerful hardware. All the benefits of making hardware work for you can be found in our Beginners tutorial to build a powerful system that lets you build amazing hardware.'}


# Mask filling

In [137]:
# Masked-token prediction pipeline backed by the distilroberta-base checkpoint.
fill_mask = pipeline(
    task="fill-mask",
    model="distilroberta-base",
)

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [138]:
# Two prompts differing only by the added "about NLP" context, to see how the
# extra words shift the predictions for the <mask> slot.
mask_text1, mask_text2 = (
    "This course will teach you all about <mask> models.",
    "This course about NLP will teach you all about <mask> models.",
)

In [139]:
# Ask for the three highest-scoring fillers for each prompt.
res1, res2 = (
    fill_mask(prompt, top_k=3)
    for prompt in (mask_text1, mask_text2)
)

In [140]:
# One candidate per line: first prompt's candidates, then the second's.
print("\n".join(map(str, res1)))
print("\n".join(map(str, res2)))

{'score': 0.1961979866027832, 'token': 30412, 'token_str': ' mathematical', 'sequence': 'This course will teach you all about mathematical models.'}
{'score': 0.04052741825580597, 'token': 38163, 'token_str': ' computational', 'sequence': 'This course will teach you all about computational models.'}
{'score': 0.03301801159977913, 'token': 27930, 'token_str': ' predictive', 'sequence': 'This course will teach you all about predictive models.'}
{'score': 0.16762425005435944, 'token': 30412, 'token_str': ' mathematical', 'sequence': 'This course about NLP will teach you all about mathematical models.'}
{'score': 0.06418145447969437, 'token': 38163, 'token_str': ' computational', 'sequence': 'This course about NLP will teach you all about computational models.'}
{'score': 0.03192176669836044, 'token': 26739, 'token_str': ' neural', 'sequence': 'This course about NLP will teach you all about neural models.'}


# Named entity recognition

In [142]:
# `grouped_entities=True` is deprecated in the token-classification pipeline;
# `aggregation_strategy="simple"` is the documented replacement and is what the
# deprecated flag maps to when `ignore_subwords` is not set. It merges adjacent
# tokens of the same entity into a single span with an `entity_group` label.
ner = pipeline(
    "ner",
    model="dbmdz/bert-large-cased-finetuned-conll03-english",
    aggregation_strategy="simple",
)

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [156]:
# Three variants of one sentence: the base form, one with a job title and
# "NYC" added, and one that also swaps in a two-word name.
ner_text1, ner_text2, ner_text3 = (
    "My name is Sylvain and I work at Hugging Face in Brooklyn.",
    "My name is Sylvain and I work as a data scientist at Hugging Face in Brooklyn, NYC.",
    "My name is John Doe and I work as a data scientist at Hugging Face in Brooklyn, NYC.",
)

In [157]:
# Extract entities from each sentence, one call per sentence.
res1, res2, res3 = (
    ner(sentence)
    for sentence in (ner_text1, ner_text2, ner_text3)
)

In [158]:
# One entity per line, sentence by sentence.
print("\n".join(map(str, res1)))
print("\n".join(map(str, res2)))
print("\n".join(map(str, res3)))

{'entity_group': 'PER', 'score': 0.9981694, 'word': 'Sylvain', 'start': 11, 'end': 18}
{'entity_group': 'ORG', 'score': 0.9796019, 'word': 'Hugging Face', 'start': 33, 'end': 45}
{'entity_group': 'LOC', 'score': 0.9932106, 'word': 'Brooklyn', 'start': 49, 'end': 57}
{'entity_group': 'PER', 'score': 0.9986529, 'word': 'Sylvain', 'start': 11, 'end': 18}
{'entity_group': 'ORG', 'score': 0.9939623, 'word': 'Hugging Face', 'start': 53, 'end': 65}
{'entity_group': 'LOC', 'score': 0.99679935, 'word': 'Brooklyn', 'start': 69, 'end': 77}
{'entity_group': 'LOC', 'score': 0.99340254, 'word': 'NYC', 'start': 79, 'end': 82}
{'entity_group': 'PER', 'score': 0.9975205, 'word': 'John Doe', 'start': 11, 'end': 19}
{'entity_group': 'ORG', 'score': 0.9932844, 'word': 'Hugging Face', 'start': 54, 'end': 66}
{'entity_group': 'LOC', 'score': 0.9967991, 'word': 'Brooklyn', 'start': 70, 'end': 78}
{'entity_group': 'LOC', 'score': 0.9914799, 'word': 'NYC', 'start': 80, 'end': 83}


## Part-of-speech tagging

In [161]:
# `grouped_entities=True` is deprecated in the token-classification pipeline;
# `aggregation_strategy="simple"` is the documented replacement and is what the
# deprecated flag maps to when `ignore_subwords` is not set. Here the grouped
# labels are part-of-speech tags rather than named-entity types.
pos_tag = pipeline(
    "token-classification",
    model="vblagoje/bert-english-uncased-finetuned-pos",
    aggregation_strategy="simple",
)

Some weights of the model checkpoint at vblagoje/bert-english-uncased-finetuned-pos were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [162]:
# Tag the first NER example sentence with the part-of-speech pipeline.
res1 = pos_tag(ner_text1)

In [165]:
# One tagged span per line.
print("\n".join(map(str, res1)))

{'entity_group': 'PRON', 'score': 0.9994592, 'word': 'my', 'start': 0, 'end': 2}
{'entity_group': 'NOUN', 'score': 0.99601364, 'word': 'name', 'start': 3, 'end': 7}
{'entity_group': 'AUX', 'score': 0.9953696, 'word': 'is', 'start': 8, 'end': 10}
{'entity_group': 'PROPN', 'score': 0.9981525, 'word': 'sylvain', 'start': 11, 'end': 18}
{'entity_group': 'CCONJ', 'score': 0.99918765, 'word': 'and', 'start': 19, 'end': 22}
{'entity_group': 'PRON', 'score': 0.9994679, 'word': 'i', 'start': 23, 'end': 24}
{'entity_group': 'VERB', 'score': 0.99923587, 'word': 'work', 'start': 25, 'end': 29}
{'entity_group': 'ADP', 'score': 0.9063106, 'word': 'at', 'start': 30, 'end': 32}
{'entity_group': 'PROPN', 'score': 0.7190516, 'word': 'hugging face', 'start': 33, 'end': 45}
{'entity_group': 'ADP', 'score': 0.9993789, 'word': 'in', 'start': 46, 'end': 48}
{'entity_group': 'PROPN', 'score': 0.9989513, 'word': 'brooklyn', 'start': 49, 'end': 57}
{'entity_group': 'PUNCT', 'score': 0.99963903, 'word': '.', 'st

# Question answering

In [167]:
# Extractive question-answering pipeline with an explicitly chosen checkpoint.
qa = pipeline(
    task="question-answering",
    model="distilbert-base-cased-distilled-squad",
)

In [177]:
# Passage to search, and the question to answer from it.
context1, question1 = (
    "My name is Sylvain and I work at Hugging Face in Brooklyn",
    "Where do I work?",
)

In [179]:
# Ask the question against the passage; both inputs are passed by keyword.
res1 = qa(
    question=question1,
    context=context1,
)

In [180]:
# Show the question-answering result stored in res1.
print(res1)

{'score': 0.6949767470359802, 'start': 33, 'end': 45, 'answer': 'Hugging Face'}


# Summarization

In [188]:
# Summarization pipeline with an explicitly chosen checkpoint.
summarizer = pipeline(
    task="summarization",
    model="sshleifer/distilbart-cnn-12-6",
)

In [189]:
# Two-paragraph passage to summarize. The literal begins with a newline and
# every text line carries four spaces of indentation; all of that whitespace is
# part of the string handed to the summarizer.
text_inp1 = """
    America has changed dramatically during recent years. Not only has the number of 
    graduates in traditional engineering disciplines such as mechanical, civil, 
    electrical, chemical, and aeronautical engineering declined, but in most of 
    the premier American universities engineering curricula now concentrate on 
    and encourage largely the study of engineering science. As a result, there 
    are declining offerings in engineering subjects dealing with infrastructure, 
    the environment, and related issues, and greater concentration on high 
    technology subjects, largely supporting increasingly complex scientific 
    developments. While the latter is important, it should not be at the expense 
    of more traditional engineering.

    Rapidly developing economies such as China and India, as well as other 
    industrial countries in Europe and Asia, continue to encourage and advance 
    the teaching of engineering. Both China and India, respectively, graduate 
    six and eight times as many traditional engineers as does the United States. 
    Other industrial countries at minimum maintain their output, while America 
    suffers an increasingly serious decline in the number of engineering graduates 
    and a lack of well-educated engineers.
"""

In [197]:
# Summarize the passage, passing explicit lower and upper length limits.
res1 = summarizer(
    text_inp1,
    max_length=80,
    min_length=40,
)

In [198]:
# Show the summarizer output stored in res1.
print(res1)

[{'summary_text': ' America has changed dramatically during recent years . The number of engineering graduates in the U.S. has declined in traditional engineering disciplines such as mechanical, civil, electrical, chemical, and aeronautical engineering . Rapidly developing economies such as China and India continue to encourage and advance the teaching of engineering .'}]


# Translation

In [203]:
# Translation pipeline using the Helsinki-NLP opus-mt-fr-en checkpoint.
translator1 = pipeline(
    task="translation",
    model="Helsinki-NLP/opus-mt-fr-en",
)

In [204]:
# French sentence to translate.
source_text1: str = "Ce cours est produit par Hugging Face."

In [205]:
# Translate the French sentence with the first translation pipeline.
res1 = translator1(source_text1)

In [206]:
# Show the translation output stored in res1.
print(res1)

[{'translation_text': 'This course is produced by Hugging Face.'}]


In [207]:
# Second translation pipeline, using a Spanish/Portuguese-to-English checkpoint.
translator2 = pipeline(
    task="translation",
    model="salesken/translation-spanish-and-portuguese-to-english",
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/310M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/300 [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

In [208]:
# Spanish sentence to translate.
source_text2: str = "me gustan las peliculas existencialistas."

In [209]:
# Translate the Spanish sentence with the second translation pipeline.
res2 = translator2(source_text2)

In [210]:
# Show the translation output stored in res2.
print(res2)

[{'translation_text': 'I like existential films.'}]
