# Step 0: Importing Required Packages



In [1]:
import pandas as pd
from scipy import stats
from transformers import pipeline, set_seed

# Text Classification


In [10]:
# Sentiment classification of a deliberately mixed review.
# The model and revision are pinned to the checkpoint the pipeline was
# already picking by default (as reported in this cell's log output), so
# reruns load the same weights and the "No model was supplied" warning
# no longer appears.
textclassifier = pipeline(
    task="text-classification",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    revision="714eb0f",
)
print(textclassifier("I dont like this movie, but I watched 5 times"))

# Challenge: Confuse the model! Find an input for which the model's score (confidence) drops below 0.6

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


[{'label': 'POSITIVE', 'score': 0.8032945394515991}]


# Token Classification

In [23]:
# Named-entity tagging of a single ambiguous word.
# The checkpoint and revision are pinned to what the pipeline previously
# resolved by default (see this cell's log output) so the result is
# reproducible across library upgrades.
classifier = pipeline(
    task="token-classification",
    model="dbmdz/bert-large-cased-finetuned-conll03-english",
    revision="4c53496",
)
print(classifier("Amazon"))

# Challenge: Searching or Trying? Find 5 more entity types (e.g., "I-PER", "I-LOC", etc.)

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision 4c53496 (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


[{'entity': 'I-LOC', 'score': 0.99612623, 'index': 1, 'word': 'Amazon', 'start': 0, 'end': 6}]


# Fill Mask

In [28]:
# Masked-token prediction; the prompt contains two <mask> slots, and the
# pipeline returns one list of candidates per slot.
# The checkpoint and revision are pinned to what the pipeline previously
# resolved by default (see this cell's log output).
classifier = pipeline(
    "fill-mask",
    model="distilbert/distilroberta-base",
    revision="fb53ab8",
)
print(classifier("Go to the <mask> everyday, to get <mask>"))

# Challenge: Undercover: try increasing the number of masked words to see how the model behaves

No model was supplied, defaulted to distilbert/distilroberta-base and revision fb53ab8 (https://huggingface.co/distilbert/distilroberta-base).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at distilbert/distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


[[{'score': 0.4818406105041504, 'token': 6545, 'token_str': ' gym', 'sequence': '<s>Go to the gym everyday, to get<mask></s>'}, {'score': 0.07621841132640839, 'token': 12647, 'token_str': ' supermarket', 'sequence': '<s>Go to the supermarket everyday, to get<mask></s>'}, {'score': 0.057290125638246536, 'token': 1400, 'token_str': ' store', 'sequence': '<s>Go to the store everyday, to get<mask></s>'}, {'score': 0.02142256125807762, 'token': 9367, 'token_str': ' mall', 'sequence': '<s>Go to the mall everyday, to get<mask></s>'}, {'score': 0.01669922098517418, 'token': 5560, 'token_str': ' library', 'sequence': '<s>Go to the library everyday, to get<mask></s>'}], [{'score': 0.0709790587425232, 'token': 12018, 'token_str': ' discounts', 'sequence': '<s>Go to the<mask> everyday, to get discounts</s>'}, {'score': 0.051238808780908585, 'token': 554, 'token_str': ' started', 'sequence': '<s>Go to the<mask> everyday, to get started</s>'}, {'score': 0.02387554943561554, 'token': 3496, 'token_str

# Table Question Answering

In [34]:
# Question answering over a small table.
# The checkpoint and revision are pinned to what the pipeline previously
# resolved by default (see this cell's log output).
tqa = pipeline(
    task="table-question-answering",
    model="google/tapas-base-finetuned-wtq",
    revision="e3dde19",
)
# Counts are kept as strings, exactly as in the original table.
# NOTE(review): presumably the TAPAS tokenizer expects text-only cells -- confirm.
data = {
    "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
    "Number of movies": ["87", "53", "69"],
}
table = pd.DataFrame(data)
question = "who has more movies?"
# Print only the first cell the model selected as the answer.
print(tqa(table=table, query=question)["cells"][0])

# Challenge: Predictable model! How many different answers might we see?

No model was supplied, defaulted to google/tapas-base-finetuned-wtq and revision e3dde19 (https://huggingface.co/google/tapas-base-finetuned-wtq).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


Brad Pitt


# Question Answering

In [35]:
# Extractive question answering: the answer is a span taken from `context`.
# The checkpoint and revision are pinned to what the pipeline previously
# resolved by default (see this cell's log output).
qa = pipeline(
    task="question-answering",
    model="distilbert/distilbert-base-cased-distilled-squad",
    revision="564e9b5",
)
context = "Brad Pitt has 87, Leonardo Di Caprio has 53, and George Clooney has 69 movies."
question = "how many movies does Leonardo Di Caprio have?"
print(qa(question=question, context=context))

# Challenge: Let's take a deeper look! See the architecture of the model by calling .model.config

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 564e9b5 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Device set to use cpu


{'score': 0.9822641611099243, 'start': 41, 'end': 43, 'answer': '53'}


# Zero-Shot Classification

In [38]:
# Zero-shot classification: score a sentence against labels supplied at call time.
# The checkpoint and revision are pinned to what the pipeline previously
# resolved by default (see this cell's log output).
zsc = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
    revision="d7645e1",
)
print(
    zsc(
        "Inception is the best movie ever",
        candidate_labels=["CINEMA", "MUSIC", "ART"],
    )
)

# Challenge: Unique English words! What is the vocab_size of the Tokenizer used by zsc?

No model was supplied, defaulted to facebook/bart-large-mnli and revision d7645e1 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


{'sequence': 'Inception is the best movie ever', 'labels': ['CINEMA', 'ART', 'MUSIC'], 'scores': [0.7417240738868713, 0.23560434579849243, 0.022671552374958992]}


# Translation

In [39]:
# English -> French translation.
# The checkpoint and revision are pinned to what the pipeline previously
# resolved by default (see this cell's log output).
en_fr_translator = pipeline(
    task="translation_en_to_fr",
    model="google-t5/t5-base",
    revision="a9723ea",
)
en_fr_translator("How far is the closest city?")

# Challenge: Multilinguality! How many languages does this task support?

No model was supplied, defaulted to google-t5/t5-base and revision a9723ea (https://huggingface.co/google-t5/t5-base).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Device set to use cpu


[{'translation_text': ' quelle distance se trouve la ville la plus proche ?'}]

# Summarization

In [40]:
# Abstractive summarization of a short paragraph about Paris.
# The checkpoint and revision are pinned to what the pipeline previously
# resolved by default (see this cell's log output).
summarizer = pipeline(
    task="summarization",
    model="sshleifer/distilbart-cnn-12-6",
    revision="a4f8f3e",
)
summarizer("Paris is the capital and most populous city of France, with an estimated population of 2,175,601 residents as of 2018, in an area of more than 105 square kilometres (41 square miles). The City of Paris is the centre and seat of government of the region and province of Île-de-France, or Paris Region, which has an estimated population of 12,174,880, or about 18 percent of the population of France as of 2017.")

# Challenge: SHORTEEERRR! Force the model to keep the summary under 10 words.

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Device set to use cpu
Your max_length is set to 142, but your input_length is only 96. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=48)


[{'summary_text': ' Paris is the capital and most populous city of France, with an estimated population of 2,175,601 residents as of 2018 . The city is the centre and seat of government of the region and province of Île-de-France, or Paris Region . Paris Region has an estimated 18 percent of the population of France as of 2017 .'}]

# Text Generation

In [41]:
# Open-ended text generation from a short prompt.
# The two returned continuations differ, so decoding involves sampling;
# fixing the seed makes a rerun of this cell produce the same text.
set_seed(42)
# The checkpoint and revision are pinned to what the pipeline previously
# resolved by default (see this cell's log output).
generator = pipeline(
    task="text-generation",
    model="openai-community/gpt2",
    revision="607a30d",
)
generator("Hello, I'm a student at", num_return_sequences=2)

# Challenge: Lullaby! Force the model to tell you a single long story.

No model was supplied, defaulted to openai-community/gpt2 and revision 607a30d (https://huggingface.co/openai-community/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Hello, I'm a student at a prestigious university. I live in Germany.\n\nIn addition to my job, I work part time, sometimes for a small, small part-time organization that I like working for. I live with my parents"},
 {'generated_text': "Hello, I'm a student at California Tech. My job for some time was to go up in the sky out of order to capture and report things. I didn't have an instructor..."}]

# Feature Extraction

In [42]:
# Extract hidden-state features and reduce them to a single vector.
# The checkpoint and revision are pinned to what the pipeline previously
# resolved by default (see this cell's log output).
feature_extractor = pipeline(
    "feature-extraction",
    model="distilbert/distilbert-base-cased",
    revision="6ea8117",
    framework="pt",
)
text = "Transformers is an awesome library!"
# Take the first item of the returned tensor and average over its leading axis.
# NOTE(review): presumably that axis is the token axis, giving one vector per sentence -- confirm.
feature_extractor(text, return_tensors="pt")[0].numpy().mean(axis=0)

# Challenge: Does the framework matter? "pt" stands for PyTorch. Will we get the same output if we use TensorFlow?

No model was supplied, defaulted to distilbert/distilbert-base-cased and revision 6ea8117 (https://huggingface.co/distilbert/distilbert-base-cased).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Device set to use cpu


array([ 3.64021957e-01,  3.18241090e-01, -1.62191372e-02,  1.11089326e-01,
        4.01385725e-02, -1.69426396e-01,  2.77342737e-01,  7.25674927e-02,
        8.48770440e-02, -1.35105103e-01,  8.68056789e-02,  1.67308599e-01,
       -1.33444577e-01,  3.45962167e-01, -5.39072096e-01, -5.93828186e-02,
       -1.25284672e-01,  2.32461393e-02,  3.71414870e-02, -8.08689445e-02,
        1.34332746e-01, -2.08202809e-01, -1.03569373e-01, -1.17489398e-02,
       -1.02696173e-01, -1.00813568e-01,  4.27819550e-01,  7.39083529e-01,
       -1.74560726e-01,  4.99700010e-01, -7.20076589e-03,  4.23863530e-04,
       -2.33920775e-02,  2.97357403e-02, -2.96980590e-01,  2.44169131e-01,
        8.18386599e-02,  2.48571381e-01, -2.17247337e-01, -3.64355236e-01,
       -1.26975119e-01,  3.87528896e-01,  2.28286758e-02, -2.81162024e-01,
        1.11271627e-01, -1.22055203e-01, -1.85853451e-01, -2.87077039e-01,
       -3.40078145e-01,  1.18320510e-01, -4.97630239e-03, -6.59284651e-01,
       -1.24732092e-01,  

# Sentence Similarity

In [43]:
from sentence_transformers import SentenceTransformer

# Two unrelated sentences whose pairwise similarity we want to inspect.
sentences = ["Competition day is next week", "Mastering this will greatly help"]

# Load the sentence-embedding model and encode both sentences in one call.
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(sentences)

# Compare every embedding with every other one (including itself) and
# print the resulting similarity matrix.
similarities = model.similarity(embeddings, embeddings)
print(similarities)


# Challenges:
    # What was different for this task? Try pipeline('sentence-similarity')
    # Try adding more sentences
    # What is the vector size (embedding dimension)?
    # Try words instead of sentences

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tensor([[ 1.0000, -0.0090],
        [-0.0090,  1.0000]])
