# My Tutorial on HuggingFace

# Chapter 1

In [1]:
from transformers import pipeline

In [2]:
# The sentiment-analysis pipeline classifies the input text as POSITIVE or NEGATIVE.
# Each result is a dict with a 'label' and a 'score'; the score is the model's
# confidence in that label (a single number, not a confidence interval).
# No model is named here, so the pipeline falls back to its default checkpoint for the task.

classifier = pipeline("sentiment-analysis")
classifier("I'm watching an hour long Natural Language Processing Video to understand this process")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


[{'label': 'NEGATIVE', 'score': 0.9550794363021851}]

In [3]:
# Pass several sentences in one list so the pipeline processes them together as a batch.
# The result is a list with one {'label', 'score'} dict per input sentence.

classifier(
    [
        "I am very excited for New Years Eve!",
        "I hate not being able to travel as much due to COVID-19.",
    ]
)

[{'label': 'POSITIVE', 'score': 0.9998338222503662},
 {'label': 'NEGATIVE', 'score': 0.9997290968894958}]

In [4]:
# The zero-shot-classification pipeline handles more general text classification:
# the candidate labels are supplied at call time, and the result scores how well the
# input text relates to each one (here: education, business, or programming).
# It gets its own name instead of rebinding `classifier`, because it needs `candidate_labels`:
# reusing the name would make the sentiment cells fail if they were re-run after this one.

zero_shot_classifier = pipeline("zero-shot-classification")
zero_shot_classifier(
    "This course is about learning to explore and visualizing real-world datasets using Python",
    candidate_labels=["education", "business", "programming"],
)

No model was supplied, defaulted to facebook/bart-large-mnli (https://huggingface.co/facebook/bart-large-mnli)


{'sequence': 'This course is about learning to explore and visualizing real-world datasets using Python',
 'labels': ['programming', 'education', 'business'],
 'scores': [0.66228187084198, 0.32246798276901245, 0.015250138007104397]}

In [5]:
# The text-generation pipeline auto-completes a given prompt. The output is generated with
# a bit of randomness, so it changes each time the cell is run.
# NOTE(review): no seed is set in this notebook, so the recorded output will not reproduce exactly.
# The prompt is the beginning of a sentence from the APAN website's Python class description.
# The full sentence is: The students in this course will learn to examine raw data with the purpose of deriving insights and drawing conclusions

generator = pipeline("text-generation")
generator("The students in this course will learn to examine raw data with the")

No model was supplied, defaulted to gpt2 (https://huggingface.co/gpt2)
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'The students in this course will learn to examine raw data with the use of tools and techniques in order to solve the problem, and learn practical tools that will enable students to focus on solving and developing best practice. The following chapters cover topics such as:'}]

In [6]:
# When a model is not explicitly provided, the default model for the task is selected. However, you can select any model that has been pretrained for the task on https://huggingface.co/models
# Let's do text generation again with another model, distilgpt2 (a lighter version of the gpt2 model), and see what happens.
# Because there is some randomness in generation, we can also pass arguments such as the max length of the generated text and the number of sequences to return.
# NOTE(review): "<mask>" is a fill-mask placeholder; the text-generation model treats it as ordinary text
# (the recorded output repeats it verbatim). Consider removing it from the prompt - TODO confirm intent.
# NOTE(review): max_length presumably counts the prompt tokens as well, leaving little room for new text - verify.

generator = pipeline("text-generation", model="distilgpt2")
generator(
    "The students in this course will learn to examine raw data with the <mask> of deriving insights and drawing conclusions", 
    max_length=30, 
    num_return_sequences=2,
)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'The students in this course will learn to examine raw data with the <mask> of deriving insights and drawing conclusions on their knowledge.'},
 {'generated_text': 'The students in this course will learn to examine raw data with the <mask> of deriving insights and drawing conclusions from the same source. This practice'}]

In [7]:
# The fill-mask pipeline uses a model whose pretraining objective was to guess the missing words in a sentence.
# Here top_k=2 asks for the 2 most likely values for the masked word according to the model.
# NOTE(review): the mask placeholder ("<mask>" here) presumably depends on the model - verify if the model is changed.

unmasker = pipeline("fill-mask")
unmasker("The students in this course will learn to examine raw data with the purpose of deriving <mask> and drawing conclusions", top_k=2)

No model was supplied, defaulted to distilroberta-base (https://huggingface.co/distilroberta-base)


[{'sequence': 'The students in this course will learn to examine raw data with the purpose of deriving hypotheses and drawing conclusions',
  'score': 0.2825345993041992,
  'token': 44850,
  'token_str': ' hypotheses'},
 {'sequence': 'The students in this course will learn to examine raw data with the purpose of deriving trends and drawing conclusions',
  'score': 0.08611495047807693,
  'token': 3926,
  'token_str': ' trends'}]

In [8]:
# The NER (named entity recognition) pipeline classifies each word in a sentence instead of the
# sentence as a whole, tagging entities such as persons (PER), organizations (ORG) or locations (LOC).
# aggregation_strategy="simple" makes the pipeline group together the tokens that belong to the
# same entity (e.g. "Goldman Sachs"). It replaces the deprecated grouped_entities=True flag,
# which maps to this same strategy, so the grouped results are unchanged.

ner = pipeline("ner", aggregation_strategy="simple")
ner("My name is Michael and I work at Goldman Sachs in Manhattan.")

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english)


[{'entity_group': 'PER',
  'score': 0.99904186,
  'word': 'Michael',
  'start': 11,
  'end': 18},
 {'entity_group': 'ORG',
  'score': 0.99903166,
  'word': 'Goldman Sachs',
  'start': 33,
  'end': 46},
 {'entity_group': 'LOC',
  'score': 0.99840856,
  'word': 'Manhattan',
  'start': 50,
  'end': 59}]

In [9]:
# Another task available through the pipeline API is extractive question answering:
# given a context and a question, the model identifies the span of the context that contains the answer.
# The result reports the answer text, its start/end character offsets in the context, and a score.

question_answerer = pipeline("question-answering")
question_answerer(
    question="Where do I work?", 
    context="My name is Michael and I work at Goldman Sachs in Manhattan."
)

No model was supplied, defaulted to distilbert-base-cased-distilled-squad (https://huggingface.co/distilbert-base-cased-distilled-squad)


{'score': 0.7812808752059937,
 'start': 33,
 'end': 46,
 'answer': 'Goldman Sachs'}

In [10]:
# Summarizing long text is another task the transformers library supports, through the summarization pipeline.
# The article to summarize is passed as one triple-quoted string; the result is a list with a 'summary_text' entry.

summarizer = pipeline("summarization")
summarizer("""Spotify is not joking around amid a dispute over royalties for comedy content. The streaming giant has removed the work of hundreds of comedians from its platform -- including Tiffany Haddish, Kevin Hart and the late Robin Williams -- according to rights agency Spoken Giants.

Spoken Giants, which represents some of the affected comedians, describes itself as "the first global rights administration company for the owners and creators of spoken word copyrights," and aims to get streaming platforms to pay comedians for writing jokes in the same way songwriters are paid.
The group told CNN that the take-down happened on November 24, and said that it never requested the content's removal.
"Unfortunately, Spotify removed the work of individual comedians rather than continue to negotiate," CEO Jim King told CNN.
"With this take-down, individual comedians are now being penalized for collectively requesting the same compensation songwriters receive," he added. "After Spotify removed our members' work, we reached out but have not received a response. We have now requested an immediate meeting to resolve this situation."
A Spotify spokesperson told CNN that the streaming platform had already paid "significant amounts of money" to offer the comedy content to listeners, and "would love to continue to do so."
"However, given that Spoken Giants is disputing what rights various licensors have, it's imperative that the labels that distribute this content, Spotify and Spoken Giants come together to resolve this issue to ensure this content remains available to fans around the globe," the spokesperson said.
Although the content is still available on other platforms including Pandora and Sirius, Spoken Giants said comedians with lower profiles and revenues could suffer from losing Spotify as a platform.
On social media, New York-based comedian Joe Zimmerman called the move "corporate bullying." Another New York-based comedian, Liz Miele, tweeted that her albums had also been removed from the platform because comedians "had the audacity to ask for money owed to us," and jokingly compared herself to singer Taylor Swift.
Swift was previously engaged in a dispute with Spotify, arguing artists were not paid enough. The singer pulled her entire catalog from the platform in 2014, but reversed her decision in 2017
""")

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 (https://huggingface.co/sshleifer/distilbart-cnn-12-6)


[{'summary_text': ' Spotify has removed the work of hundreds of comedians from its platform . Comedians include Robin Williams, Tiffany Haddish, Kevin Hart and Robin Williams . Spoken Giants, which represents some of the affected comedians, says it never requested the content\'s removal . A Spotify spokesperson says the streaming platform had already paid "significant amounts of money" to offer the comedy content to listeners .'}]

In [13]:
# Spanish -> English translation with a model chosen explicitly.
# The model's output is quite literal; I would translate this proverb more as
# "there is a little truth behind every joke".

translator = pipeline("translation", model="Helsinki-NLP/opus-mt-es-en")
translator("Entre broma y broma la verdad se asoma.")

[{'translation_text': 'Between jokes and jokes the truth comes out.'}]

In [16]:
# Try out the English -> Chinese translation model on an English sentence.

translator = pipeline("translation", model="Helsinki-NLP/opus-mt-en-zh")
translator("There is a little truth behind every joke.")

[{'translation_text': '每一个笑话背后都有一点点真相。'}]

# Decoders, Encoders, and Transformers
####  decoders are trained to guess next word which is why they are good at generating text,
#### encoder models are pre-trained by filling random mask in a sentence so better at output,
#### transformers models are just massive,
#### lets build summarization model and assess accuracy using zero shot on both the article and summary, bleu, and a custom scoring methodology

# Transfer Learning
#### transfer learning is when you take a model pre-trained on a lot of data and reuse it by fine-tuning it on the new task you want to work with - reusing the model instead of training one from scratch means you need less data to train your model
#### gpt2 was pre-trained using the content of 45 million links posted by users on reddit - good model for guessing next word in sentence
#### predict value of randomly masked words - bert was pretrained this way using english wikipedia and 11,000 unpublished books
#### but pre-trained model contains knowledge but also bias it already has, ex: imagenet contains images from united states and western europe so models fine-tuned on it usually do better on images from these countries

# Transformer Architecture

#### decoders, encoders, sequence to sequence (encoder-decoder)
#### encoder converts to numerical - is bi-directional, self-attention (inputs)
#### decoder "decodes" the representations from the encoder (output probabilities), uni-directional, auto-regressive  traditional use, masked self-attention
#### encoder-decoder (sequence to sequence) combines decoder and encoder, the encoder accepts inputs and computes a high level representation of those inputs, these outputs are then passed to decoder that uses these outputs alongside other inputs to generate a prediction output which it will use in future - hence auto-regressive

#### encoders - the attention mechanism is allowed to look at every word in the sentence so the word before and after, like BERT model, when need to guess value of masked word it is useful to look at what was before and after
#### decoder - like gpt have to predict the next word so if they were allowed to look at word after, it would be cheating so they are only allowed to look at what was before

# Encoder Models

#### BERT is popular encoder model, ALBERT, ELECTRA, RoBERTa, DistilBERT
#### retrieves numerical representation of each word
#### encoder outputs one sequence of numbers per input word aka feature vector/tensor
#### each vector is numerical representation of word and the dimension of that vector is defined by the architecture of the model - for base BERT model it is 768
#### these vectors contain the value of a word but contextualized, for example the vector representation of the word "to" isn't just "to", it also takes into account the words around it which are called context (right and left context)
#### "Welcome to NYC" - for the word "to" the left context is "Welcome" and the right context is "NYC" -- output is based on contexts so it is a contextualized vector thanks to self-attention mechanism
#### self attention mechanism relates different positions (different words) in a single sequence to one another in order to compute a representation of that sequence
#### should use encoder as stand alone models for sequence classification, question answering, and masked language modeling, etc... very powerful at extracting vectors that are meaningful in a sequence 
#### encoders really shine in masked language modeling (MLM): predicting a hidden word in a sequence of words -- BERT was trained on this bc bi-directional information is crucial in this task
#### requires semantic understanding as well as syntactic understanding
#### encoders are good at sequence classification like sentiment-analysis - aim is to identify sentiment of a sequence like 1-5 stars for review, positive or negative rating of a sequence - even if the words are the same, their order can make a sequence mean something completely different
#### BERT has maximum length of 512 tokens so generally cannot use input of greater than 512 tokens in model, some models like Longformer can accept longer context so should read documentation for specific encoders and their abilities, can split a long text into several chunks of 512 tokens, pass each of those chunks into the model, and average what you get at the end to try to train a classifier for longer texts

# Decoder Models

#### popular decoder model is gpt2
#### can generally be used for the same tasks as encoders but with a little loss of performance
#### "Welcome to NYC" -- gets numerical representation for each word, outputs one sequence of numbers per input word, this is a feature vector/tensor
#### one vector per word passed through decoder, each vector is numerical representation of word, dimension of vector defined by the architecture of the model
#### differs from encoder is with self attention mechanism - uses masked self attention
#### word "to" would be unmodified by the word "NYC" bc the right context of the word is all masked, doesn't benefit from bi-directional context, only a single context depending on what is masked
#### self-attention mechanism- provides additional mask to hide context in one direction, so vector is not affected by words in the hidden context
#### should use as standalone models to generate numerical representations, and can be used in a wide variety of tasks, but having access only to the left context makes them great at text generation / ability to generate a word or sequence of words given a string of words aka causal language modeling or natural language generation
#### causal language modeling -- start with word "my", decoder outputs a vector of numbers which the language modeling head maps to scores over all words known by the model - and will predict the most probable following word, adds that to sequence so if picks "name" then uses "my name" to predict next word aka auto-regressive
#### passes "my name" through decoder and then says "is" -- gpt2 has maximum context size of 1024 tokens

# Sequence to Sequence Models (combine encoders and decoders) / Encoder-Decoder Architecture

#### popular encoder-decoder models are T5, BART, ProphetNet, mT5, M2M100, Pegasus, MarianMT, mBART, etc...
#### encoder takes words as inputs passes through encoder, and gives numerical representation for each word passed through, and contains info about meaning of sequence
#### decoder is passed outputs from encoder directly, in addition it is given a sequence; when prompting the decoder with no initial sequence, we can give it the value that indicates the start of a sequence (this is where magic happens)
#### encoder accepts sequence as input and computes a prediction and outputs a numerical representation, then sends that over to the decoder, it has in a sense "encoded" that sequence and the decoder uses this input alongside its usual sequence input to try to decode the sequence
#### the decoder decodes the sequence and outputs a word, don't need to make sense of it, but the decoder is decoding what the encoder has output, the start of sequence word indicates it should start decoding the sequence - so don't need the encoder anymore
#### decoder acts in auto-regressive manner so can take what was just output as an input - this in combination with numerical representation provided by encoder can be used to generate a second word and continues on and on until the decoder outputs a value that is a stopping value like a . indicating end of a sequence
#### so we have initial sequence sent to encoder, encoder output sent to decoder to be decoded, then can discard encoder after single use - then decoder can be used several times until we have generated every word that we need
#### transduction (act of translating a sequence) -- can use transformer model built for that task -- take "welcome to nyc" and translate to french - encoder encodes the english words and passes its output to the decoder together with the start of sequence word to output the first word "bienvenue", then "bienvenue" is used as input sequence for the decoder, this along with the encoder numerical representation is used to predict the second word "à" (the translation of "to"), then uses "bienvenue à" and can predict "nyc"
#### encoder and decoder often do not share weights - so very good - entire encoder block can be trained to understand the sequence and extract relevant information, for translation scenario would mean parsing and understanding what was said in english, and extracting info from that language and putting it in an information-dense vector, then have decoder whose sole purpose is to decode numerical representation output by encoder, this decoder can be specialized in completely different language or modality like images
#### encoder-decoder models are able to manage sequence to sequence tasks like translation, weights between encoder and decoder are not necessarily shared
#### translate "transformers are powerful" into french: takes 3 english words and outputs 4 french words "les transformers sont puissants" -- decoder can do this on its own due to auto-regression
#### summarization very strong - since encoder and decoder are separate can have very long context for encoder to handle text and smaller context for decoder which handles summarized sequence
#### can also load an encoder and decoder inside an encoder-decoder model, so can choose to use specific encoders and decoders which are good at specific tasks - customizable
#### cool to paraphrase from formal to informal using encoder-decoder

# Bias and Limitations

#### powerful but dont control input to output, more controlled by training so need precautions to avoid predictions you dont want in deployment
#### BERT has been pretrained in filling masked words - change gender outputs completely different items - this MAN works at more likely to output (carpenter, doctor, mechanic), female more likely to output (maid, nurse, waitress, teacher, prostitute)
#### gpt2 trained on internet reddit so more sexist, etc...
#### these biases will persist even after fine-tuning - need enough samples of outputs you want to see and always analyze results before putting a model in production - if want to avoid some outputs try to provide more training data that corrects the bias of the model

# Chapter 2 - Transformers, Model APIs, and Tokenizers used to convert text into format models can process

# What happens in pipeline function

#### How did the sentiment pipeline go from raw text to a positive or negative label? it converts raw text to numbers using a tokenizer, then those numbers go through the model which outputs logits, then post-processing steps transform those logits into labels and scores
#### Tokenization process - text split into tokens, then tokenizer adds special tokens if the model expects them, then tokenizer matches each token to its unique id in the vocabulary of the tokenizer - AutoTokenizer API

In [17]:
from transformers import AutoTokenizer

# Same checkpoint that the sentiment-analysis pipeline reported as its default earlier in this notebook.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# Load the tokenizer associated with that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [20]:
# Step 1 - Tokenizer

raw_inputs = [
    "I have been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]

# padding=True: the two sentences are not the same length, so the shorter one is padded
#   to make it possible to build a single rectangular tensor.
# truncation=True: any sentence longer than the maximum the model can handle is truncated.
# return_tensors="pt": tells the tokenizer to return PyTorch tensors.
# (The stray trailing backslash after this call was removed; it silently joined the
# statement to the following line.)
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")

# The result is a dictionary with 2 keys:
#   input_ids      - the token ids of both sentences, with zero padding applied to the shorter one
#   attention_mask - marks where padding has been applied (0) so the model does not pay attention to it
inputs

{'input_ids': tensor([[  101,  1045,  2031,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]])}

In [21]:
# Step 2 - Model (body only, no task head)

from transformers import AutoModel

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

# The from_pretrained method downloads and caches the configuration of the model as well as the
# pre-trained weights. However, the AutoModel API only instantiates the body of the model
# (the part of the model left once the pre-training head is removed).
# It outputs a high-dimensional tensor that represents the sentences passed in, but which is
# not directly useful for a classification problem.
# The names base_model/base_outputs keep this cell from sharing `model`/`outputs` with the
# sequence-classification cell: re-running this cell can then no longer replace the `outputs`
# object whose `.logits` the post-processing step reads.
base_model = AutoModel.from_pretrained(checkpoint)
base_outputs = base_model(**inputs)
print(base_outputs.last_hidden_state.shape)

# output torch.Size([2, 15, 768]) -> [batch size, sequence length, hidden size]
# i.e. the tensor holds 2 sentences, each of 15 tokens (after padding), and the last dimension is the hidden size of the model, 768


Some weights of the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing DistilBertModel: ['classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


torch.Size([2, 15, 768])


In [22]:
# Step 2 (continued) - Model with a classification head

# To get an output usable for a classification problem, use AutoModelForSequenceClassification.
# It works like the AutoModel class but builds a model with a classification head - there is one auto class for each common NLP task in the transformers library.

from transformers import AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
outputs = model(**inputs)
print(outputs.logits)

# output tensor([[-1.3782,  1.4346],
#        [ 4.1692, -3.3464]], grad_fn=<AddmmBackward0>)

# The tensor is 2x2: one row per sentence and one column per possible label. These outputs are not probabilities yet - you can tell because they are not all positive and do not sum to 1.
# That is because models in the transformers library return logits - to make sense of those logits we need to look at the post-processing step.

tensor([[-1.3782,  1.4346],
        [ 4.1692, -3.3464]], grad_fn=<AddmmBackward0>)


In [23]:
# Step 3 - Post-processing (logits to predictions)

import torch

# To convert logits to probabilities, apply a softmax to them (dim=-1: across the label scores of each sentence).

predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)

# The output shows positive numbers that sum to 1 for each sentence.
# The last step is to know which of those corresponds to the positive or the negative label - this is given by the id2label field of the model config.
# Index [0] probabilities correspond to the negative label, index [1] to the positive label.


# output tensor([[5.6636e-02, 9.4336e-01],
#        [9.9946e-01, 5.4418e-04]], grad_fn=<SoftmaxBackward0>)




tensor([[5.6636e-02, 9.4336e-01],
        [9.9946e-01, 5.4418e-04]], grad_fn=<SoftmaxBackward0>)


# More coming shortly