Source: https://learn.deeplearning.ai/courses/open-source-models-hugging-face/

# **1. Pipeline demo + LLMs are stateless**

In [None]:
!pip install transformers

In [None]:
from transformers import pipeline

# Text-generation pipeline built on the "openai-community/gpt2" checkpoint.
pipe = pipeline(
    task="text-generation",
    model="openai-community/gpt2",
)

In [None]:
# Single prompt; the leading and trailing newlines are part of the text
# that is sent to the pipeline.
user_message = "\nWhat are some fun activities I can do in the winter?\n"

# Run the prompt through the pipeline and show the raw result.
conversation = pipe(user_message)
print(conversation)

In [None]:
# Independent call: only this string is passed to the pipeline, so nothing
# from the earlier prompt or its output is available to the model here.
print(pipe("What else do you recommend?"))

# **2. English to French translation + summarization**

In [None]:
!pip install transformers
!pip install torch

In [None]:
from transformers import pipeline
import torch

# Translation pipeline using the "facebook/nllb-200-distilled-600M"
# checkpoint, with bfloat16 requested as the torch dtype.
translator = pipeline(
    task="translation",
    model="facebook/nllb-200-distilled-600M",
    torch_dtype=torch.bfloat16,
)

In [None]:
# Text to translate. Written as adjacent literals so the exact line breaks
# are visible: the first two sentences share a line, as do the last two.
text = (
    "My puppy is adorable, Your kitten is cute.\n"
    "Her panda is friendly.\n"
    "His llama is thoughtful. We all have nice pets!"
)

In [None]:
# Translate `text`; the src/tgt codes select English -> French
# (both in Latin script).
text_translated = translator(
    text,
    src_lang="eng_Latn",
    tgt_lang="fra_Latn",
)
print(text_translated)

Free up some memory:

In [None]:
import gc
# Unbind the notebook's `translator` name so the pipeline object can be
# reclaimed, then trigger a garbage-collection pass.
del translator
gc.collect()

In [None]:
# Summarization pipeline using the "facebook/bart-large-cnn" checkpoint,
# with bfloat16 requested as the torch dtype. Relies on `pipeline` and
# `torch` imported in an earlier cell.
summarizer = pipeline(
    task="summarization",
    model="facebook/bart-large-cnn",
    torch_dtype=torch.bfloat16,
)

In [None]:
# Passage to summarize. The indentation of the continuation lines sits inside
# the triple-quoted literal, so it is part of the text handed to the model.
# The region name is spelled with a real "Î" (it was previously mis-decoded
# as the two-character sequence "ÃŽ").
text = """Paris is the capital and most populous city of France, with
          an estimated population of 2,175,601 residents as of 2018,
          in an area of more than 105 square kilometres (41 square
          miles). The City of Paris is the centre and seat of
          government of the region and province of Île-de-France, or
          Paris Region, which has an estimated population of
          12,174,880, or about 18 percent of the population of France
          as of 2017."""

# min_length / max_length are passed through as bounds on the generated
# summary.
summary = summarizer(text,
                     min_length=10,
                     max_length=100)

print(summary)

# **3. Sentence embeddings**

In [None]:
!pip install sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model loaded by name; reused by the encode() calls in
# the following cells.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
# First group of sentences to embed.
sentences1 = [
    "The cat sits outside",
    "A man is playing guitar",
    "The movies are awesome",
]

# convert_to_tensor=True asks encode() for a tensor instead of its default
# output type.
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
print(embeddings1)

In [None]:
# Second group of sentences, compared against `sentences1` later on.
sentences2 = [
    "The dog plays in the garden",
    "A woman watches TV",
    "The new movie is so great",
]

# Encoded the same way as the first group so the two can be compared.
embeddings2 = model.encode(sentences2, convert_to_tensor=True)
print(embeddings2)

In [None]:
from sentence_transformers import util

# Pairwise cosine similarities: row i holds the scores of sentences1[i]
# against every sentence in sentences2.
cosine_scores = util.cos_sim(embeddings1,embeddings2)
print(cosine_scores)

In [None]:
# For each sentence in `sentences1`, print its closest match in `sentences2`
# together with the similarity score.
#
# A single row-wise max over the score matrix yields both the best score and
# the column (i.e. `sentences2` index) it came from, instead of rebuilding a
# Python list from every row several times and scanning it with max()/index().
best_scores, best_indices = cosine_scores.max(dim=1)

# .tolist() converts the tensors to plain Python floats/ints so they can be
# used directly for list indexing and number formatting.
for sentence, best_score, best_index in zip(sentences1,
                                            best_scores.tolist(),
                                            best_indices.tolist()):
    print("{} \t\t {} \t\t Score: {:.4f}".format(sentence,
                                                 sentences2[best_index],
                                                 best_score))

# **4. Zero-shot Audio classification**

In [None]:
!pip install transformers
!pip install datasets
!pip install soundfile
!pip install librosa

In [None]:
!pip install -U datasets[audio]

In [None]:
from datasets import load_dataset, load_from_disk

# ESC-50: a collection of 5-second recordings of different sounds.
# Local alternative, if the dataset has already been saved to disk:
# dataset = load_from_disk("./models/ashraq/esc50/train")
# Only the first 10 rows of the train split are requested.
dataset = load_dataset("ashraq/esc50", split="train[0:10]")

In [None]:
# Take the first row of the dataset and display it.
audio_sample = dataset[0]
audio_sample

In [None]:
# Imported under an alias so it does not clash with `datasets.Audio`,
# which is imported later in this notebook.
from IPython.display import Audio as IPythonAudio
# Inline player for the sample, using the sample's own sampling rate.
IPythonAudio(audio_sample["audio"]["array"],
             rate=audio_sample["audio"]["sampling_rate"])

In [None]:
from transformers import pipeline

# Zero-shot audio classification pipeline using the
# "laion/clap-htsat-unfused" checkpoint.
zero_shot_classifier = pipeline(
    task="zero-shot-audio-classification",
    model="laion/clap-htsat-unfused")

In [None]:
# Sampling rate configured on the classifier's feature extractor.
zero_shot_classifier.feature_extractor.sampling_rate

In [None]:
# Sampling rate of the loaded sample, for comparison with the feature
# extractor's rate.
audio_sample["audio"]["sampling_rate"]

In [None]:
from datasets import Audio

# Re-cast the "audio" column at 48 kHz.
# NOTE(review): presumably 48 kHz is the rate reported by the classifier's
# feature extractor - confirm against that cell's output.
dataset = dataset.cast_column("audio", Audio(sampling_rate=48_000))

# Fetch the first row again from the re-cast dataset and display it.
audio_sample = dataset[0]
audio_sample

In [None]:
# Two free-text labels for the classifier to choose between.
candidate_labels = [
    "Sound of a dog",
    "Sound of vacuum cleaner",
]

# Score the sample's raw array against the candidate labels.
zero_shot_classifier(
    audio_sample["audio"]["array"],
    candidate_labels=candidate_labels,
)

In [None]:
# A different set of labels, applied to the same audio sample.
candidate_labels = [
    "Sound of a child crying",
    "Sound of vacuum cleaner",
    "Sound of a bird singing",
    "Sound of an airplane",
]

# Score the sample's raw array against the new candidate labels.
zero_shot_classifier(
    audio_sample["audio"]["array"],
    candidate_labels=candidate_labels,
)

The model always picks the most plausible label among the given options, even when none of them actually describes the audio.

# **5. Automatic speech recognition**

In [None]:
!pip install transformers
!pip install -U datasets
!pip install soundfile
!pip install librosa

In [None]:
from datasets import load_dataset

# Open the "train.clean.100" split of LibriSpeech in streaming mode, so
# examples are read on demand while iterating.
dataset = load_dataset(
    "librispeech_asr",
    split="train.clean.100",
    streaming=True,
)

In [None]:
# take(5) restricts the stream to its first five examples; list()
# materializes them for display.
dataset_head = dataset.take(5)
list(dataset_head)

In [None]:
# Third of the five examples (index 2); this list() call iterates the
# stream again.
list(dataset_head)[2]

In [None]:
# Pull just the first example from the streamed dataset and display it.
example = next(iter(dataset))
example

In [None]:
# Aliased import, as in the earlier section of this notebook.
from IPython.display import Audio as IPythonAudio

# Inline player for the example, using the example's own sampling rate.
IPythonAudio(example["audio"]["array"],
             rate=example["audio"]["sampling_rate"])

In [None]:
from transformers import pipeline

# Speech-recognition pipeline using the "distil-whisper/distil-small.en"
# checkpoint.
asr = pipeline(
    task="automatic-speech-recognition",
    model="distil-whisper/distil-small.en",
)

In [None]:
# Sampling rate configured on the ASR pipeline's feature extractor.
asr.feature_extractor.sampling_rate

In [None]:
# Sampling rate of the streamed example, for comparison with the feature
# extractor's rate.
example['audio']['sampling_rate']

In [None]:
# Only the raw sample array is passed; the example's sampling rate is not
# forwarded to the pipeline in this call.
asr(example["audio"]["array"])

In [None]:
# Text stored with the example - presumably the reference transcript, for
# comparison with the ASR output.
example["text"]