<a href="https://colab.research.google.com/github/enmwmak/Teaching/blob/main/EIE558/tut/Pretrained_Transformer_Examples.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Preparation

In [None]:
# Install the necessary library
!pip install transformers torch

In [None]:
# Check the installation
import torch
from transformers import pipeline

## Sentiment Analysis

In [None]:
# Step 1 - build the classifier.
# pipeline('sentiment-analysis') on its own would load a default model and
# tokenizer; the checkpoint is named explicitly here so it is clear which
# model produces the scores below.
classifier = pipeline(
    'sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
)

# Step 2 - the sentences to score.
text_to_analyze = [
    "This movie was absolutely fantastic! I loved every minute of it.",
    "The service was terrible and I'm very disappointed.",
    "It was an okay experience, nothing special."
]

# Step 3 - run the whole batch through the pipeline in a single call.
results = classifier(text_to_analyze)

# Step 4 - report every sentence next to its predicted label and score.
print("Sentiment Analysis Results:")
for sentence, prediction in zip(text_to_analyze, results):
    label, score = prediction['label'], prediction['score']
    print(f"Text: \"{sentence}\"")
    print(f"Label: {label}, Score: {score:.4f}\n")


In [None]:
# Masked language modelling with an explicitly loaded model and tokenizer
# (no pipeline): ask BERT to fill in the [MASK] slot of a sentence.
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Tokenizer and model come from the same pre-trained checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")

# Input sentence containing one masked position.
text_mlm = "The capital of France is [MASK]."
inputs = tokenizer(text_mlm, return_tensors="pt")

# Forward pass only; gradients are not needed for prediction.
with torch.no_grad():
    logits = model(**inputs).logits

# Column indices of every mask token in the input ids.
is_mask = inputs["input_ids"] == tokenizer.mask_token_id
masked_token_index = is_mask.nonzero(as_tuple=True)[1]

# Highest-scoring vocabulary id at the masked position, decoded back to text.
predicted_token_id = logits[0, masked_token_index].argmax(dim=-1)
predicted_token = tokenizer.decode(predicted_token_id)

print(f"Predicted masked token for \"{text_mlm}\": {predicted_token}")

## Text Generation

In [None]:
from transformers import pipeline, set_seed

# Sampling is stochastic: fix the random seed so that Restart & Run All
# reproduces the same continuations.
set_seed(42)

# 1. Initialize the text-generation pipeline
generator = pipeline("text-generation", model="distilgpt2")

# 2. Define a text prompt to start the generation
prompt = "Briefly outline the history of China."

# 3. Generate text from the prompt
#   max_new_tokens: The maximum number of new tokens to generate.
#   num_return_sequences: How many different sequences to generate.
#   do_sample: Sample from the output distribution. Requested explicitly
#       because several *different* sequences can only come from sampling
#       (or beam search), so the cell should not rely on implicit defaults.
#   pad_token_id: Set to the end-of-sequence id so generation does not have
#       to fall back to it implicitly.
generated_text = generator(
    prompt,
    max_new_tokens=50,
    num_return_sequences=3,
    do_sample=True,
    pad_token_id=generator.tokenizer.eos_token_id,
)

# 4. Print the generated text
for i, sequence in enumerate(generated_text):
    print(f"Generated Sequence {i+1}: {sequence['generated_text']}\n")

## Text-to-speech Synthesis

In [None]:
!pip install datasets==3.6.0

In [None]:
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
from IPython.display import Audio

# Step 1 - load the three pre-trained components: the text processor, the
# text-to-speech model, and the HiFi-GAN vocoder.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Step 2 - the sentence to synthesize.
text = "Hello, my friends. I am a Speech T5 model, and I can convert your text into speech."

# Step 3 - turn the text into model inputs (PyTorch tensors).
inputs = processor(text=text, return_tensors="pt")

# Step 4 - pick a speaker embedding.
# The x-vector conditions the model on a particular voice. Entry 7306 of the
# validation split is used, with a leading batch axis added.
# NOTE(review): the original comment described this entry as a male speaker;
# that is not verifiable from this notebook — confirm against the dataset.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
xvector = embeddings_dataset[7306]["xvector"]
speaker_embeddings = torch.tensor(xvector)[None]

# Step 5 - synthesize; passing the vocoder makes the call return a waveform.
speech = model.generate_speech(
    inputs["input_ids"],
    speaker_embeddings=speaker_embeddings,
    vocoder=vocoder,
)

# Step 6 - embed an audio player as the cell output (16 kHz playback rate).
Audio(speech.numpy(), rate=16000)

## Speech Feature Extraction

In [None]:
# Install necessary libraries if not already installed
!pip install transformers datasets torchaudio librosa

In [None]:
import torch
import torchaudio
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
from datasets import load_dataset
import matplotlib.pyplot as plt
import numpy as np

# Feature extraction only needs the encoder, so it is loaded with AutoModel
# rather than a task-specific class.
from transformers import AutoModel

# 1. Load a pre-trained model and feature extractor
# AutoModelForAudioClassification would add a classification head that this
# cell never uses; AutoModel exposes the encoder output directly.
model_name = "facebook/wav2vec2-base-960h"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Ensure the model is in evaluation mode
model.eval()

# 2. Load a sample audio dataset
# For demonstration, stream the LibriSpeech "clean" test split and take only
# its first utterance, so the full dataset is never downloaded.
dataset = load_dataset("librispeech_asr", "clean", split="test", streaming=True)
sample_audio = next(iter(dataset))
audio_array = sample_audio["audio"]["array"]
sampling_rate = sample_audio["audio"]["sampling_rate"]

# Resample if the audio rate differs from the rate the feature extractor expects.
if sampling_rate != feature_extractor.sampling_rate:
    resampler = torchaudio.transforms.Resample(
        orig_freq=sampling_rate,
        new_freq=feature_extractor.sampling_rate,
    )
    # Cast to float32 explicitly: the decoded array may be float64, while the
    # resampling transform caches a float32 kernel.
    audio_array = resampler(torch.tensor(audio_array, dtype=torch.float32)).numpy()
    sampling_rate = feature_extractor.sampling_rate

# 3. Extract features
# The feature extractor preprocesses the raw waveform into model inputs.
inputs = feature_extractor(audio_array, sampling_rate=sampling_rate, return_tensors="pt")

with torch.no_grad():
    # The output of the last encoder layer serves as the speech features.
    outputs = model(**inputs)
    speech_features = outputs.last_hidden_state

print(f"Shape of extracted speech features: {speech_features.shape}")

# 4. Visualize a portion of the extracted features (optional)
# Plot the first feature dimension of the first utterance over time.
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(speech_features[0, :, 0].numpy())
ax.set_title("Extracted Speech Features (First Dimension)")
ax.set_xlabel("Time Steps")
ax.set_ylabel("Feature Value")
ax.grid(True)
plt.show()

# You can now use 'speech_features' for downstream tasks like
# speaker recognition, emotion recognition, or other audio analysis.

In [None]:
# Plot the extracted features as a spectrogram-style image above the waveform.
import librosa.display

# wav2vec 2.0 emits one feature frame every 20 ms (its convolutional front end
# has a total stride of 320 samples at 16 kHz), so 20 ms - not 10 ms - is the
# hop between image columns. Requesting a time axis makes that hop take effect
# and puts both panels on a seconds scale.
FRAME_SHIFT_SECONDS = 0.02

# Features of the first utterance as a NumPy array (time steps x feature dims).
feature_map = speech_features[0].numpy()

plt.subplot(211)
# NOTE(review): y_axis='linear' labels the rows in Hz although they are
# feature dimensions, not frequencies - kept as in the original plot.
librosa.display.specshow(librosa.amplitude_to_db(feature_map).T,
                         sr=sampling_rate, x_axis='time', y_axis='linear',
                         hop_length=int(FRAME_SHIFT_SECONDS * sampling_rate))
plt.subplot(313)
librosa.display.waveshow(audio_array, sr=sampling_rate, offset=0)
plt.margins(x=0)
plt.show()

## Machine Translation

In [20]:
import torch
from transformers import pipeline

# Build a T5 text-to-text pipeline and translate one sentence.
# Device and precision follow the runtime: half precision on the first GPU when
# CUDA is available, otherwise full precision on the CPU (device=-1), so the
# cell also runs on a CPU-only runtime.
use_gpu = torch.cuda.is_available()

pl = pipeline(
    task="text2text-generation",
    model="google-t5/t5-base",
    dtype=torch.float16 if use_gpu else torch.float32,
    device=0 if use_gpu else -1,
)
# The task prefix in the prompt tells T5 which translation direction to perform.
pl("Translate English to French: The weather is nice today.")

Device set to use cuda:0


[{'generated_text': "Le temps est agréable aujourd'hui."}]

In [21]:
# Reuse the translation pipeline `pl` on a second sentence; the returned list
# is displayed as the cell output.
pl("Translate English to French: Electrical and Electronic Engineering.")

[{'generated_text': 'Génie électrique et électronique.'}]