In [None]:
#install transformers
!pip install transformers -q

In [None]:
##Example 1: sentiment analysis

#import pipeline
from transformers import pipeline

# Build the sentiment pipeline with an explicit checkpoint and revision.
# Leaving them out makes the library fall back to a default and warn that
# this is not recommended; the values below are the ones that default
# resolved to, so the prediction is unchanged.
classifier = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    revision="af0f99b",
)

# The pipeline returns a list with one {'label', 'score'} dict per input text.
result = classifier("I am happy.")
print(result)




No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


[{'label': 'POSITIVE', 'score': 0.9998760223388672}]


In [None]:
##Example 2: text generation

#import pipeline (set_seed makes the sampled text repeatable)
from transformers import pipeline, set_seed

# The two returned continuations are sampled, so without a fixed seed every
# run of this cell prints different text.
set_seed(42)

#object (we can also specify a specific model here)
generator = pipeline("text-generation", model="distilgpt2")

result = generator(
    "a person needs to eat three times per day because",
    max_length=100,            # upper bound on prompt + generated tokens
    num_return_sequences=2,    # ask for two independent continuations
    # Pass the padding id explicitly; otherwise generate() falls back to the
    # end-of-sequence id on its own and prints a notice about it.
    pad_token_id=generator.tokenizer.eos_token_id,
)
print(result)



Downloading (…)lve/main/config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "a person needs to eat three times per day because they don't have enough calories to live and is therefore unable to meet the needs of the diet or lifestyle.”If you want to eat at least 5-8 meals per day, just go ahead and go and look at the amount of calories the person could get from eating a single meal. To give that a feel check you must go to a specialist diet check out www.eatingcharts.com for more information."}, {'generated_text': "a person needs to eat three times per day because it's really hard to get enough oxygen to survive when you're in the woods. So he's taking a full-time shift just as often on a day out and the food's all going to be the same, but he's taking a full-time shift for every day and in fact he's taking quite a bit of time out. He's taking a full-time shift for every day and the food's all going to be different,"}]


In [None]:
##Example 3: zero-shot text classification

#import pipeline
from transformers import pipeline

# Zero-shot classification scores a text against labels supplied at call
# time. The checkpoint and revision are named explicitly (they are the ones
# the library otherwise picks as its default) so the cell does not depend on
# that default and does not emit the "no model was supplied" warning.
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    revision="c626438",
)

# The result dict lists the candidate labels sorted by descending score.
result = classifier("London is a beautiful city", candidate_labels=["education","politics","tourism"])
print(result)

No model was supplied, defaulted to facebook/bart-large-mnli and revision c626438 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

{'sequence': 'London is a beautiful city', 'labels': ['tourism', 'education', 'politics'], 'scores': [0.8602137565612793, 0.08899707347154617, 0.05078917369246483]}


In [None]:
##Example 4: translation

#import pipeline
from transformers import pipeline

# English -> German translation. The checkpoint and revision are pinned to
# the ones the library otherwise selects by default, so the cell keeps
# producing the same translation and does not warn about a missing model.
translator = pipeline("translation_en_to_de", model="t5-base", revision="686f1db")

# Translate text: the pipeline returns one dict per input; take the text of the first
translated_text = translator("Hello, how are you?")[0]['translation_text']

# Print the translated text
print(translated_text)


No model was supplied, defaulted to t5-base and revision 686f1db (https://huggingface.co/t5-base).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Hallo, wie sind Sie?


In [None]:
## Tokenizers and Models
#
# Model
#   A transformer model is a machine learning model built on the transformer
#   architecture, which relies on self-attention. Such models serve many tasks
#   (text classification, named entity recognition, translation,
#   summarization, ...). The model is the part that makes predictions.
#
# Tokenizer
#   A tokenizer preprocesses text so it can be fed to a transformer model:
#   it splits text into tokens, maps tokens to their vocabulary IDs, builds
#   attention masks and, depending on the model, possibly more.
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# --- Variant 1: let the pipeline pick its default model -------------------
classifier = pipeline("sentiment-analysis")
print(classifier("I am sad."))

# --- Variant 2: load a chosen model and tokenizer and hand them over ------
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
print(classifier("I am sad."))

# Section banner (two lines)
print(
    '------------------------------------------',
    '---------------Understanding Tokens------------------',
    sep='\n',
)

sequence = "I am very excited to learn about Transformers"

# Calling the tokenizer directly returns the full model input
# (input IDs plus attention mask).
result = tokenizer(sequence)
print(result)

# The same work done step by step: text -> tokens -> IDs -> text.
tokens = tokenizer.tokenize(sequence)
print(tokens)

ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

decoded_string = tokenizer.decode(ids)
print(decoded_string)


No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'NEGATIVE', 'score': 0.9993500113487244}]
[{'label': 'NEGATIVE', 'score': 0.9993500113487244}]
------------------------------------------
---------------Understanding Tokens------------------
{'input_ids': [101, 1045, 2572, 2200, 7568, 2000, 4553, 2055, 19081, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['i', 'am', 'very', 'excited', 'to', 'learn', 'about', 'transformers']
[1045, 2572, 2200, 7568, 2000, 4553, 2055, 19081]
i am very excited to learn about transformers


In [None]:
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import torch
import os

# Set your local directory (everything this notebook writes goes below it)
local_dir = '/content/my_folder'

# Create the directory if it doesn't exist. exist_ok=True turns the call
# into a no-op when the directory is already there, so no separate
# existence check is needed.
os.makedirs(local_dir, exist_ok=True)

# Load the pretrained tokenizer and model that the later cells fine-tune.
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Parallel lists: answers[i] is the reference answer for questions[i].
questions = ["Where is the German embassy in Mumbai?", "Where is the city office in Berlin?", "Where is the foreigners office in Berlin?"]
answers = ["The German embassy is located at House, 9th Floor, Hoechst, 193, Backbay Reclamation, Nariman Point, Mumbai, Maharashtra 400021, India. Contact Number: +91 22 2283 2422. ", "In Berlin, the city office is located at Kurfuerstendamm 194, 10707 Berlin. Contact Number: 030 700159829", "In Berlin, the foreigners office is located at Stuttgarter Str. 54, 12059 Berlin. Contact Number: 030 61642707"]

# Prepare the data for the T5 model: one prompt per pair, with the answer
# placed in the `context:` slot of the prompt text.
prompts = [
    "question: %s  context: %s </s>" % (question, answer)
    for question, answer in zip(questions, answers)
]

# Tokenize all prompts in one call. Every row is padded/truncated to 512
# tokens, so the result is already a pair of (num_prompts, 512) tensors and
# no per-example concatenation is required.
encoded = tokenizer(
    prompts,
    truncation=True,
    padding='max_length',
    max_length=512,
    return_tensors="pt"
)

inputs = encoded['input_ids']
attention_masks = encoded['attention_mask']

# Each dataset item is the tuple (input_ids, attention_mask) of one prompt.
dataset = torch.utils.data.TensorDataset(inputs, attention_masks)


Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]



In [None]:
# Fine-tune the model so that each question maps to its answer, then save
# the model and tokenizer.
#
# The training pairs are built here from `questions` and `answers`:
#   * the encoder input is the question in the prompt layout used at
#     inference time (empty context), and
#   * the labels are the tokenized answer.
# Using the input ids themselves as labels would only teach the model to
# repeat its prompt.
train_prompts = ["question: %s  context: %s </s>" % (q, "") for q in questions]
train_inputs = tokenizer(
    train_prompts,
    truncation=True,
    padding='longest',
    max_length=512,
    return_tensors="pt",
)
train_targets = tokenizer(
    answers,
    truncation=True,
    padding='longest',
    max_length=512,
    return_tensors="pt",
)

# Label positions that are only padding are set to -100 so the loss skips
# them instead of rewarding the model for predicting pad tokens.
labels = train_targets['input_ids'].clone()
labels[labels == tokenizer.pad_token_id] = -100

train_dataset = torch.utils.data.TensorDataset(
    train_inputs['input_ids'],
    train_inputs['attention_mask'],
    labels,
)


def collate_examples(batch):
    """Stack (input_ids, attention_mask, labels) tuples into one batch dict.

    `batch` is a list of items drawn from `train_dataset`; the returned keys
    are the keyword arguments passed to the model's forward call.
    """
    input_ids, attention_mask, target_ids = zip(*batch)
    return {
        'input_ids': torch.stack(input_ids),
        'attention_mask': torch.stack(attention_mask),
        'labels': torch.stack(target_ids),
    }


training_args = TrainingArguments(
    output_dir=os.path.join(local_dir, 'results'),
    # NOTE(review): 3 epochs over 3 examples at batch size 1 is only 9
    # optimizer steps; raise this if the model should actually memorise
    # the answers.
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    # No warm-up: a 500-step warm-up is longer than the whole run, which
    # would keep the learning rate near zero for every step.
    warmup_steps=0,
    weight_decay=0.01,
    logging_dir=os.path.join(local_dir, 'logs'),
    # Log every step so the training-loss table is populated for a run
    # this short.
    logging_steps=1,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_examples,
    train_dataset=train_dataset,
)

trainer.train()

# Save the model (and the tokenizer next to it, so the folder can be loaded on its own)
model.save_pretrained(os.path.join(local_dir, 'my_qa_model'))
tokenizer.save_pretrained(os.path.join(local_dir, 'my_qa_model'))




Step,Training Loss


('/content/my_folder/my_qa_model/tokenizer_config.json',
 '/content/my_folder/my_qa_model/special_tokens_map.json',
 '/content/my_folder/my_qa_model/spiece.model',
 '/content/my_folder/my_qa_model/added_tokens.json')

In [None]:
question = "Where is the city office in Berlin?"

# Prepare the question for the model.
# The result is not stored under the name `input`, which would shadow the
# Python builtin for the rest of the session. A single prompt needs no
# padding. The tensors are moved to the model's device, because the model
# may no longer be on the CPU after training.
encoded_question = tokenizer(
    "question: %s  context: %s </s>" % (question, ""),  # No context is provided since we want the model to generate it
    truncation=True,
    max_length=512,
    return_tensors="pt"
).to(model.device)

# Generate an answer. The attention mask is passed along with the ids so the
# model knows which positions hold real tokens.
output = model.generate(
    encoded_question['input_ids'],
    attention_mask=encoded_question['attention_mask'],
    max_new_tokens=1000,  # Adjust the number of tokens as needed
)

# Drop special tokens such as <pad> and </s> from the printed answer.
print(tokenizer.decode(output[0], skip_special_tokens=True))


<pad> Berlin</s>
