In [1]:
import torch
from transformers import BertTokenizer, BertModel
import numpy as np

In [2]:
def cosine_similarity(vec1, vec2):
    return torch.nn.functional.cosine_similarity(vec1.unsqueeze(0), vec2.unsqueeze(0))

In [3]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [4]:
sentence = 'is banana and apple the same?'
tokenized_sentence = tokenizer(sentence, return_tensors='pt')

In [5]:
tokenized_sentence

{'input_ids': tensor([[  101,  2003, 15212,  1998,  6207,  1996,  2168,  1029,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [6]:
tokenizer.convert_ids_to_tokens(101)

'[CLS]'

In [7]:
outputs = model(**tokenized_sentence)

In [8]:
len(sentence.split())

6

In [9]:
outputs.last_hidden_state.shape

torch.Size([1, 9, 768])

In [10]:
banana_embedding = outputs.last_hidden_state[:, 2, :]

In [11]:
print(banana_embedding.shape)

torch.Size([1, 768])


In [12]:
apple_embedding = outputs.last_hidden_state[:, 4, :]

In [13]:
same_embedding = outputs.last_hidden_state[:, 6, :]

In [17]:
similarity = cosine_similarity(apple_embedding, banana_embedding)

In [18]:
token_embds = outputs.last_hidden_state[0]

In [19]:
token_embds.shape

torch.Size([9, 768])

In [20]:
banana_idx = 2
apple_idx = 4
same_idx = 6

In [21]:
banana_emd = token_embds[banana_idx]
apple_emd = token_embds[apple_idx]
same_emd = token_embds[same_idx]

In [22]:
banana_emd.shape

torch.Size([768])

In [23]:
cosine_similarity(banana_emd, apple_emd).item()

0.7752277851104736

In [24]:
cosine_similarity(banana_emd, same_emd).item()

0.37262046337127686

In [43]:
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)  # Dot product
    norm_vec1 = np.linalg.norm(vec1)  # Magnitude of vec1
    norm_vec2 = np.linalg.norm(vec2)  # Magnitude of vec2
    return dot_product / (norm_vec1 * norm_vec2)

In [45]:
cosine_similarity(banana_emd.detach().numpy(), apple_emd.detach().numpy())

0.7752276

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load pre-trained BERT for sequence classification
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Input text
text = "I love this product! It's amazing."

# Tokenize input
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

# Get predictions
outputs = model(**inputs)
logits = outputs.logits
predicted_class = torch.argmax(logits, dim=1).item()

# Interpret the result
sentiment = "Positive" if predicted_class == 1 else "Negative"
print(f"Sentiment: {sentiment}")


In [46]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load tokenizer and pre-trained model fine-tuned for sentiment analysis
tokenizer = BertTokenizer.from_pretrained('textattack/bert-base-uncased-SST-2')
model = BertForSequenceClassification.from_pretrained('textattack/bert-base-uncased-SST-2')

# Input text
text = "I absolutely loved this movie! The story and acting were fantastic."

# Tokenize input
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

# Forward pass
outputs = model(**inputs)
logits = outputs.logits  # Logits for each class (positive, negative)

# Convert logits to probabilities
probs = torch.nn.functional.softmax(logits, dim=-1)
predicted_class = torch.argmax(probs).item()

# Interpret result
classes = ["Negative", "Positive"]
print(f"Sentiment: {classes[predicted_class]} (Confidence: {probs.max().item():.2f})")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Sentiment: Positive (Confidence: 1.00)


# Fine Tune

In [47]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Example data (replace with your dataset)
data = {
    "text": [
        "He scored 30 goals last season and is a top player.",
        "His performance has been poor with many missed opportunities.",
        "An incredible midfielder with great passing skills.",
        "Struggled to make an impact in every game.",
    ],
    "label": [1, 0, 1, 0],
}

# Create DataFrame
df = pd.DataFrame(data)

# Split into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["text"].tolist(), df["label"].tolist(), test_size=0.2
)


In [48]:
from transformers import BertTokenizer

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize and encode sequences
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)


In [49]:
import torch

class SoccerDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Create datasets
train_dataset = SoccerDataset(train_encodings, train_labels)
val_dataset = SoccerDataset(val_encodings, val_labels)


In [50]:
%%time

from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# Load pre-trained BERT with a classification head
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',          # Output directory
    num_train_epochs=3,              # Number of epochs
    per_device_train_batch_size=8,   # Batch size for training
    per_device_eval_batch_size=8,    # Batch size for evaluation
    warmup_steps=500,                # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # Strength of weight decay
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=10,
    evaluation_strategy="epoch",     # Evaluate at each epoch
    save_strategy="epoch",           # Save the model at each epoch
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Fine-tune the model
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.331852
2,No log,0.332179
3,No log,0.332846


CPU times: user 3.37 s, sys: 6.66 s, total: 10 s
Wall time: 22.8 s


TrainOutput(global_step=3, training_loss=1.0562281608581543, metrics={'train_runtime': 19.6844, 'train_samples_per_second': 0.457, 'train_steps_per_second': 0.152, 'total_flos': 64749986280.0, 'train_loss': 1.0562281608581543, 'epoch': 3.0})

In [53]:
!ls results

[1m[36mcheckpoint-1[m[m [1m[36mcheckpoint-2[m[m [1m[36mcheckpoint-3[m[m


In [54]:
from transformers import pipeline

# Load BART summarization pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Input text
text = """
The COVID-19 pandemic has led to widespread changes in everyday life, including the way people work and interact. 
Many industries have adopted remote work as the new normal, while others are struggling to adapt. Vaccination efforts 
are ramping up globally, but challenges remain in ensuring equitable distribution. The pandemic has also accelerated 
the adoption of digital technologies in various sectors.
"""

# Generate summary
summary = summarizer(text, max_length=50, min_length=25, do_sample=False)
print("Summary:")
print(summary[0]['summary_text'])


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Summary:
The COVID-19 pandemic has led to widespread changes in everyday life, including the way people work and interact. Many industries have adopted remote work as the new normal, while others are struggling to adapt.
