In [6]:
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")
model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2")

# Define a padding token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Set the padding token to the end-of-sequence token

# Example articles
articles = [
    {
        "title": "Protests Erupt in the Capital",
        "description": "Citizens are rallying against new legislation."
    },
    {
        "title": "Hurricane Causes Widespread Damage",
        "description": "The hurricane has left thousands homeless."
    },
    {
        "title": "A New Vaccine Offers Hope",
        "description": "Scientists have developed a vaccine that is 90% effective."
    },
    {
        "title": "Earthquake Rocks the City",
        "description": "A major earthquake has struck, causing extensive damage."
    }
]

# Define categories
categories = [
    "Terrorism / protest / political unrest / riot",
    "Positive / Uplifting",
    "Natural Disasters",
    "Others"
]

# Function to generate embeddings
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state.mean(dim=1)  # Average pooling
    return embeddings

# Generate embeddings for categories
category_embeddings = [get_embeddings(category) for category in categories]

# Function to classify articles
def classify_article(article):
    full_text = f"{article['title']}: {article['description']}"
    article_embedding = get_embeddings(full_text)

    # Calculate cosine similarity
    similarities = []
    for category_embedding in category_embeddings:
        sim = torch.nn.functional.cosine_similarity(article_embedding, category_embedding)
        similarities.append(sim.item())
    
    # Get the category with the highest similarity
    max_index = similarities.index(max(similarities))
    return categories[max_index]

# Classify each article
for article in articles:
    category = classify_article(article)
    print(f"Title: {article['title']}\nCategory: {category}\n")


Loading checkpoint shards: 100%|██████████| 2/2 [00:23<00:00, 11.61s/it]
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


AttributeError: 'CausalLMOutputWithPast' object has no attribute 'last_hidden_state'

In [16]:
import torch
from transformers import DistilBertTokenizer, DistilBertModel

# Check for GPU availability and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the DistilBERT model and tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertModel.from_pretrained("distilbert-base-uncased").to(device)

# Define a padding token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Set the padding token to the end-of-sequence token

# Example articles
articles = [
    {
        "title": "Protests Erupt in the Capital",
        "description": "Citizens are rallying against new legislation."
    },
    {
        "title": "Hurricane Causes Widespread Damage",
        "description": "The hurricane has left thousands homeless."
    },
    {
        "title": "A New Vaccine Offers Hope",
        "description": "Scientists have developed a vaccine that is 90% effective."
    },
    {
        "title": "Earthquake Rocks the City",
        "description": "A major earthquake has struck, causing extensive damage."
    }
]

# Define categories
categories = [
    "Terrorism / protest / political unrest / riot",
    "Positive / Uplifting",
    "Natural Disasters"
]

# Function to generate embeddings
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)  # Move inputs to GPU
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)  # Average pooling over the last hidden state
    return embeddings

# Generate embeddings for categories
category_embeddings = [get_embeddings(category) for category in categories]

# Function to classify articles
def classify_article(article):
    full_text = f"{article['title']}: {article['description']}"
    article_embedding = get_embeddings(full_text)

    # Calculate cosine similarity
    similarities = []
    for category_embedding in category_embeddings:
        sim = torch.nn.functional.cosine_similarity(article_embedding, category_embedding)
        similarities.append(sim.item())
    
    # Get the category with the highest similarity
    max_index = similarities.index(max(similarities))
    max_sim = max(similarities)
    if max_sim>0.6:
        return categories[max_index],max_sim
    else :
        return "Others",max_sim

# Classify each article
for article in articles:
    category,max_sim = classify_article(article)
    print(f"Title: {article['title']}\nCategory: {category}\n Similarity: {max_sim}")


Title: Protests Erupt in the Capital
Category: Terrorism / protest / political unrest / riot
 Similarity: 0.657482385635376
Title: Hurricane Causes Widespread Damage
Category: Natural Disasters
 Similarity: 0.658108115196228
Title: A New Vaccine Offers Hope
Category: Others
 Similarity: 0.5682249665260315
Title: Earthquake Rocks the City
Category: Natural Disasters
 Similarity: 0.6053272485733032
