In [None]:
from huggingface_hub import login

# Replace 'your_token' with the token you just generated
login(token="your_token")


# Import required libraries
import pandas as pd
from transformers import pipeline

# Load the email dataset
emails_df = pd.read_csv('data/email_categories_data.csv')

# Display the first few rows of our dataset
print("Preview of our email dataset:")
print(emails_df.head(2))

# Define the model name (Llama model from Hugging Face)
model_name = "meta-llama/Llama-3.2-1B-Instruct"

# Initialize the Hugging Face pipeline for text classification using Llama
classifier = pipeline("zero-shot-classification", model=model_name)

# Create the system prompt with examples (you can customize this part based on your task)
prompt = """ You classify emails into Priority, Updates, or Promotions.

Example 1:
Urgent: Password Reset Required
Your account security requires immediate attention. Please reset your password within 24 hours.
Response: Priority

Example 2:
Special Offer - 50% Off Everything!
Don't miss our biggest sale of the year. Everything must go!
Response: Promotions

Example 3:
Canceled Event - Team Meeting
This event has been canceled and removed from your calendar.
Response: Updates

Example 4:
"""

# Function to process messages and return classifications
def process_message(classifier, message, prompt):
    """Process a message and return the response"""
    input_prompt = f"{prompt} {message}"
    
    # Perform classification using the classifier
    result = classifier(input_prompt, candidate_labels=["Priority", "Updates", "Promotions"])
    
    # Extract the label with the highest score
    return result['labels'][0]  # This gives the highest scoring label

# Let's test our classifier on two emails from our dataset
test_emails = emails_df.head(2)

# Process each test email and store results
results = []
for idx, row in test_emails.iterrows():
    email_content = row['email_content']
    expected_category = row['expected_category']
    
    # Get model's classification
    result = process_message(classifier, email_content, prompt)
    
    # Store results
    results.append({
        'email_content': email_content,
        'expected_category': expected_category,
        'model_output': result
    })

# Create a DataFrame with results
results_df = pd.DataFrame(results)

# Display results
print("\nClassification Results:")
for index, result in results_df.iterrows():
    print(f"Email {index + 1}:")
    print(f" - Review: {result['email_content']}")
    print(f" - Expected Category: {result['expected_category']}")
    print(f" - Predicted Category: {result['model_output']}\n")


Preview of our email dataset:
   email_id                                      email_content  \
0         1  Urgent: Server Maintenance Required\nOur main ...   
1         2  50% Off Spring Collection!\nDon't miss our big...   

  expected_category  
0          Priority  
1        Promotions  


config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Device set to use cpu
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.
Tokenizer was not supporting padding necessary for zero-shot, attempting to use  `pad_token=eos_token`



Classification Results:
Email 1:
 - Review: Urgent: Server Maintenance Required\nOur main server needs immediate maintenance due to critical errors. Please address ASAP.
 - Expected Category: Priority
 - Predicted Category: Updates

Email 2:
 - Review: 50% Off Spring Collection!\nDon't miss our biggest sale of the season! All spring items half off. Limited time offer.
 - Expected Category: Promotions
 - Predicted Category: Promotions



In [6]:
x=10