In [1]:
# Using CUDA - Accelerated data analysis for GPU
import cudf

# Loading the emails
df = cudf.read_csv('../data/emails.csv')

# Printing out the first three rows
df.head(3)

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...


In [2]:
# Getting a look at how many emails
print(f"The DataFrame has {df.shape[0]} emails.")  


The DataFrame has 517401 emails.


In [3]:
# Remove metadata (headers) and keep only the email body
def clean_email_body(message):
    # Split the message at the first blank line (using \n) to separate headers from the body
    body = cudf.Series(message).str.split('\n\n', expand=True)[1]
    return body.str.replace(r'[^a-zA-Z\s]', '', regex=True).str.lower()

# Apply the cleaning function
df['cleaned_message'] = clean_email_body(df['message'])

# Check the cleaned message column
df.head(3)

Unnamed: 0,file,message,cleaned_message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,here is our forecast
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,traveling to have a business meeting takes the...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...,test successful way to go


In [4]:
import time

# Categories with associated keywords (just data engineering, not ML but shows how quickly rapids works)
categories = {
    'important': ['deadline', 'asap', 'critical', 'urgent', 'quickly'],
    'illegal': ['money laundering', 'scam', 'tax fraud', 'payoff', 'shady'],
    'phishing': ['account verification', 'verify your identity', 'suspicious activity'],
    'spam': ['win', 'offer', 'free', 'click here'],
    'routine': ['meeting', 'catch up', 'hello', 'update'],
}

# Start timing
start_time = time.time()

# Create a new column to store the category
df['category'] = 'regular'  # Initialize all emails as regular

# Iterate over each category and its associated keywords
for category, keywords in categories.items():
    # Create a mask where any keyword matches in the cleaned_message
    mask = df['cleaned_message'].str.contains('|'.join(keywords), regex=True)
    
    # Assign the category to rows where the mask is True
    df['category'] = df['category'].where(~mask, other=category)

# End timing
end_time = time.time()

# Print the time taken for the operation
print(f"Time taken for categorization: {end_time - start_time} seconds")

# Newline Print
print('\n')

# Check the first few categorized emails
df.head(3)

Time taken for categorization: 2.8210206031799316 seconds




Unnamed: 0,file,message,cleaned_message,category
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,here is our forecast,regular
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,traveling to have a business meeting takes the...,routine
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...,test successful way to go,regular


In [5]:
df['category'].value_counts()


category
regular      448437
spam          33698
routine       31039
important      4161
illegal          62
phishing          4
Name: count, dtype: int64

In [6]:
# Write the DataFrame to the specified directory because I need to use the GPU for LLMs, ordinarily you'd have a separate server
df.to_csv('../data/emails_cleaned_v2.csv', index=False)
print('Exported')

Exported


In [1]:
# Reading in the clean emails, no GPU so I did not use cuDF
import pandas as pd

# Read the CSV into a pandas DataFrame
df = pd.read_csv('../data/emails_cleaned.csv')

# Check the first few rows
df.head(3)

Unnamed: 0,file,message,cleaned_message,category
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,here is our forecast,regular
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,traveling to have a business meeting takes the...,routine
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...,test successful way to go,regular


In [2]:
# Connecting to my Llama3.1 NIM 
from openai import OpenAI

# Set up the OpenAI client with the local NIM instance (remove IP)
client = OpenAI(
    base_url="http://172.29.202.76:8000/v1",  
    api_key="dummy_api_key"  
)

In [3]:
# Extracting knowledge using the LLM with a fallback for short emails
def extract_knowledge(email_text):
    system_prompt = "You are an expert in extracting key facts and insights from emails."

    # Define a threshold for what we consider "short"
    word_count = len(email_text.split())
    if word_count < 10:  # Threshold for short emails (originally was 5)
        return "The email content is too brief to extract meaningful knowledge."

    # Knowledge extraction prompt for regular emails
    user_prompt = f"""Extract the key pieces of information from the following email:
    
    Email content: "{email_text}"
    
    Identify any important facts, dates, names, instructions, or other relevant information. Summarize the key knowledge from this email."""

    # Create a chat completion request to the LLM
    completion = client.chat.completions.create(
        model="meta/llama-3.1-8b-instruct",  # Your model
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        temperature=0.2,
        top_p=0.7,
        max_tokens=250,
        stream=False  
    )

    # Extract the message content from the response
    knowledge_content = ""
    for choice in completion.choices:
        knowledge_content += choice.message.content
    
    return knowledge_content


In [4]:
import time

# Fill any missing or NaN values in the 'cleaned_message' column with a default string
df['cleaned_message'] = df['cleaned_message'].fillna('')

# Apply the function to the first 100 emails and time the process
start_time = time.time()

# Subset the first 100 emails and create a copy to avoid SettingWithCopyWarning
df_subset = df.head(100).copy()

# Apply the extract_knowledge function using .loc[] to avoid the warning
df_subset.loc[:, 'extracted_knowledge'] = df_subset['cleaned_message'].apply(extract_knowledge)

end_time = time.time()

# Print the time taken
print(f"Time taken for extracting knowledge from 100 emails: {end_time - start_time} seconds")

# Save the DataFrame to CSV
df_subset.to_csv('../data/emails_with_extracted_knowledge_100_30SEP24_v1.csv', index=False)

Time taken for extracting knowledge from 100 emails: 29.85118007659912 seconds


In [17]:
#Using NIM to determine how long this would take to extract knowledge from all emails:
print(f"Time per email: {time_per_email}")

time_per_email = (end_time - start_time) / 100
total_time_for_df = time_per_email * df.shape[0]

# Convert the total time for df into days and hours
completion = client.chat.completions.create(
    model="meta/llama-3.1-8b-instruct",
    messages=[
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": f"convert this number which is in seconds into days and hours: {total_time_for_df}"}
    ],
    temperature=0,
    top_p=0.7,
    max_tokens=250,
    stream=False  
)

# Print the result from the completion
result = completion.choices[0].message.content  # Access content property directly
print(result)

Time per email: 0.2985118007659912
To convert the given number of seconds into days and hours, we need to perform two conversions:

1. Convert seconds to hours
2. Convert hours to days

Here's the calculation:

**Step 1: Convert seconds to hours**

There are 3600 seconds in an hour. To convert seconds to hours, divide the number of seconds by 3600:

154450.30422812462 s ÷ 3600 = 42.9875 hours

**Step 2: Convert hours to days**

There are 24 hours in a day. To convert hours to days, divide the number of hours by 24:

42.9875 hours ÷ 24 = 1.7936 days

So, the equivalent of 154450.30422812462 seconds is approximately:

**1 day, 21 hours, and 55 minutes**

Let me know if you have any further questions!
