In [None]:
# Import libraries for exploratory data analysis (EDA)
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

# Import libraries for word cloud
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from wordcloud import WordCloud

# Import UI libraries
from IPython.display import display, clear_output
import ipywidgets as widgets

# Import libraries for model training
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import ConfusionMatrixDisplay

In [None]:
# Download the NLTK resources this notebook uses:
#   - 'stopwords' backs stopwords.words('english')
#   - 'punkt' / 'punkt_tab' back word_tokenize(); recent NLTK releases look up
#     'punkt_tab' instead of 'punkt', so both are fetched to work with either
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('punkt')

In [None]:
# Location of the email dataset CSV
EMAIL_DATA_URL = 'https://raw.githubusercontent.com/george-catobus/expert-engine/codespace-expert-engine-qprj46q4qpw3r5/notebooks/email_data.csv'

# Read the CSV into a DataFrame, using its first column as the row index
email_df = pd.read_csv(EMAIL_DATA_URL, index_col=0)

In [None]:
# Preview the first five rows of the dataset as loaded (before any cleaning)
email_df.head()

In [None]:
# Show the dataset dimensions as a (rows, columns) tuple
email_df.shape

In [None]:
# Print a per-column summary of the dataset: column names, non-null counts and dtypes
email_df.info()

In [None]:
# Remove exact duplicate rows. drop_duplicates() returns a new DataFrame and
# leaves the original untouched, so the result must be assigned back.
email_df = email_df.drop_duplicates()

# Count the number of NULL values in every column
email_df.isnull().sum()

In [None]:
# Columns with limited data and/or no use for the rest of the notebook
unused_columns = ['Sender', 'Receiver', 'URLS', 'Date']

# Drop those columns, then discard every row that still contains a null value
email_df = email_df.drop(columns=unused_columns).dropna(axis=0, how='any')

# Display statistical details for every remaining column
email_df.describe(include="all")

In [None]:
# Tally how many emails carry each label
email_type_counts = email_df['Label'].value_counts()

# One colour per bar, in the same order as the counted labels
bar_colors = ['#ff9999', '#66b3ff', '#99ff99']

# Build the bar chart of the label counts
bar_fig, bar_ax = plt.subplots()
bar_ax.bar(email_type_counts.index, email_type_counts.values, color=bar_colors)
bar_ax.set_title('Total distribution of Legitimate, Spam, and Phishing Emails')
bar_ax.set_xlabel('Email Type')
bar_ax.set_ylabel('Count')

In [None]:
# Build the pie chart showing each label's share of the dataset
pie_fig, pie_ax = plt.subplots()
pie_ax.set_title('% Distribution of Legitimate, Spam, and Phishing Emails')

# Slice appearance: label text, one-decimal percentages, start angle and colours
pie_options = {
    'labels': email_type_counts.index,
    'autopct': '%1.1f%%',
    'startangle': 140,
    'colors': ['#ff9999', '#66b3ff', '#99ff99'],
}
pie_ax.pie(email_type_counts, **pie_options)

# Equal axis scaling
pie_ax.axis('equal')

In [None]:
# Characters to strip from every message: all ASCII punctuation plus CR and LF
removal_string = ''.join((string.punctuation, '\r', '\n'))

# Deletion table built once at module level so it is not re-created per call;
# every character in removal_string maps to None (i.e. is removed)
character_mapping = str.maketrans(dict.fromkeys(removal_string))


def clean_data(text):
    """Return `text` with punctuation, carriage returns and newlines removed."""
    stripped = text.translate(character_mapping)
    return stripped

# Strip punctuation and line breaks from every message with the vectorized
# string accessor, after coercing the column to str
message_text = email_df['Message'].astype(str)
email_df['Message'] = message_text.str.translate(character_mapping)

In [None]:
# Build the English stopword set and the Snowball stemmer a single time, up front
stopwords_set = set(stopwords.words('english'))
ss = SnowballStemmer("english")


def clean_stopwords(text):
    """Lowercase `text`, drop English stopwords and stem the remaining words.

    The text is split on whitespace; the surviving stemmed words are returned
    joined by single spaces.
    """
    stemmed_words = []
    for token in text.lower().split():
        if token in stopwords_set:
            continue
        stemmed_words.append(ss.stem(token))
    return ' '.join(stemmed_words)


# Run the stopword removal and stemming over every message
message_text = email_df['Message'].astype(str)
email_df['Message'] = message_text.apply(clean_stopwords)

In [None]:
# Keep only the rows labelled as legitimate emails
legit_filter_data = email_df[email_df['Label'] == 'Legitimate']

# Join all legitimate messages into one string for the word cloud generator
legit_data = " ".join(legit_filter_data['Message'])

# Build the word cloud (at most 100 words, collocations disabled)
legit_wc = WordCloud(
    background_color='black',
    max_words=100,
    width=800,
    height=400,
    collocations=False,
).generate(legit_data)

# Draw the legitimate word cloud on its own figure, without axes
legit_fig, legit_ax = plt.subplots(figsize=(7, 7))
legit_ax.imshow(legit_wc, interpolation='bilinear')
legit_ax.set_title('WordCloud for Legitimate emails', fontsize=15)
legit_ax.axis('off')

In [None]:
# Keep only the rows labelled as spam emails
spam_filter_data = email_df[email_df['Label'] == 'Spam']

# Join all spam messages into one string for the word cloud generator
spam_data = " ".join(spam_filter_data['Message'])

# Build the word cloud (at most 100 words, collocations disabled)
spam_wc = WordCloud(
    background_color='black',
    max_words=100,
    width=800,
    height=400,
    collocations=False,
).generate(spam_data)

# Draw the spam word cloud on its own figure, without axes
spam_fig, spam_ax = plt.subplots(figsize=(7, 7))
spam_ax.imshow(spam_wc, interpolation='bilinear')
spam_ax.set_title('WordCloud for Spam emails', fontsize=15)
spam_ax.axis('off')

In [None]:
# Keep only the rows labelled as phishing emails
phish_filter_data = email_df[email_df['Label'] == 'Phishing']

# Join all phishing messages into one string for the word cloud generator
phish_data = " ".join(phish_filter_data['Message'])

# Build the word cloud (at most 100 words, collocations disabled)
phish_wc = WordCloud(
    background_color='black',
    max_words=100,
    width=800,
    height=400,
    collocations=False,
).generate(phish_data)

# Draw the phishing word cloud on its own figure, without axes
phish_fig, phish_ax = plt.subplots(figsize=(7, 7))
phish_ax.imshow(phish_wc, interpolation='bilinear')
phish_ax.set_title('WordCloud for Phishing emails', fontsize=15)
phish_ax.axis('off')

In [None]:
# Split messages and labels into training and testing sets (25% held out for
# testing; the fixed random_state makes the split repeatable)
x_train, x_test, y_train, y_test = train_test_split(
    email_df['Message'],
    email_df['Label'],
    test_size=0.25,
    random_state=68,
)

# Naive Bayes offers three types of classifiers: Bernoulli, Multinomial, and Gaussian.
# This implementation uses Multinomial Naive Bayes because the data is in discrete form.
pipeline_steps = [
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB()),
]
clf = Pipeline(pipeline_steps)

# Fit the vectorizer and the classifier on the training split
clf.fit(x_train, y_train)

In [None]:
# Print the shape of each train/test split, one per line
for split_label, split_data in (
    ('x_train:', x_train),
    ('x_test:', x_test),
    ('y_train:', y_train),
    ('y_test:', y_test),
):
    print(split_label, split_data.shape)

In [None]:
# Run the trained pipeline over the test messages
y_pred = clf.predict(x_test)

# Precision, recall and F1 are computed per class and then averaged with
# 'weighted' averaging
averaging = 'weighted'

# Compute all four metrics up front
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average=averaging)
recall = recall_score(y_test, y_pred, average=averaging)
f1 = f1_score(y_test, y_pred, average=averaging)

# Report each metric as a percentage with two decimals
print(f'Accuracy: {accuracy:.2%}')
print(f'Precision: {precision:.2%}')
print(f'Recall: {recall:.2%}')
print(f'F1 Score: {f1:.2%}')

In [None]:
# Plot the confusion matrix of true test labels against predicted labels
ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=y_pred)

# Render the figure
plt.show()

In [None]:
# Define a function to tokenize and count words with error handling
def count_words(text):
    """Return the number of NLTK word tokens in `text`.

    A LookupError is re-raised instead of being swallowed: NLTK raises it when
    a tokenizer resource has not been downloaded, which is a setup problem that
    would otherwise print one error per row and silently yield a count of 0
    for every message. Any other tokenization failure is reported and counted
    as 0 words so a single bad row does not abort the whole column.
    """
    try:
        return len(word_tokenize(text))
    except LookupError:
        # Missing NLTK data affects every row; fail loudly so it gets fixed
        raise
    except Exception as e:
        print(f"Error tokenizing text: {e}")
        return 0

# Store the per-message word count in a new column
email_df['num_words'] = email_df['Message'].apply(count_words)

# Apply the white-grid style BEFORE the figure is created: the style only
# affects axes created after it is set, so setting it after plotting (as was
# done previously) had no effect on this chart
sns.set(style='whitegrid')

# Create a figure and set the figure size
plt.figure(figsize=(10, 6))

# Overlay one word-count histogram (with KDE curve) per email type:
# legitimate in green, spam in blue, phishing in red
for email_type, hist_color in (
    ('Legitimate', 'green'),
    ('Spam', 'blue'),
    ('Phishing', 'red'),
):
    type_word_counts = email_df.loc[email_df['Label'] == email_type, 'num_words']
    sns.histplot(type_word_counts, color=hist_color, label=email_type, kde=True)

# Add labels and a title
plt.xlabel('Number of Words', fontsize=14)
plt.ylabel('Frequency', fontsize=14)
plt.title('Distribution of Number of Words by Email Type', fontsize=16, fontweight='bold')

# Add a legend
plt.legend()

# Show the plot
plt.show()

In [None]:
################ Start: Create UI Widgets ################

# Create label widgets
email_label = widgets.Label(value='Compose Email:')
display_label = widgets.Label(value='\nPredictions:')

# Create textarea widget for email input.
# The placeholder is double-quoted so the apostrophe in "it's" is kept; the
# previous 'it''s' was two adjacent string literals, which Python concatenates
# into "its".
email_input = widgets.Textarea(
    value='',
    placeholder="Compose a new email to predict if it's legitimate, spam, or phishing...",
    disabled=False,
    layout=widgets.Layout(width='75%', height='100px')
)

# Create a read-only (disabled) textarea that shows the list of emails and their predictions
email_display = widgets.Textarea(
    value='',
    placeholder='Email predictions will appear here...',
    disabled=True,
    layout=widgets.Layout(width='75%', height='300px')
)

################ End: Create UI Widgets ################

In [None]:
################ Start: Define function handlers ################

# Running list of prediction messages (one sentence per submitted email);
# send_email appends to it and clear_list empties it
email_list = []

# Output widget that display_graphic and display_accuracy render into
output = widgets.Output()

# Define the send email function
def send_email(b):
    """Classify the composed email and append the result to the predictions box.

    b is the clicked button passed in by on_click; it is not used.

    The text is run through the same cleaning steps that were applied to the
    training messages (punctuation/newline removal via character_mapping, then
    clean_stopwords) before prediction, so the model sees input in the form it
    was trained on. The message shown to the user still quotes the original,
    uncleaned text. An empty input produces no prediction.
    """
    # Get the composed email
    email = email_input.value

    # Determine if a value exists
    if email:
        # Mirror the training-time preprocessing
        prepared_email = clean_stopwords(email.translate(character_mapping))

        # Predict if the email is a legitimate, spam, or phishing email
        results = clf.predict([prepared_email])

        # Add the email (as typed) and its predicted label to the list
        email_list.append('The email, "' + email + '" is ' + results[0] + '.')

        # Display the results
        email_display.value = '\n'.join(email_list)

    # Clear the sent email text
    email_input.value = ''

    # Clear the output
    clear_output(wait=True)

# Define the clear list function
def clear_list(b):
    """Empty the stored predictions and blank the predictions box.

    b is the clicked button passed in by on_click; it is not used.
    """
    # Blank the predictions text area
    email_display.value = ''

    # Empty the shared list in place so existing references stay valid
    del email_list[:]

    # Clear the output
    clear_output(wait=True)

# Define the display graphic function
def display_graphic(b, figure):
    """Replace the contents of the shared output widget with `figure`.

    b is the clicked button passed in by on_click; it is not used.
    """
    target = output

    # Drop whatever was shown before, then render the figure inside the widget
    target.clear_output()
    with target:
        display(figure)

# Define the display accuracy function
def display_accuracy(b):
    """Show the model's score on the test split in the shared output widget.

    b is the clicked button passed in by on_click; it is not used.
    The score is recomputed from clf, x_test and y_test on every click.
    """
    # Get the accuracy score
    percentage = clf.score(x_test, y_test)
    score = f'Accuracy: {percentage:.2%}'

    # Replace the previous output with the accuracy text
    output.clear_output()
    with output:
        # print() instead of display(): display() on a str renders its repr,
        # which wraps the text in quotes
        print(score)
    
################ End: Define function handlers ################

In [None]:
################ Start: Create UI Buttons ################

# Each button is created and immediately wired to its click handler.

# Send the composed email and predict its type
send_email_button = widgets.Button(
    tooltip='Send email and predict',
    button_style='success',
    description='Send Email',
)
send_email_button.on_click(send_email)

# Clear the list of predictions
clear_list_button = widgets.Button(
    tooltip='Clear the list of predictions',
    button_style='danger',
    description='Clear Predictions',
)
clear_list_button.on_click(clear_list)

# Show the bar chart of email type counts
display_counts_button = widgets.Button(
    tooltip='Distribution of email types by Count',
    button_style='info',
    description='Distribution #',
)
display_counts_button.on_click(lambda clicked: display_graphic(clicked, bar_fig))

# Show the pie chart of email type percentages
display_distribution_button = widgets.Button(
    tooltip='Distribution of email types by Percent',
    button_style='info',
    description='Distribution %',
)
display_distribution_button.on_click(lambda clicked: display_graphic(clicked, pie_fig))

# Show the word cloud for legitimate emails
display_legit_button = widgets.Button(
    tooltip='Word cloud for legitimate emails',
    description='Legit Word Cloud',
)
display_legit_button.on_click(lambda clicked: display_graphic(clicked, legit_fig))

# Show the word cloud for spam emails
display_spam_button = widgets.Button(
    tooltip='Word cloud for spam emails',
    description='Spam Word Cloud',
)
display_spam_button.on_click(lambda clicked: display_graphic(clicked, spam_fig))

# Show the word cloud for phishing emails
display_phish_button = widgets.Button(
    tooltip='Word cloud for phishing emails',
    description='Phishing Word Cloud',
)
display_phish_button.on_click(lambda clicked: display_graphic(clicked, phish_fig))

# Show the prediction accuracy on the test dataset
display_accuracy_button = widgets.Button(
    tooltip='Show the prediction accuracy for the test dataset',
    button_style='warning',
    description='Prediction Accuracy',
)
display_accuracy_button.on_click(display_accuracy)

################ End: Create UI Buttons ################

In [None]:
################ Start: UI ################

# Row of action buttons shown beneath the predictions box, left to right
action_row = [
    clear_list_button,
    display_counts_button,
    display_distribution_button,
    display_legit_button,
    display_spam_button,
    display_phish_button,
    display_accuracy_button,
]
action_buttons = widgets.HBox(action_row)

# Stack the whole interface top to bottom: compose area, send button,
# predictions area, action buttons, then the chart/accuracy output
ui_column = [
    email_label,
    email_input,
    send_email_button,
    display_label,
    email_display,
    action_buttons,
    output,
]
ui_elements = widgets.VBox(ui_column)

# Render the interface
display(ui_elements)

################ End: UI ################