In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [12]:
# Read the spam corpus from disk and discard both "Unnamed" columns in a single drop
data = (
    pd.read_csv('../datasets/email_spam.csv')
    .drop(columns=["Unnamed: 0.1", "Unnamed: 0"])
)
data.head()

Unnamed: 0,Body,Label
0,Subject: stock promo mover : cwtd\r\n * * * ur...,1
1,Subject: are you listed in major search engine...,1
2,"Subject: important information thu , 30 jun 20...",1
3,Subject: = ? utf - 8 ? q ? bask your life with...,1
4,"Subject: "" bidstogo "" is places to go , things...",1


In [3]:
# Report the number of missing values in each column
print(data.isna().sum())

# Features are the raw email bodies, targets are the labels
X, y = data["Body"].values, data['Label'].values

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


Body     0
Label    0
dtype: int64


In [4]:
# Bag-of-words features: the vocabulary is learned from the training emails only,
# and the test emails are then encoded against that same vocabulary.
vectorizer = CountVectorizer()
X_train_vectorized, X_test_vectorized = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)


In [6]:
# Fit a random forest on the training features (all cores, fixed seed);
# fit() returns the estimator itself, so construction and training are chained.
random_forest = RandomForestClassifier(random_state=42, n_jobs=-1).fit(
    X_train_vectorized, y_train
)

# Label the held-out emails with the fitted forest
y_pred = random_forest.predict(X_test_vectorized)

In [11]:
# Sample email body to test
sample_email_body = ["Get a special discount on our exclusive products today!"]

# Encode the sample with the already-fitted vectorizer so its columns line up
# with the features the forest was trained on.
sample_email_vectorized = vectorizer.transform(sample_email_body)

# Predict the label for the single sample
prediction = random_forest.predict(sample_email_vectorized)

# Display the prediction.
# The Label column is numeric and 1 is the spam (positive) class, so compare
# against 1. Comparing against the string 'spam' can never match a numeric
# label and would always fall through to the "not spam" branch.
if prediction[0] == 1:
    print("The email is classified as spam.")
else:
    print("The email is not classified as spam.")


The email is not classified as spam.


In [10]:

# Score the held-out predictions; label 1 is treated as the positive class
# for the three class-specific metrics.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, pos_label=1)
    for scorer in (precision_score, recall_score, f1_score)
)

# Report every metric rounded to two decimals
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

Accuracy: 0.98
Precision: 0.98
Recall: 0.98
F1 Score: 0.98


In [13]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

# Out-of-fold predictions for every training row (5 folds, all cores)
y_cv_pred = cross_val_predict(
    estimator=random_forest,
    X=X_train_vectorized,
    y=y_train,
    cv=5,
    n_jobs=-1,
)

# Tabulate true labels (rows) against the out-of-fold predictions (columns)
conf_matrix = confusion_matrix(y_train, y_cv_pred)
print("Confusion Matrix:", conf_matrix, sep="\n")


Confusion Matrix:
[[3875  137]
 [  62 3926]]


In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.impute import SimpleImputer

# Load the dataset and discard the two "Unnamed" columns
data = pd.read_csv('../datasets/email_spam.csv').drop(["Unnamed: 0.1", "Unnamed: 0"], axis=1)

# Check for NaN values in the target variable (Label)
nan_target_values = data['Label'].isna().sum()
print(f"Number of NaN values in the target variable (Label): {nan_target_values}")

# Drop rows with a missing label. Reassigning instead of inplace=True keeps
# the step chainable and produces the same frame.
data = data.dropna(subset=['Label'])

# Split the dataset into training and testing data (80/20, fixed seed)
X = data["Body"].values
y = data['Label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a CountVectorizer to convert the text data into a bag-of-words representation.
# The vocabulary is fitted on the training emails only.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train the classifier
random_forest = RandomForestClassifier(n_jobs=-1, random_state=42)
random_forest.fit(X_train_vectorized, y_train)

# Predictions on the testing set
y_pred = random_forest.predict(X_test_vectorized)

# Sample email body to test
sample_email_body = ["Get a special discount on our exclusive products today!"]

# Vectorize the sample email body using the same CountVectorizer
sample_email_vectorized = vectorizer.transform(sample_email_body)

# Make a prediction using the trained Random Forest classifier
prediction = random_forest.predict(sample_email_vectorized)

# Display the prediction.
# Labels are numeric with 1 as the spam (positive) class, so compare against 1.
# The string 'spam' never equals a numeric label, which made this branch
# always report "not spam".
if prediction[0] == 1:
    print("The email is classified as spam.")
else:
    print("The email is not classified as spam.")

# Evaluate the model on the held-out set (the positive class defaults to label 1)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print evaluation metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


Number of NaN values in the target variable (Label): 0
The email is not classified as spam.
Accuracy: 0.98
Precision: 0.98
Recall: 0.98
F1 Score: 0.98


In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the dataset and discard the two "Unnamed" columns
data = pd.read_csv('../datasets/email_spam.csv').drop(["Unnamed: 0.1", "Unnamed: 0"], axis=1)

# Check for NaN values in the target variable (Label)
nan_target_values = data['Label'].isna().sum()
print(f"Number of NaN values in the target variable (Label): {nan_target_values}")

# Drop rows with a missing label. Reassigning instead of inplace=True keeps
# the step chainable and produces the same frame.
data = data.dropna(subset=['Label'])

# Split the dataset into training and testing data (80/20, fixed seed)
X = data["Body"].values
y = data['Label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a TF-IDF Vectorizer to convert the text data into a weighted representation:
# English stop words removed, unigrams and bigrams, capped at 5000 features.
# The vocabulary and IDF weights are fitted on the training emails only.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1, 2))
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train the classifier
random_forest = RandomForestClassifier(n_jobs=-1, random_state=42)
random_forest.fit(X_train_vectorized, y_train)

# Predictions on the testing set
y_pred = random_forest.predict(X_test_vectorized)

# Sample email body to test
sample_email_body = ["Get a special discount on our exclusive products today!"]

# Vectorize the sample email body using the same TF-IDF Vectorizer
sample_email_vectorized = vectorizer.transform(sample_email_body)

# Make a prediction using the trained Random Forest classifier
prediction = random_forest.predict(sample_email_vectorized)

# Display the prediction.
# Labels are numeric with 1 as the spam (positive) class, so compare against 1.
# The string 'spam' never equals a numeric label, which made this branch
# always report "not spam".
if prediction[0] == 1:
    print("The email is classified as spam.")
else:
    print("The email is not classified as spam.")

# Evaluate the model on the held-out set (the positive class defaults to label 1)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print evaluation metrics
print("Model Evaluation Metrics:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


Number of NaN values in the target variable (Label): 0
The email is not classified as spam.
Model Evaluation Metrics:
Accuracy: 0.98
Precision: 0.98
Recall: 0.99
F1 Score: 0.98
