In [6]:
# Step 1: Import Necessary Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

# Step 2: Load the Cleaned Data
df = pd.read_csv("C:/Users/Dell/Desktop/PROJECTS/Email-spam-Detection-Using-Blockchain/tokenized_cleaned_data.csv")  # Ensure this is the path to your cleaned data CSV

# Ensure required columns exist
assert 'text' in df.columns and 'spam' in df.columns, "Dataset must contain 'processed_email' and 'spam' columns."

# Step 3: Feature Extraction using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X = tfidf_vectorizer.fit_transform(df['text']).toarray()  # TF-IDF Features
y = df['spam']  # Target Labels (spam: 1, not spam: 0)

# Step 4: Train-Test Split (80% Train, 20% Test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Step 5: Train Logistic Regression Model
model = LogisticRegression(max_iter=1000)  # Logistic Regression
model.fit(X_train, y_train)  # Fit model to training data

# Step 6: Predict and Evaluate Model on Test Data
y_pred = model.predict(X_test)  # Predictions on the test set

# Step 7: Evaluation Metrics
print(f"✅ Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\n📋 Classification Report:\n", classification_report(y_test, y_pred))
print("\n🧮 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


✅ Accuracy: 0.9834

📋 Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       872
           1       0.99      0.94      0.96       274

    accuracy                           0.98      1146
   macro avg       0.99      0.97      0.98      1146
weighted avg       0.98      0.98      0.98      1146


🧮 Confusion Matrix:
 [[869   3]
 [ 16 258]]
