In [3]:
# Step 1: Import Libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 2: Download NLTK Resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Step 3: Initialize Tools
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Step 4: Define Preprocessing Functions
def clean_text(text):
    """Clean and normalize input text."""
    text = str(text).lower()                           # Lowercase
    text = re.sub(r'<.*?>', '', text)                   # Remove HTML
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-z\s]', '', text)                # Remove non-letter characters
    text = re.sub(r'\s+', ' ', text).strip()             # Remove extra spaces
    return text

def tokenize_text(text):
    """Tokenize and remove stopwords and lemmatize."""
    words = word_tokenize(text)
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return words

def preprocess_email(text):
    """Complete preprocessing pipeline."""
    text = clean_text(text)
    tokens = tokenize_text(text)
    return ' '.join(tokens)

# Step 5: Main Execution
def main():
    # Load dataset
    input_file = "C:/Users/Dell/Desktop/PROJECTS/MINI-PROJECT/archive1/emails.csv"
    df = pd.read_csv(input_file)
    
    # Clean column names
    df.columns = df.columns.str.strip().str.lower()
    
    # Ensure 'email' and 'label' columns exist
    assert 'text' in df.columns, "Dataset must have 'email' column."
    if 'spam' not in df.columns:
        raise ValueError("Dataset must have 'spam' column (0 for ham, 1 for spam).")
    
    # Handle missing values
    df['email'] = df['text'].fillna('')
    
    # Preprocess emails
    print("Preprocessing emails...")
    df['processed_email'] = df['text'].apply(preprocess_email)
    
    # Remove empty processed emails
    df = df[df['processed_email'].str.strip() != '']
    
    # Feature Extraction
    # -- TF-IDF
    print("Applying TF-IDF Vectorizer...")
    tfidf_vectorizer = TfidfVectorizer(max_features=5000)
    X_tfidf = tfidf_vectorizer.fit_transform(df['processed_email']).toarray()

    
    # Labels
    y = df['spam']

    # Train-Test Split
    print("Splitting dataset...")
    X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42, stratify=y)

    # Logistic Regression Model
    print("Training Logistic Regression...")
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Predictions
    y_pred = model.predict(X_test)

    # Evaluation
    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy Score: {acc:.4f}")
    print("\nClassification Report:\n", classification_report(y_test, y_pred))
    print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

if __name__ == "__main__":
    main()
#preprocessing the data
#feature extraction
#these are done in the above code

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Preprocessing emails...
Applying TF-IDF Vectorizer...
Splitting dataset...
Training Logistic Regression...
Accuracy Score: 0.9808

Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       872
           1       0.98      0.93      0.96       274

    accuracy                           0.98      1146
   macro avg       0.98      0.96      0.97      1146
weighted avg       0.98      0.98      0.98      1146


Confusion Matrix:
 [[868   4]
 [ 18 256]]


In [None]:
# Step 1: Import Libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Blockchain-related imports
from web3 import Web3
from solcx import compile_standard, install_solc

# Step 2: Download NLTK Resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Step 3: Initialize Tools
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Step 4: Connect to Private Blockchain (Ganache / Private Ethereum Node)
w3 = Web3(Web3.HTTPProvider('HTTP://127.0.0.1:7545'))

if w3.is_connected():
    print("✅ Connected to Private Blockchain!")
else:
    raise Exception("❌ Connection to blockchain failed. Start your private Ethereum node.")

# Step 5: Define Preprocessing Functions
def clean_text(text):
    """Clean and normalize input text."""
    text = str(text).lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def tokenize_text(text):
    """Tokenize and remove stopwords and lemmatize."""
    words = word_tokenize(text)
    return [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

def preprocess_email(text):
    """Complete preprocessing pipeline."""
    return ' '.join(tokenize_text(clean_text(text)))

# Step 6: Smart Contract Source
contract_source_code = '''
pragma solidity ^0.8.0;

contract EmailSpamDetection {
    mapping(string => uint256) public emailHashes;
    address public owner;
    
    constructor() {
        owner = msg.sender;
    }

    modifier onlyOwner() {
        require(msg.sender == owner, "Only the owner can write to the blockchain.");
        _;
    }

    function storeEmailHash(string memory emailHash, uint256 spamStatus) public onlyOwner {
        emailHashes[emailHash] = spamStatus;
    }

    function getEmailHashStatus(string memory emailHash) public view returns (uint256) {
        return emailHashes[emailHash];
    }
}
'''

# Step 7: Main Program
def main():
    # Load and preprocess dataset
    df = pd.read_csv("C:/Users/Dell/Desktop/PROJECTS/MINI-PROJECT/archive1/emails.csv")
    df.columns = df.columns.str.strip().str.lower()
    assert 'text' in df.columns, "Dataset must have 'text' column."
    assert 'spam' in df.columns, "Dataset must have 'spam' column."

    df['text'] = df['text'].fillna('')
    df['processed_email'] = df['text'].apply(preprocess_email)
    df = df[df['processed_email'].str.strip() != '']

    # Feature Extraction
    tfidf_vectorizer = TfidfVectorizer(max_features=5000)
    X = tfidf_vectorizer.fit_transform(df['processed_email']).toarray()
    y = df['spam']

    # Train-Test Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Train Logistic Regression
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    print(f"✅ Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print("\nClassification Report:\n", classification_report(y_test, y_pred))
    print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

    # Step 8: Blockchain - Deploy and Interact
    print("\n🚀 Deploying Smart Contract...")

    install_solc('0.8.0')  # Ensure correct Solidity version is installed

    compiled_sol = compile_standard({
        "language": "Solidity",
        "sources": {
            "EmailSpamDetection.sol": {
                "content": contract_source_code
            }
        },
        "settings": {
            "outputSelection": {
                "*": {
                    "*": ["abi", "evm.bytecode"]
                }
            }
        }
    }, solc_version='0.8.0')

    abi = compiled_sol['contracts']['EmailSpamDetection.sol']['EmailSpamDetection']['abi']
    bytecode = compiled_sol['contracts']['EmailSpamDetection.sol']['EmailSpamDetection']['evm']['bytecode']['object']

    # Deploy Contract
    account = w3.eth.accounts[0]
    EmailSpamContract = w3.eth.contract(abi=abi, bytecode=bytecode)
    tx_hash = EmailSpamContract.constructor().transact({'from': account})
    tx_receipt = w3.eth.wait_for_transaction_receipt(tx_hash)

    contract = w3.eth.contract(address=tx_receipt.contractAddress, abi=abi)

    print(f"✅ Smart Contract deployed at: {tx_receipt.contractAddress}")

    # Store email hash
    print("\n🔒 Storing email hashes on Blockchain...")
    for idx, row in df.iterrows():
        email_hash = w3.keccak(text=row['processed_email']).hex()
        spam_status = int(row['spam'])

        tx = contract.functions.storeEmailHash(email_hash, spam_status).transact({'from': account})
        w3.eth.wait_for_transaction_receipt(tx)

        print(f"Stored {email_hash} => {spam_status}")

    print("\n🎉 All data stored securely on Private Blockchain!")

if __name__ == "__main__":
    main()





[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


✅ Connected to Private Blockchain!
✅ Accuracy: 0.9808

Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       872
           1       0.98      0.93      0.96       274

    accuracy                           0.98      1146
   macro avg       0.98      0.96      0.97      1146
weighted avg       0.98      0.98      0.98      1146


Confusion Matrix:
 [[868   4]
 [ 18 256]]

🚀 Deploying Smart Contract...
✅ Smart Contract deployed at: 0xf720b582531b2A81c87CC3564B8F1C39A564Fb23

🔒 Storing email hashes on Blockchain...
Stored 1a9c1c7223a715e494c771f8304f90022eac313fda5f0a111cb5db5ca317b393 => 1
Stored 92f79794d0ccaa70f7e807153d82710619e15cc44f4d9a215631432525c7ef71 => 1
Stored 68a6e01be077fb7d37186ddb1ee8b8f341e0e7735fe50e8f6d454e8b0f27069f => 1
Stored 1c56c10f8c951ea5ee2ad56a97262e20b21eaf4bc4bb10c27976af88883cc443 => 1
Stored 903fd9d95d515c8301aba7d0b0f660140c8b2597a504719ac27592ef247ebdca => 1
Stored 0c089c473c178f