<h1>SMS Detection</h1>

In [41]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense
from tensorflow.keras.utils import to_categorical
from bs4 import BeautifulSoup
import ipaddress
from dateutil.parser import parse as date_parse
from sklearn import metrics 
import warnings
import requests
from urllib.parse import urlparse, unquote, urljoin
import re
import socket
from transformers import BertTokenizer, BertModel
import torch
import io

In [42]:

# Load the dataset
data = pd.read_csv('./SMS.csv')

# Map labels to numerical values
data['LABEL'] = data['LABEL'].map({'Smishing': 1, 'ham': 0})
data.dropna(subset=['LABEL'], inplace=True)
data.reset_index(drop=True, inplace=True)
# Prepare data for training
X = data['TEXT']
y = data['LABEL']
print(y.unique())

# Tokenize text
max_words = 10000  # Define the maximum number of words to keep
max_length = 200  # Define the sequence length
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)

# Pad sequences to fixed length
X_padded = pad_sequences(X_sequences, maxlen=max_length)

# Convert labels to categorical
y_categorical = to_categorical(y)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_categorical, test_size=0.2, random_state=42)

# Define the CNN model
embedding_dim = 100
filters = 128
kernel_size = 5

model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_length))
model.add(Conv1D(filters=filters, kernel_size=kernel_size, activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(2, activation='softmax'))  # Two classes: 'Smishing' and 'ham'

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


# Train the model
model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test))


[0. 1.]
Epoch 1/5


2025-04-06 18:53:09.999222: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x35de1b410>

In [43]:
import numpy as np
import pickle
#Save the trained model
model.save('./sms_model5.h5')

In [44]:
# Custom message for classification
# Load the saved model
from keras.models import load_model
loaded_model = load_model('./sms_model5.h5')
new_message = "Its been so long since we talked. I miss you so much. Can you call me today?"

# Tokenize and pad the new message
new_message_sequence = tokenizer.texts_to_sequences([new_message])
new_message_padded = pad_sequences(new_message_sequence, maxlen=max_length)

# Classify the new message
prediction = loaded_model.predict(new_message_padded)
print(prediction)
predicted_label = np.argmax(prediction)
print(predicted_label)
# Decode the predicted label
label_mapping = {0: 'ham', 1: 'Smishing'}
predicted_class = label_mapping[predicted_label]

print(f"The model classifies the message as: '{predicted_class}'")

[[9.9998188e-01 1.8173008e-05]]
0
The model classifies the message as: 'ham'


In [45]:
new_message = "No Credit Score? No income proof? No Problem. Use your FD & get your HDFC Bank Credit Card today: https://hdfcbk.io/HDFCBK/s/dpwgW2YL T&C"
new_message_sequence = tokenizer.texts_to_sequences([new_message])
new_message_padded = pad_sequences(new_message_sequence, maxlen=max_length)

# Classify the new message
prediction = loaded_model.predict(new_message_padded)
print(prediction)
predicted_label = np.argmax(prediction)
print(predicted_label)
# Decode the predicted label
label_mapping = {0: 'ham', 1: 'Smishing'}
predicted_class = label_mapping[predicted_label]

print(f"The model classifies the message as: '{predicted_class}'")

[[0.00297017 0.9970298 ]]
1
The model classifies the message as: 'Smishing'


<h1>Website content</h1>

In [1]:
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
import tldextract
from urllib.parse import urlparse

# Load zero-shot classifier (using Hugging Face Transformers)
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [2]:

def extract_website_content(url):
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, "html.parser")

        title = soup.title.string.strip() if soup.title else "No Title Found"

        # Extract visible text from body
        paragraphs = soup.find_all("p")
        body_text = ' '.join(p.get_text().strip() for p in paragraphs)
        body_text = body_text[:2000]  # Limit to first 2000 characters for performance

        return title, body_text, response.url
    except Exception as e:
        print(f"Failed to extract from {url}: {e}")
        return "", "", url


In [3]:

def is_title_related_to_body(title, body):
    if not title or not body:
        return False

    # Zero-shot classification to assess semantic relevance
    hypothesis_template = f"The webpage body content is related to the title: '{title}'"
    result = classifier(body, candidate_labels=["related", "not related"], hypothesis_template=hypothesis_template)

    # Return True if model thinks they are related
    return result["labels"][0] == "related" and result["scores"][0] > 0.7

def extract_features(url, title, body):
    parsed_url = urlparse(url)
    domain = parsed_url.netloc

    features = {
        "https": url.startswith("https"),
        "has_ip_address": any(char.isdigit() for char in domain),
        "suspicious_words": any(word in body.lower() for word in ["login", "verify", "click here", "account suspended"]),
        "short_title": len(title) < 5,
        "low_title_body_similarity": not is_title_related_to_body(title, body)
    }

    return features


In [4]:

def is_phishing_site(features):
    # Rule-based decision making (can be replaced by ML model)
    score = 0
    score += 1 if not features["https"] else 0
    score += 1 if features["has_ip_address"] else 0
    score += 1 if features["suspicious_words"] else 0
    score += 1 if features["short_title"] else 0
    score += 1 if features["low_title_body_similarity"] else 0

    return score >= 3  # Threshold can be tuned


In [5]:

def analyze_url(url):
    print(f"\n🔗 Analyzing: {url}")
    title, body, final_url = extract_website_content(url)
    features = extract_features(final_url, title, body)
    is_phishing = is_phishing_site(features)

    print(f"\n📝 Title: {title}")
    print(f"\n📄 Body Sample: {body[:300]}...\n")
    print("📊 Features Extracted:")
    for k, v in features.items():
        print(f"  - {k}: {v}")

    result = "🚨 Phishing Site Detected!" if is_phishing else "✅ Legitimate Website."
    print(f"\n🔍 Result: {result}")
    return is_phishing


In [6]:

# Example
url = "https://www.google.com/"
analyze_url(url)



🔗 Analyzing: https://www.google.com/
Failed to extract from https://www.google.com/: HTTPSConnectionPool(host='www.google.com', port=443): Read timed out. (read timeout=10)

📝 Title: 

📄 Body Sample: ...

📊 Features Extracted:
  - https: True
  - has_ip_address: False
  - suspicious_words: False
  - short_title: True
  - low_title_body_similarity: True

🔍 Result: ✅ Legitimate Website.


False

In [7]:

# Example
url = "https://www.amazon.in/"
analyze_url(url)



🔗 Analyzing: https://www.amazon.in/
Failed to extract from https://www.amazon.in/: HTTPSConnectionPool(host='www.amazon.in', port=443): Read timed out. (read timeout=10)

📝 Title: 

📄 Body Sample: ...

📊 Features Extracted:
  - https: True
  - has_ip_address: False
  - suspicious_words: False
  - short_title: True
  - low_title_body_similarity: True

🔍 Result: ✅ Legitimate Website.


False

In [10]:

# Example
url = "https://www.cryptonitemit.in/"
analyze_url(url)



🔗 Analyzing: https://www.cryptonitemit.in/


ValueError: The provided hypothesis_template "The webpage body content is related to the title: 'Cryptonite - Official Cybersecurity Student Project of MIT Manipal'" was not able to be formatted with the target labels. Make sure the passed template includes formatting syntax such as {} where the label should go.

In [6]:
import requests
from bs4 import BeautifulSoup
from transformers import BertTokenizer, BertModel
import torch
from scipy.spatial.distance import cosine

local_path = "./local-models/bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(local_path, local_files_only=True)
model = BertModel.from_pretrained(local_path, local_files_only=True)

def analyze_website_similarity(domain, threshold=0.6):
    def extract_website_content(url):
        if not url.startswith('http'):
            url = 'https://' + url
        try:
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                return response.text
            else:
                return None
        except requests.RequestException:
            return None

    def extract_title_and_body(html_content):
        soup = BeautifulSoup(html_content, 'html.parser')
        title = soup.title.text.strip() if soup.title else "No title found"
        body = soup.body.get_text(separator=' ', strip=True) if soup.body else "No body found"
        return title, body

    def get_bert_embedding(text):
        tokens = tokenizer.encode(text, return_tensors='pt', truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(tokens)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
        return embeddings

    def check_similarity(title, body):
        title_embedding = get_bert_embedding(title)
        body_embedding = get_bert_embedding(body)
        similarity_score = 1 - cosine(title_embedding, body_embedding)
        return similarity_score

    content = extract_website_content(domain)
    if content:
        title, body = extract_title_and_body(content)

        if title == "No title found" and body == "No body found":
            return {"phishing_probability": 1.0, "message": "Title and body are empty."}
        elif title == "No title found":
            return {"phishing_probability": 1.0, "message": "Title is empty."}
        elif body == "No body found":
            return {"phishing_probability": 1.0, "message": "Body is empty."}

        similarity = check_similarity(title, body)
        phish_prob = 1 - similarity

        message = "Domain is unsafe" if phish_prob >= threshold else "Domain is safe"
        return {"phishing_probability": phish_prob, "message": message}
    else:
        return {"phishing_probability": 1.0, "message": "Failed to extract website content."}


In [7]:
analyze_website_similarity("https://www.cryptonitemit.in/")

{'phishing_probability': 0.2723313570022583, 'message': 'Domain is safe'}

In [18]:
analyze_website_similarity("https://www.irctc.co.in/nget/train-search")

{'phishing_probability': 1.0, 'message': 'Failed to extract website content.'}

In [15]:
analyze_website_similarity("https://www.fisglobal.com/")

{'phishing_probability': 0.3969181180000305, 'message': 'Domain is safe'}

In [21]:
import requests
from bs4 import BeautifulSoup
from transformers import BertTokenizer, BertModel
import torch
from scipy.spatial.distance import cosine
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time

def analyze_website_similarity(domain, threshold=0.6):
    local_path = "./local-models/bert-base-uncased"
    tokenizer = BertTokenizer.from_pretrained(local_path, local_files_only=True)
    model = BertModel.from_pretrained(local_path, local_files_only=True)

    def extract_website_content_with_selenium(url):
        if not url.startswith('http'):
            url = 'https://' + url

        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")

        try:
            driver = webdriver.Chrome(options=chrome_options)
            driver.set_page_load_timeout(30)
            driver.get(url)
            time.sleep(5)  # Allow JS to load
            html = driver.page_source
            driver.quit()
            return html
        except Exception as e:
            return f"Error fetching page with Selenium: {e}"

    def extract_title_and_body(html_content):
        soup = BeautifulSoup(html_content, 'html.parser')
        title = soup.title.text if soup.title else "No title found"
        body = soup.body.get_text(separator=' ', strip=True) if soup.body else "No body found"
        return title, body

    def get_bert_embedding(text):
        tokens = tokenizer.encode(text, return_tensors='pt', truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(tokens)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
        return embeddings

    def check_similarity(title, body):
        title_embedding = get_bert_embedding(title)
        body_embedding = get_bert_embedding(body)
        similarity_score = 1 - cosine(title_embedding, body_embedding)
        return similarity_score

    html = extract_website_content_with_selenium(domain)
    if isinstance(html, str) and html.startswith("Error"):
        return {"phishing_probability": 1, "message": html}

    title, body = extract_title_and_body(html)

    if title.strip() == "No title found" and body.strip() == "No body found":
        return {"phishing_probability": 1, "message": "Title and body are empty."}
    elif title.strip() == "No title found":
        return {"phishing_probability": 1, "message": "Title is empty."}
    elif body.strip() == "No body found":
        return {"phishing_probability": 1, "message": "Body is empty."}

    similarity = check_similarity(title, body)
    phish_prob = 1 - similarity
    message = "Domain is unsafe" if phish_prob >= threshold else "Domain is safe"
    
    return {
        "phishing_probability": round(phish_prob, 3),
        "similarity_score": round(similarity, 3),
        "title": title,
        "message": message
    }




In [23]:
# Example usage
if __name__ == "__main__":
    url = "https://www.irctc.co.in/nget/train-search"
    result = analyze_website_similarity(url)
    print(result)

{'phishing_probability': 0.324, 'similarity_score': 0.676, 'title': 'www.irctc.co.in', 'message': 'Domain is safe'}


In [24]:
# Example usage
if __name__ == "__main__":
    url = "google.com"
    result = analyze_website_similarity(url)
    print(result)

{'phishing_probability': 0.745, 'similarity_score': 0.255, 'title': 'Google', 'message': 'Domain is unsafe'}


In [25]:
# Example usage
if __name__ == "__main__":
    url = "amazon.in"
    result = analyze_website_similarity(url)
    print(result)

{'phishing_probability': 0.192, 'similarity_score': 0.808, 'title': 'Online Shopping site in India: Shop Online for Mobiles, Books, Watches, Shoes and More - Amazon.in', 'message': 'Domain is safe'}


<h1>OCR Functionality</h1>