##### safe

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import nltk
import re
import pickle
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# 📌 download NLTK resources if you haven’t already
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)


# Saved Keras model and the pickled tokenizer, both read from the current
# working directory.
# NOTE(review): pickle.load can execute code embedded in the file, so
# tokenizer.pkl must come from a trusted source.
model = load_model("CS_model.h5")
with open("tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)

# Preprocessing configuration shared by clean_text and the scraping loop.
max_len = 50  # sequence length handed to pad_sequences
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# A headline is kept only if it contains at least one of these substrings
# (the comparison is done against the lower-cased headline).
cs_keywords = [
    'cyber', 'hack', 'breach', 'malware', 'phishing', 'ransomware',
    'infrastructure', 'ddos', 'security', 'data', 'attack',
    'vulnerability', 'privacy', 'leak', 'spyware', 'exploit', 'threat',
    'database',
]

def clean_text(text):
    """Normalise a headline into the space-joined token string used for prediction.

    The input is coerced to ``str`` and lower-cased, every character that is
    not ``a``-``z`` or whitespace is replaced by a space, and the result is
    split with ``nltk.word_tokenize``. Words present in ``stop_words`` are
    dropped; each remaining word is passed through ``lemmatizer.lemmatize``.
    """
    lowered = str(text).lower()
    letters_only = re.sub(r'[^a-z\s]', ' ', lowered)

    kept = []
    for word in nltk.word_tokenize(letters_only):
        if word in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(word))
    return ' '.join(kept)

# 🔷 Input websites
# Ask for the sites to scrape: one line of comma-separated URLs.
print("🌐 Enter websites to scrape, separated by commas:")
user_input = input("👉 ").strip()
sites = [url.strip() for url in user_input.split(",") if url.strip()]

if not sites:
    print("⚠️ No websites entered. Exiting.")
    # Raise SystemExit directly instead of calling exit(): exit() is the
    # interactive-shell helper, and under IPython/Jupyter it asks the whole
    # kernel to shut down rather than just stopping this cell.
    raise SystemExit

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

try:
    # The implicit wait is a session-wide setting that only applies to
    # element lookups; it does not delay page_source. Set it once, up front.
    driver.implicitly_wait(5)

    for url in sites:
        print(f"\n📡 Scraping: {url}")
        try:
            driver.get(url)
            html = driver.page_source

            # Collect the text of every heading, grouped by heading level.
            soup = BeautifulSoup(html, "html.parser")
            all_headlines = [
                h.text.strip()
                for tag in ["h1", "h2", "h3", "h4", "h5", "h6"]
                for h in soup.find_all(tag)
            ]

            # Keep only headings that contain one of the CS keywords.
            headlines = [h for h in all_headlines if any(kw in h.lower() for kw in cs_keywords)]

            if not headlines:
                print("⚠️ No CS-related headlines found.")
                continue

            print(f"✅ Found {len(headlines)} CS-related headlines.\n")

            # Classify the whole page in one predict() call instead of one
            # call per headline.
            cleaned = [clean_text(text) for text in headlines]
            seqs = tokenizer.texts_to_sequences(cleaned)
            padded = pad_sequences(seqs, maxlen=max_len, padding='post', truncating='post')
            preds = model.predict(padded, verbose=0)

            for text, pred in zip(headlines, preds):
                # NOTE(review): scores <= 0.5 are reported as "Real"; confirm
                # this matches the label encoding used when training the model.
                label = 'Real ✅' if pred[0] <= 0.5 else 'Fake 🚨'
                print(f"📰 {text}")
                print(f"   → Predicted: {label}\n")
        except Exception as e:
            # Best-effort per site: report the failure and move on.
            print(f"❌ Error scraping {url}: {e}")
finally:
    # Always close the browser, even on Ctrl-C or an unexpected error.
    driver.quit()




🌐 Enter websites to scrape, separated by commas:


👉  https://thehackernews.com, https://krebsonsecurity.com



📡 Scraping: https://thehackernews.com
✅ Found 12 CS-related headlines.

📰 The Hacker News | #1 Trusted Source for Cybersecurity News
   → Predicted: Fake 🚨

📰 China's Massistant Tool Secretly Extracts SMS, GPS Data, and Images From Confiscated Phones
   → Predicted: Fake 🚨

📰 New Webinar: Identity Attacks Have Changed — Have Your IR Playbooks?
   → Predicted: Fake 🚨

📰 Ivanti Zero-Days Exploited to Drop MDifyLoader and Launch In-Memory Cobalt Strike Attacks
   → Predicted: Fake 🚨

📰 CERT-UA Discovers LAMEHUG Malware Linked to APT28, Using LLM for Phishing Campaign
   → Predicted: Fake 🚨

📰 From Backup to Cyber Resilience: Why IT Leaders Must Rethink Backup in the Age of Ransomware
   → Predicted: Real ✅

📰 Hackers Use GitHub Repositories to Host Amadey Malware and Data Stealers, Bypassing Filters
   → Predicted: Fake 🚨

📰 Hackers Exploit Apache HTTP Server Flaw to Deploy Linuxsys Cryptocurrency Miner
   → Predicted: Fake 🚨

📰 Europol Disrupts NoName057(16) Hacktivist Group Linked to D

In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import nltk
import re
import pickle
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


# 📌 download NLTK resources if you haven’t already
for package in ['punkt', 'stopwords', 'wordnet']:
    nltk.download(package)


# Load the saved Keras model, then unpickle the tokenizer. Both files are
# looked up relative to the current working directory.
# NOTE(review): only unpickle a tokenizer.pkl you trust -- pickle.load can run
# code stored in the file.
model = load_model("CS_model.h5")
with open("tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)

# Text-preprocessing settings.
max_len = 50  # passed as maxlen to pad_sequences
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Substrings that mark a heading as cyber-security related; matched against
# the lower-cased heading text.
cs_keywords = [
    'cyber', 'hack', 'breach',
    'malware', 'phishing', 'ransomware',
    'infrastructure', 'ddos', 'security',
    'data', 'attack', 'vulnerability',
    'privacy', 'leak', 'spyware',
    'exploit', 'threat', 'database',
]

def clean_text(text):
    """Return ``text`` lower-cased, stripped to letters, de-stopworded and lemmatized.

    Anything other than ``a``-``z`` and whitespace becomes a space before the
    string is tokenized with ``nltk.word_tokenize``. Tokens listed in
    ``stop_words`` are removed and the survivors are lemmatized with
    ``lemmatizer`` and joined with single spaces.
    """
    words = nltk.word_tokenize(re.sub(r'[^a-z\s]', ' ', str(text).lower()))
    content_words = filter(lambda w: w not in stop_words, words)
    return ' '.join(map(lemmatizer.lemmatize, content_words))

# 🔷 Input websites
# Ask for the sites to scrape: one line of comma-separated URLs.
print("🌐 Enter websites to scrape, separated by commas:")
user_input = input("👉 ").strip()
sites = [url.strip() for url in user_input.split(",") if url.strip()]

if not sites:
    print("⚠️ No websites entered. Exiting.")
    # exit() is the interactive-shell helper; under IPython/Jupyter it asks
    # the kernel to shut down. Raising SystemExit just stops this cell/script.
    raise SystemExit

# Headless Chrome configured to look like a regular desktop browser: the
# automation switches are removed and a desktop user-agent string is sent.
options = Options()
options.add_argument("--headless=new")  # or "--headless"
options.add_argument("start-maximized")
options.add_argument("disable-infobars")
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option("useAutomationExtension", False)
options.add_argument(
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

try:
    # Session-wide setting that only affects element lookups (it does not
    # delay page_source), so it is configured once rather than per URL.
    driver.implicitly_wait(5)

    for url in sites:
        print(f"\n📡 Scraping: {url}")
        try:
            driver.get(url)
            html = driver.page_source

            # Collect the text of every heading, grouped by heading level.
            soup = BeautifulSoup(html, "html.parser")
            all_headlines = [
                h.text.strip()
                for tag in ["h1", "h2", "h3", "h4", "h5", "h6"]
                for h in soup.find_all(tag)
            ]

            # Keep only headings that contain one of the CS keywords.
            headlines = [h for h in all_headlines if any(kw in h.lower() for kw in cs_keywords)]

            if not headlines:
                print("⚠️ No CS-related headlines found.")
                continue

            print(f"✅ Found {len(headlines)} CS-related headlines.\n")

            # One batched predict() call per page instead of one per headline.
            cleaned = [clean_text(text) for text in headlines]
            seqs = tokenizer.texts_to_sequences(cleaned)
            padded = pad_sequences(seqs, maxlen=max_len, padding='post', truncating='post')
            preds = model.predict(padded, verbose=0)

            for text, pred in zip(headlines, preds):
                # NOTE(review): scores <= 0.5 are reported as "Real"; confirm
                # this matches the label encoding used when training the model.
                label = 'Real ✅' if pred[0] <= 0.5 else 'Fake 🚨'
                print(f"📰 {text}")
                print(f"   → Predicted: {label}\n")
        except Exception as e:
            # Best-effort per site: report the failure and move on.
            print(f"❌ Error scraping {url}: {e}")
finally:
    # Always shut the headless browser down, even on Ctrl-C.
    driver.quit()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


🌐 Enter websites to scrape, separated by commas:


👉  https://thehackernews.com, https://krebsonsecurity.com



📡 Scraping: https://thehackernews.com
✅ Found 12 CS-related headlines.

📰 The Hacker News | #1 Trusted Source for Cybersecurity News
   → Predicted: Fake 🚨

📰 China's Massistant Tool Secretly Extracts SMS, GPS Data, and Images From Confiscated Phones
   → Predicted: Fake 🚨

📰 New Webinar: Identity Attacks Have Changed — Have Your IR Playbooks?
   → Predicted: Fake 🚨

📰 Ivanti Zero-Days Exploited to Drop MDifyLoader and Launch In-Memory Cobalt Strike Attacks
   → Predicted: Fake 🚨

📰 CERT-UA Discovers LAMEHUG Malware Linked to APT28, Using LLM for Phishing Campaign
   → Predicted: Fake 🚨

📰 From Backup to Cyber Resilience: Why IT Leaders Must Rethink Backup in the Age of Ransomware
   → Predicted: Real ✅

📰 Hackers Use GitHub Repositories to Host Amadey Malware and Data Stealers, Bypassing Filters
   → Predicted: Fake 🚨

📰 Hackers Exploit Apache HTTP Server Flaw to Deploy Linuxsys Cryptocurrency Miner
   → Predicted: Fake 🚨

📰 Europol Disrupts NoName057(16) Hacktivist Group Linked to D

##### 2

In [14]:
def cs(user_input):
    """Scrape headings from the given sites and print a Real/Fake label for each.

    Pages are fetched with headless Chrome. The text of every ``h1``-``h6``
    element is collected, filtered down to headings containing a
    cyber-security keyword, cleaned, tokenized and scored by the saved model.

    Args:
        user_input: Comma-separated string of URLs. Blank entries and
            surrounding whitespace are ignored.

    Returns:
        None. All results are printed. If no URL is supplied, a warning is
        printed and the function returns before any import, download, model
        load or browser start-up takes place.

    Side effects:
        Calls ``nltk.download`` for punkt/stopwords/wordnet, reads
        ``CS_model.h5`` and ``tokenizer.pkl`` from the working directory, and
        launches Chrome through ``webdriver_manager``.
    """
    # Validate the input first so an empty call costs nothing. Returning
    # (instead of calling exit()) keeps a notebook kernel alive.
    sites = [url.strip() for url in user_input.split(",") if url.strip()]
    if not sites:
        print("⚠️ No websites entered. Exiting.")
        return

    from selenium import webdriver
    from selenium.webdriver.chrome.service import Service
    from selenium.webdriver.chrome.options import Options
    from webdriver_manager.chrome import ChromeDriverManager
    from bs4 import BeautifulSoup
    import nltk
    import re
    import pickle
    from tensorflow.keras.models import load_model
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer

    # Fetch the NLTK resources used below (no-op when already present).
    nltk.download('punkt')
    nltk.download('stopwords')
    nltk.download('wordnet')

    # Saved model and pickled tokenizer.
    # NOTE(review): pickle.load can execute code from the file; tokenizer.pkl
    # must come from a trusted source.
    model = load_model("CS_model.h5")
    with open("tokenizer.pkl", "rb") as f:
        tokenizer = pickle.load(f)

    # Preprocessing configuration.
    max_len = 50
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    cs_keywords = [
        'cyber', 'hack', 'breach', 'malware', 'phishing', 'ransomware',
        'infrastructure', 'ddos', 'security', 'data', 'attack', 'vulnerability',
        'privacy', 'leak', 'spyware', 'exploit', 'threat', 'database'
    ]

    def clean_text(text):
        """Lower-case, keep letters only, drop stop words and lemmatize."""
        text = str(text).lower()
        text = re.sub(r'[^a-z\s]', ' ', text)
        tokens = nltk.word_tokenize(text)
        tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
        return ' '.join(tokens)

    # Headless Chrome configured to look like a regular desktop browser.
    options = Options()
    options.add_argument("--headless=new")  # or "--headless"
    options.add_argument("start-maximized")
    options.add_argument("disable-infobars")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    )

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    try:
        # Session-wide setting that only affects element lookups, so it is
        # configured once rather than after every page load.
        driver.implicitly_wait(5)

        for url in sites:
            print(f"\n📡 Scraping: {url}")
            try:
                driver.get(url)
                html = driver.page_source

                # Collect the text of every heading, grouped by heading level.
                soup = BeautifulSoup(html, "html.parser")
                all_headlines = [
                    h.text.strip()
                    for tag in ["h1", "h2", "h3", "h4", "h5", "h6"]
                    for h in soup.find_all(tag)
                ]

                # Keep only headings that contain one of the CS keywords.
                headlines = [h for h in all_headlines if any(kw in h.lower() for kw in cs_keywords)]

                if not headlines:
                    print("⚠️ No CS-related headlines found.")
                    continue

                print(f"✅ Found {len(headlines)} CS-related headlines.\n")

                # One batched predict() call per page instead of one per headline.
                cleaned = [clean_text(text) for text in headlines]
                seqs = tokenizer.texts_to_sequences(cleaned)
                padded = pad_sequences(seqs, maxlen=max_len, padding='post', truncating='post')
                preds = model.predict(padded, verbose=0)

                for text, pred in zip(headlines, preds):
                    # NOTE(review): scores <= 0.5 are reported as "Real";
                    # confirm this matches the training label encoding.
                    label = 'Real ✅' if pred[0] <= 0.5 else 'Fake 🚨'
                    print(f"📰 {text}")
                    print(f"   → Predicted: {label}\n")
            except Exception as e:
                # Best-effort per site: report the failure and move on.
                print(f"❌ Error scraping {url}: {e}")
    finally:
        # Always shut the browser down, even on Ctrl-C.
        driver.quit()


In [15]:
cs("https://www.cyberscoop.com/, https://krebsonsecurity.com/")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!



📡 Scraping: https://www.cyberscoop.com/
✅ Found 32 CS-related headlines.

📰 Featured on CyberScoop
   → Predicted: Real ✅

📰 United Natural Foods loses up to $400M in sales after cyberattack
   → Predicted: Real ✅

📰 UK sanctions Russian hackers, spies as US weighs its own punishments for Russia
   → Predicted: Real ✅

📰 Senate Democrats seek answers on Trump overhaul of immigrant database to find noncitizen voters
   → Predicted: Real ✅

📰 Ryuk ransomware operator extradited to US, faces five years in federal prison
   → Predicted: Real ✅

📰 House hearing will use Stuxnet to search for novel ways to confront OT cyberthreats
   → Predicted: Real ✅

📰 SonicWall customers hit by fresh, ongoing attacks targeting fully patched SMA 100 devices
   → Predicted: Real ✅

📰 Pro-Russian DDoS group NoName057(16) disrupted by international law enforcement operation
   → Predicted: Real ✅

📰 Former Army soldier pleads guilty to widespread attack spree linked to AT&T, Snowflake and others
   → Predi

In [6]:
# "https://thehackernews.com/",
#         "https://krebsonsecurity.com/",
#         "https://www.cyberscoop.com/"

In [7]:
def CS(user_input):
    """Scrape headings from the given sites and print a Real/Fake label for each.

    Same pipeline as the headless variant, but Chrome is started with its
    default options (a visible browser window). The text of every
    ``h1``-``h6`` element is collected, filtered down to headings containing
    a cyber-security keyword, cleaned, tokenized and scored by the saved model.

    Args:
        user_input: Comma-separated string of URLs. Blank entries and
            surrounding whitespace are ignored.

    Returns:
        None. All results are printed. If no URL is supplied, a warning is
        printed and the function returns before any import, download, model
        load or browser start-up takes place.

    Side effects:
        Calls ``nltk.download`` for punkt/stopwords/wordnet, reads
        ``CS_model.h5`` and ``tokenizer.pkl`` from the working directory, and
        launches Chrome through ``webdriver_manager``.
    """
    # Validate the input first so an empty call costs nothing. Returning
    # (instead of calling exit()) keeps a notebook kernel alive.
    sites = [url.strip() for url in user_input.split(",") if url.strip()]
    if not sites:
        print("⚠️ No websites entered. Exiting.")
        return

    from selenium import webdriver
    from selenium.webdriver.chrome.service import Service
    from webdriver_manager.chrome import ChromeDriverManager
    from bs4 import BeautifulSoup
    import nltk
    import re
    import pickle
    from tensorflow.keras.models import load_model
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer

    # Fetch the NLTK resources used below (no-op when already present).
    nltk.download('punkt')
    nltk.download('stopwords')
    nltk.download('wordnet')

    # Saved model and pickled tokenizer.
    # NOTE(review): pickle.load can execute code from the file; tokenizer.pkl
    # must come from a trusted source.
    model = load_model("CS_model.h5")
    with open("tokenizer.pkl", "rb") as f:
        tokenizer = pickle.load(f)

    # Preprocessing configuration.
    max_len = 50
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    cs_keywords = [
        'cyber', 'hack', 'breach', 'malware', 'phishing', 'ransomware',
        'infrastructure', 'ddos', 'security', 'data', 'attack', 'vulnerability',
        'privacy', 'leak', 'spyware', 'exploit', 'threat', 'database'
    ]

    def clean_text(text):
        """Lower-case, keep letters only, drop stop words and lemmatize."""
        text = str(text).lower()
        text = re.sub(r'[^a-z\s]', ' ', text)
        tokens = nltk.word_tokenize(text)
        tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
        return ' '.join(tokens)

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

    try:
        # Session-wide setting that only affects element lookups, so it is
        # configured once rather than after every page load.
        driver.implicitly_wait(5)

        for url in sites:
            print(f"\n📡 Scraping: {url}")
            try:
                driver.get(url)
                html = driver.page_source

                # Collect the text of every heading, grouped by heading level.
                soup = BeautifulSoup(html, "html.parser")
                all_headlines = [
                    h.text.strip()
                    for tag in ["h1", "h2", "h3", "h4", "h5", "h6"]
                    for h in soup.find_all(tag)
                ]

                # Keep only headings that contain one of the CS keywords.
                headlines = [h for h in all_headlines if any(kw in h.lower() for kw in cs_keywords)]

                if not headlines:
                    print("⚠️ No CS-related headlines found.")
                    continue

                print(f"✅ Found {len(headlines)} CS-related headlines.\n")

                # One batched predict() call per page instead of one per headline.
                cleaned = [clean_text(text) for text in headlines]
                seqs = tokenizer.texts_to_sequences(cleaned)
                padded = pad_sequences(seqs, maxlen=max_len, padding='post', truncating='post')
                preds = model.predict(padded, verbose=0)

                for text, pred in zip(headlines, preds):
                    # NOTE(review): scores <= 0.5 are reported as "Real";
                    # confirm this matches the training label encoding.
                    label = 'Real ✅' if pred[0] <= 0.5 else 'Fake 🚨'
                    print(f"📰 {text}")
                    print(f"   → Predicted: {label}\n")
            except Exception as e:
                # Best-effort per site: report the failure and move on.
                print(f"❌ Error scraping {url}: {e}")
    finally:
        # Always shut the browser down, even on Ctrl-C.
        driver.quit()


In [9]:
CS("https://www.cyberscoop.com/")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


🌐 Enter websites to scrape, separated by commas:

📡 Scraping: https://www.cyberscoop.com/
✅ Found 32 CS-related headlines.

📰 Featured on CyberScoop
   → Predicted: Real ✅

📰 United Natural Foods loses up to $400M in sales after cyberattack
   → Predicted: Real ✅

📰 UK sanctions Russian hackers, spies as US weighs its own punishments for Russia
   → Predicted: Real ✅

📰 Senate Democrats seek answers on Trump overhaul of immigrant database to find noncitizen voters
   → Predicted: Real ✅

📰 Ryuk ransomware operator extradited to US, faces five years in federal prison
   → Predicted: Real ✅

📰 House hearing will use Stuxnet to search for novel ways to confront OT cyberthreats
   → Predicted: Real ✅

📰 SonicWall customers hit by fresh, ongoing attacks targeting fully patched SMA 100 devices
   → Predicted: Real ✅

📰 Pro-Russian DDoS group NoName057(16) disrupted by international law enforcement operation
   → Predicted: Real ✅

📰 Former Army soldier pleads guilty to widespread attack spre