In [5]:
import feedparser
from bs4 import BeautifulSoup
from bs4.element import Comment
import urllib.request

# This function returns whether or not a particular tag is visible to the viewer of a web page
def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True

# This function extracts the readable content text from a webpage
def text_from_html(body):
    soup = BeautifulSoup(body, 'html.parser')
    texts = soup.find_all('p', text=True)
    visible_texts = filter(tag_visible, texts)
    return u" ".join(t.get_text() for t in visible_texts)

# This is a list of the RSS feeds we are subscribing to for security news
newsSources = {
    'The Hacker News': 'https://feeds.feedburner.com/TheHackersNews?format=xml',
    #'Graham Cluley': 'https://www.grahamcluley.com/feed/',
    'Krebs on Security': 'http://krebsonsecurity.com/feed/',
    'Threatpost': 'https://threatpost.com/feed/',
    'Naked Security': 'https://nakedsecurity.sophos.com/feed/'
}

articleTexts = []

for title, source in newsSources.items():

    print("Getting News Source", title)
    # For every news source, we get the RSS feed. 
    feed = feedparser.parse(source)

    for article in feed['items']:

        print("Getting article information", article['title'])
        #For every article in the feed, we open the web page
        html = urllib.request.urlopen(article['link']).read()

        #Then we extract the text from the web page and put it in an object
        articleTexts.append({
            "title" : article['title'],
            "body" : text_from_html(html)
        })
        #break # Remove to loop through all sources; currently we just get one article from each source



Getting News Source The Hacker News
Getting article information Google Teams Up with Ecosystem Partners to Enhance Security of SoC Processors
Getting article information How to Tackle the Top SaaS Challenges of 2023
Getting article information How to Use AI in Cybersecurity and Avoid Being Trapped
Getting article information CISA Sounds Alarm on Cybersecurity Threats Amid Russia's Invasion Anniversary
Getting article information Even Top-Ranked Android Apps in Google Play Store Provide Misleading Data Safety Labels
Getting article information Hackers Using Trojanized macOS Apps to Deploy Evasive Cryptocurrency Mining Malware
Getting article information Experts Sound Alarm Over Growing Attacks Exploiting Zoho ManageEngine Products
Getting article information The Secret Vulnerability Finance Execs are Missing
Getting article information New Hacking Cluster 'Clasiopa' Targeting Materials Research Organizations in Asia
Getting article information Lazarus Group Likely Using New WinorDLL64 B

In [34]:
def train_and_eval_cluster(data, num_clusters=None, svd_dimensions=300):
    
    from sklearn.decomposition import TruncatedSVD
    from sklearn.metrics import calinski_harabasz_score
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import Normalizer
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.cluster import KMeans

    #Vectorize the data, using english stop words
    vectorizer = TfidfVectorizer(min_df=2, max_df=0.3, stop_words='english', use_idf=True)
    trans_data = vectorizer.fit_transform(data)
    print("Transformed data contains " + str(trans_data.shape[0]) + " with " + str(trans_data.shape[1]) + " features ")

    #SVD and Normalize the data to reduce the number of features we need to train on and improve training time
    svd = TruncatedSVD(svd_dimensions)
    pipe = make_pipeline(svd, Normalizer(copy=False))
    reduced_data = pipe.fit_transform(trans_data)

    #Cluster the data

    km = None
    final_num_clusters = 0
    final_labels = []

    if (num_clusters == None):

        # They want us to pick the number of clusters
        all_ch_scores = []
        for n in range(2, 20):
            km = KMeans(n_clusters=n, init='k-means++', max_iter=100, random_state=0)
            labels = km.fit_predict(reduced_data)
            ch_score = calinski_harabasz_score(reduced_data, labels)
            all_ch_scores.append((n, ch_score))
        
        max_ch_score = max(all_ch_scores, key=lambda x: x[1])
        print("The Best CH score was " + str(max_ch_score[1]) + " for " + str(max_ch_score[0]) + " clusters.")
        final_num_clusters = max_ch_score[0]
        km = KMeans(n_clusters=final_num_clusters, init='k-means++', max_iter=100, random_state=0)
        final_labels = km.fit_predict(reduced_data)

    else:

        # They've chosen the number
        final_num_clusters = num_clusters
        km = KMeans(n_clusters=final_num_clusters, init='k-means++', max_iter=100, random_state=0)
        final_labels = km.fit_predict(reduced_data)
        ch_score = calinski_harabasz_score(reduced_data, labels)
        

    #Evaluate manually
    print("\nMost discriminative words per cluster:")
    original_space_centroids = svd.inverse_transform(km.cluster_centers_)
    order_centroids = original_space_centroids.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()
    for i in range(final_num_clusters):
        print("Cluster " + str(i) + ": ")
        cl_terms = ""
        for ind in order_centroids[i, :50]:
            cl_terms += terms[ind] + " "
        print(cl_terms + "\n")

    labeled_data = list(zip(data, final_labels))
    return labeled_data, final_num_clusters


In [35]:
import random
random.seed(42)
articleBodies = [article['body'] for article in articleTexts]
random.shuffle(articleBodies)

labeled_articles, clusters = train_and_eval_cluster(articleBodies)
labeled_articles[0]
print(clusters)

Transformed data contains 80 with 2652 features 
The Best CH score was 2.1753903711164346 for 3 clusters.

Most discriminative words per cluster:
Cluster 0: 
content sponsored threatpost group written sponsor community share article scanbox ib cameras devices researchers thousands pan insight campaigns travel os subject wrote contribution audience editorial edited strives infosec july team participate commentary quality voice advertiser matter highest trusted opportunity insider topics prolific experts according patch campaign objective writing editing creates 

Cluster 1: 
fraud credit people ai account posts card spam service russia development cyber carding badguy stolen russian accounts cards uses money servers transactions id network facebook government suspected series breach pc testing twitter cloud number criminal don nation bitcoin services android ransomware bad image followers ransom way early blue alternative solutions 

Cluster 2: 
apps files server google execution remote



In [36]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation

from transformers import pipeline

updated_punctuation = punctuation + "”"

def extraction_summarize(text, target_length):
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)
    word_frequencies = {}
    for word in doc:
        if word.text.lower() not in list(STOP_WORDS):
            if word.text.lower() not in updated_punctuation:
                if word.text not in word_frequencies.keys():
                    word_frequencies[word.text] = 1
                else:
                    word_frequencies[word.text] += 1
    max_frequency = max(word_frequencies.values())
    for word in word_frequencies.keys():
        word_frequencies[word] = word_frequencies[word] / max_frequency
    sentence_tokens = [sent for sent in doc.sents]
    sentence_scores = {}
    for sent in sentence_tokens:
        for word in sent:
            if word.text.lower() in word_frequencies.keys():
                if sent not in sentence_scores.keys():
                    sentence_scores[sent] = word_frequencies[word.text.lower()]
                else:
                    sentence_scores[sent] += word_frequencies[word.text.lower()]
    sentence_tuples = [(sentence, score) for sentence, score in sentence_scores.items()]
    sentence_tuples.sort(key = lambda x: x[1], reverse=True)
    summary = ""

    sentence_number = 0

    while (len(summary.split()) < target_length):
        summary += sentence_tuples[sentence_number][0].text
        sentence_number += 1
        
    return summary

# This function performs abstraction summarization on a given text, using neural networks to 
# write a whole new summary for the given text.
def abstraction_summarize(original_text):
    summarizer = pipeline("summarization")
    summary_text = summarizer(original_text, min_length = 100)
    return summary_text[0]['summary_text']

In [38]:
# Summarize each topic

for topic in range(clusters):
    text_body = " "
    
    for article, label in labeled_articles:
        if label == topic:
            text_body += article + " "
    
    print(abstraction_summarize(extraction_summarize(text_body, 600)))

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


 TA423’s attacks began with phishing emails, with such titles as “Sick Leave,” “User Research” and “Request Cooperation” The personal information that was accessed in the Nelnet breach “has potential to be leveraged in future social engineering and phishing campaigns . Lockbit was by far the most prolific ransomware gang in July, behind 62 attacks, researchers have determined . The authors of the report could only speculate that “Chinese threat groups such as MISSION2025/APT41, APT10 and its affiliates, as well as unknown Russian threat actor groups could potentially exploit vulnerabilities in these devices .


No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.




No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


 Sophos Home protects every Mac and PC in your home . In this bug, firing the same encrypted message over and over again at a server, but modifying the padding at the end of the data to make the data invalid, and thus provoking some sort of unpredictable behaviour . All I’m hoping is that, given that there’s not much we can advise people about now because we have no indicators of compromise, and we don’t even know whether, at this remove, GoDaddy has been able to come up with what people could go and look for .
