In [28]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize


In [29]:

url = "https://edition.cnn.com/2024/05/13/tech/australia-us-undersea-drones-ml-hnk-intl/index.html"  

response = requests.get(url)


if response.status_code == 200:
    html_content = response.text  
else:
    print(f"Failed to retrieve {url}")
html_content  





In [30]:
soup = BeautifulSoup(html_content, 'html.parser')

paragraphs = soup.find_all('p')
article_text = ' '.join([para.get_text() for para in paragraphs])

article_text


'Markets \n\n\n Hot Stocks \n\n\n Fear & Greed Index \n\n\n \n            Latest Market News \n\n\n \n            Hot Stocks \n\n\n \n            Ghost Shark and Manta Ray protect the undersea realm. Sounds like the plot of a future Marvel movie, but in actual fact, it’s what could be the future of Pacific naval defenses.\n     \n            Ghost Shark and Manta Ray are the names of prototype uncrewed underwater vehicles – UUVs or drones – introduced recently by Australia and the United States respectively.\n     \n            Experts say the submersibles could represent the future of undersea warfare, showing the ability to exert power\xa0while minimizing the danger to human life.\n     \n            The use of drones in aerial warfare has become commonplace. The US used them extensively during conflicts in Iraq and Afghanistan beginning in the 1990s, and newer, cheaper drones have become key pieces of military hardware for both sides in Russia’s invasion of Ukraine.\n     \n        

In [31]:

with open("scraped_article.txt", "w") as f:
    f.write(article_text)

In [32]:
nltk.download('punkt')
nltk.download('stopwords')

with open("scraped_article.txt", "r") as f:
    article_text = f.read()

sentences = sent_tokenize(article_text)

stop_words = set(stopwords.words('english'))
cleaned_sentences_nltk = []

for sentence in sentences:
    words = word_tokenize(sentence.lower())
    words = [word for word in words if word not in stop_words and word not in string.punctuation]
    cleaned_sentences_nltk.append(' '.join(words))

print(cleaned_sentences_nltk)



['markets hot stocks fear greed index latest market news hot stocks ghost shark manta ray protect undersea realm', 'sounds like plot future marvel movie actual fact ’ could future pacific naval defenses', 'ghost shark manta ray names prototype uncrewed underwater vehicles – uuvs drones – introduced recently australia united states respectively', 'experts say submersibles could represent future undersea warfare showing ability exert power minimizing danger human life', 'use drones aerial warfare become commonplace', 'us used extensively conflicts iraq afghanistan beginning 1990s newer cheaper drones become key pieces military hardware sides russia ’ invasion ukraine', 'kyiv also built naval surface drones inflicted heavy losses much larger expensive ships russia ’ black sea fleet', 'aerial surface sea drones controlled using satellites light radio waves', '’ function way depths', '2023 study published swiss journal sensors points underwater communications require energy still see signif

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\flori\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\flori\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [33]:
corpus = cleaned_sentences_nltk

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import normalize
import numpy as np

cvect = CountVectorizer(ngram_range=(1,1), token_pattern='(?u)\\b\\w+\\b')
counts = cvect.fit_transform(cleaned_sentences_nltk)
normalized_counts = normalize(counts, norm='l1', axis=1)

tfidf = TfidfVectorizer(ngram_range=(1,1))
tfs = tfidf.fit_transform(cleaned_sentences_nltk)

sentence_scores = np.sum(tfs.toarray(), axis=1)

num_sentences = 5
top_sentence_indices = np.argsort(sentence_scores)[-num_sentences:]

# Sélection des phrases les plus importantes
summary = [sentences[i] for i in sorted(top_sentence_indices)]

# Affichage du résumé extractif
print("Résumé extractif:")
for sentence in summary:
    print(sentence)

Résumé extractif:
But when Australia unveiled Ghost Shark last month, it called the prototypes “the most advanced undersea autonomous vehicles in the world.”
     
            “Ghost Shark will provide Navy with a stealthy, long-range autonomous undersea warfare capability that can conduct persistent intelligence, surveillance, reconnaissance (ISR) and strike,” a statement from the Australian Defense Ministry said, adding that it expects the first production models to be delivered by the end of next year.
Related article
A Ukrainian pilot outlines how drones powered by jet skis sunk a Russian warship
 
            “I assume that they are all intended for roughly similar mission sets – persistent intelligence, surveillance, reconnaissance and strike capability, particularly in the anti-submarine domain,” Salisbury said.
The US Navy called the Boeing-built Orca UUV “a cutting-edge, autonomous, unmanned diesel-electric submarine with a modular payload section to execute a variety of missi

In [35]:
from sklearn.metrics.pairwise import cosine_similarity
vectorizer = CountVectorizer().fit_transform(summary)
vectors = vectorizer.toarray()
cosine_matrix = cosine_similarity(vectors)

threshold = 0.7
redundant_sentences = set()

for i in range(len(cosine_matrix)):
    for j in range(i + 1, len(cosine_matrix)):
        if cosine_matrix[i][j] > threshold:
            redundant_sentences.add(j)


filtered_summary = [sentence for idx, sentence in enumerate(summary) if idx not in redundant_sentences]

print("Résumé post-traité:")
for sentence in filtered_summary:
    print(sentence)

Résumé post-traité:
But when Australia unveiled Ghost Shark last month, it called the prototypes “the most advanced undersea autonomous vehicles in the world.”
     
            “Ghost Shark will provide Navy with a stealthy, long-range autonomous undersea warfare capability that can conduct persistent intelligence, surveillance, reconnaissance (ISR) and strike,” a statement from the Australian Defense Ministry said, adding that it expects the first production models to be delivered by the end of next year.
Related article
A Ukrainian pilot outlines how drones powered by jet skis sunk a Russian warship
 
            “I assume that they are all intended for roughly similar mission sets – persistent intelligence, surveillance, reconnaissance and strike capability, particularly in the anti-submarine domain,” Salisbury said.
The US Navy called the Boeing-built Orca UUV “a cutting-edge, autonomous, unmanned diesel-electric submarine with a modular payload section to execute a variety of mis