In [1]:
#Importation des bibliothèques nécessaires
import requests
from bs4 import BeautifulSoup
import pandas as pd

import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity

import numpy as np
import re



In [2]:
def clean_filename(url: str) -> str:
    """Return ``url`` with characters that are unsafe in file names replaced by ``_``.

    The replaced characters are ``\\ / : * ? " < > |``. Every other character
    is left untouched, so distinct URLs keep distinct, readable file names.

    Args:
        url: The URL (or any string) to turn into a file-name fragment.

    Returns:
        The sanitized string, of the same length as the input.
    """
    # The backslash has to be escaped inside the character class; written as
    # `\/` it would merely escape the slash and literal backslashes would
    # slip through unreplaced.
    return re.sub(r'[\\/:*?"<>|]', '_', url)

In [None]:
# Read the list of article URLs (one per line) from the text file.
# "utf-8-sig" decodes plain UTF-8 and also drops a leading byte-order mark,
# which would otherwise end up glued to the first URL.
with open("technology.txt", "r", encoding="utf-8-sig") as f:
    # Strip surrounding whitespace and ignore blank lines so that an empty
    # string is never handed to the HTTP client as a URL.
    url_list = [line.strip() for line in f.read().splitlines() if line.strip()]


# Share of an article's sentences kept in the extractive summary.
SUMMARY_RATIO = 0.3
# Cosine similarity above which two summary sentences are treated as redundant.
SIMILARITY_THRESHOLD = 0.7
# Seconds to wait for the server before giving up on a URL.
REQUEST_TIMEOUT = 10


def _fetch_article_text(url):
    """Download ``url`` and return the joined text of its ``<p>`` elements.

    Returns ``None`` (after printing a message) when the request raises or the
    server does not answer with status 200, so the caller can skip the URL
    instead of reusing content left over from a previous iteration.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f"Failed to retrieve {url}: {exc}")
        return None
    if response.status_code != 200:
        print(f"Failed to retrieve {url}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    return ' '.join(para.get_text() for para in soup.find_all('p'))


def _clean_sentences(sentences, stop_words):
    """Lower-case and tokenize each sentence, dropping stop words and punctuation."""
    cleaned = []
    for sentence in sentences:
        words = word_tokenize(sentence.lower())
        words = [word for word in words
                 if word not in stop_words and word not in string.punctuation]
        cleaned.append(' '.join(words))
    return cleaned


def _top_sentences(sentences, cleaned_sentences, ratio=SUMMARY_RATIO):
    """Return the highest TF-IDF-scoring sentences, in their original order.

    A sentence's score is the sum of the TF-IDF weights of its cleaned form.
    ``ratio`` of the sentences are kept, with a minimum of one. An empty list
    is returned when the cleaned sentences contain no usable term.
    """
    if not sentences:
        return []
    try:
        tfs = TfidfVectorizer(ngram_range=(1, 1)).fit_transform(cleaned_sentences)
    except ValueError:
        # scikit-learn raises ValueError when the vocabulary is empty
        # (e.g. the page had no text, or only stop words).
        return []

    sentence_scores = np.sum(tfs.toarray(), axis=1)
    num_sentences = max(1, int(len(sentences) * ratio))
    top_sentence_indices = np.argsort(sentence_scores)[-num_sentences:]
    return [sentences[i] for i in sorted(top_sentence_indices)]


def _drop_redundant(summary, threshold=SIMILARITY_THRESHOLD):
    """Remove sentences whose bag-of-words cosine similarity to an earlier,
    *kept* sentence exceeds ``threshold``."""
    if len(summary) < 2:
        return list(summary)

    vectors = CountVectorizer().fit_transform(summary).toarray()
    cosine_matrix = cosine_similarity(vectors)

    redundant_sentences = set()
    for i in range(len(cosine_matrix)):
        if i in redundant_sentences:
            # A sentence that was itself discarded must not cause others to
            # be discarded: they would not duplicate anything in the output.
            continue
        for j in range(i + 1, len(cosine_matrix)):
            if cosine_matrix[i][j] > threshold:
                redundant_sentences.add(j)

    return [sentence for idx, sentence in enumerate(summary)
            if idx not in redundant_sentences]


# NLTK resources are fetched once, not once per URL. 'punkt_tab' is the
# tokenizer resource looked up by recent NLTK releases; 'punkt' is kept for
# older installations.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))

for url in url_list:
    url = url.strip()
    if not url:
        continue

    # Fetch the page; skip this URL entirely when it cannot be retrieved.
    article_text = _fetch_article_text(url)
    if article_text is None:
        continue

    safe_name = clean_filename(url)

    # Persist the raw article text. An explicit encoding keeps the write from
    # failing on characters the platform default codec cannot represent.
    with open(f"scraped_article_{safe_name}.txt", "w", encoding="utf-8") as f:
        f.write(article_text)

    # Split into sentences and build their cleaned counterparts.
    sentences = sent_tokenize(article_text)
    cleaned_sentences_nltk = _clean_sentences(sentences, stop_words)
    print(cleaned_sentences_nltk)

    # Pick the most important sentences by TF-IDF score.
    summary = _top_sentences(sentences, cleaned_sentences_nltk)
    if not summary:
        print(f"No summarizable text found for {url}")
        continue

    # Write the summary, minus near-duplicate sentences, one sentence per line.
    filtered_summary = _drop_redundant(summary)
    with open(f"summary_{safe_name}.txt", "w", encoding="utf-8") as f:
        for sentence in filtered_summary:
            f.write(sentence + "\n")