In [3]:
# text = "Forecasters warn of strong wind gusts that could bring down tree limbs and cause power outages. It may also create little to near-zero visibility on the roads. The current storm has wreaked havoc for communities in and around the Sierra Nevada. As of Sunday morning, some 13,000 electric customers in California and Nevada were without power, according to Poweroutage.us. The number of customers experiencing outages in those two states was more than quadruple on Saturday. A section of Interstate 80 remains closed between the California-Nevada state line and the city of Colfax. On Sunday, the California Highway Patrol said there is 'still no estimated time of reopening the freeway.' According to the CHP, 'a mass amount of vehicles' were stuck over Donner Summit on Friday night. Due to the storm, the CHP said it took emergency personnel and tow trucks 'several hours' to reach motorists."
import requests
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
# Query-string parameters for the NPR story search: the quoted phrase to look
# for and how many results to request (both sent as strings).
p = dict(searchTerm='"natural disaster"', numResults='10')
def get_npr_stories(p, timeout=10):
    """Fetch the paragraph text of NPR stories matching a query.

    Sends the query parameters in ``p`` to the NPR story API, reads the first
    ``<link>`` element of every ``<story>`` in the XML response, downloads
    each linked page and joins the text of all of its ``<p>`` tags.

    Args:
        p: Mapping of query-string parameters for the API request
            (e.g. ``searchTerm``, ``numResults``).
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A list of strings, one per story that had a link; each string is the
        space-joined text of every ``<p>`` tag on that story's page.

    Raises:
        requests.HTTPError: If the API or a story page answers with an
            error status code.
    """
    import os

    # NOTE(review): an API key is committed in this notebook. Setting the
    # NPR_API_KEY environment variable overrides it; the literal is kept only
    # as a fallback so existing runs keep working.
    api_key = os.environ.get("NPR_API_KEY", "MDE5Mzg3Mjc2MDE0MzMyMjM3NjM5ZTI2Ng001")
    query = {"apiKey": api_key, **p}

    # Send a GET request to the NPR API. Without a timeout requests waits
    # indefinitely, and without a status check an error body would be parsed
    # as if it were a result list.
    api_response = requests.get("http://api.npr.org/query", params=query, timeout=timeout)
    api_response.raise_for_status()

    # Parse the XML response to get the story URLs, skipping stories that
    # carry no usable <link> element instead of failing on None.
    root = ET.fromstring(api_response.content)
    story_urls = []
    for story in root.iter('story'):
        link = story.find('link')
        if link is not None and link.text:
            story_urls.append(link.text.strip())

    # For each story URL, download the page and collect its paragraph text.
    full_stories = []
    for url in story_urls:
        page = requests.get(url, timeout=timeout)
        page.raise_for_status()
        soup = BeautifulSoup(page.text, 'html.parser')

        # Every <p> tag on the page is treated as story content; this may need
        # adjusting if the page structure puts non-story text in <p> tags.
        paragraphs = soup.find_all('p')
        full_stories.append(' '.join(paragraph.text for paragraph in paragraphs))
    return full_stories

ModuleNotFoundError: No module named 'requests'

In [None]:

import spacy
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
# Settings handed to CountVectorizer in featurize_stories: candidate phrases
# are unigrams and bigrams, with the "english" stop-word option enabled.
stop_words = "english"
n_gram_range = (1, 2)

# spaCy pipeline used by featurize_stories for POS tags and noun chunks.
nlp = spacy.load("en_core_web_sm")

# Loaded (tokenizer, model) pairs keyed by model name, so repeated calls to
# featurize_stories do not reload the weights every time.
_ENCODER_CACHE = {}


def _load_encoder(model_name):
    """Return the (tokenizer, model) pair for ``model_name``, loading it once."""
    if model_name not in _ENCODER_CACHE:
        # Imported lazily so defining this cell does not require transformers.
        from transformers import AutoModel, AutoTokenizer
        _ENCODER_CACHE[model_name] = (
            AutoTokenizer.from_pretrained(model_name),
            AutoModel.from_pretrained(model_name),
        )
    return _ENCODER_CACHE[model_name]


def featurize_stories(text, top_k):
    """Pick the ``top_k`` noun keywords/phrases most similar to ``text``.

    Candidate n-grams come from a CountVectorizer fitted on ``text`` (using
    the module-level ``n_gram_range`` and ``stop_words``). Only candidates
    that are also a spaCy NOUN token or noun chunk of ``text`` are kept.
    Candidates and the full text are embedded with ``distilroberta-base`` and
    ranked by cosine similarity between their ``pooler_output`` vectors.

    Args:
        text: The document to extract keywords from.
        top_k: Maximum number of keywords to return.

    Returns:
        Up to ``top_k`` candidate strings ordered by ascending similarity
        (the most similar candidate is last). An empty list when ``top_k`` is
        not positive, ``text`` is blank, or no candidate survives the noun
        filter.
    """
    # `[-0:]` would slice the whole list and a negative top_k an arbitrary
    # part of it, so non-positive requests are answered explicitly.
    if top_k <= 0 or not text or not text.strip():
        return []

    # Extract candidate words/phrases
    count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([text])
    all_candidates = count.get_feature_names_out()

    doc = nlp(text)
    noun_phrases = {chunk.text.strip().lower() for chunk in doc.noun_chunks}
    # Lower-cased to match CountVectorizer's default lower-casing and the
    # noun phrases above; otherwise capitalised nouns are never matched.
    nouns = {token.text.lower() for token in doc if token.pos_ == "NOUN"}
    all_nouns = nouns | noun_phrases

    candidates = [candidate for candidate in all_candidates if candidate in all_nouns]
    if not candidates:
        # Nothing to rank; avoids handing the tokenizer an empty batch.
        return []

    tokenizer, model = _load_encoder("distilroberta-base")

    # truncation=True keeps inputs within the tokenizer's configured maximum
    # length instead of passing over-long text to the model.
    # NOTE(review): "pooler_output" may come from a pooling head that is not
    # part of the pretrained checkpoint - confirm it gives stable rankings.
    candidate_tokens = tokenizer(candidates, padding=True, truncation=True, return_tensors="pt")
    candidate_embeddings = model(**candidate_tokens)["pooler_output"].detach().numpy()

    text_tokens = tokenizer([text], padding=True, truncation=True, return_tensors="pt")
    text_embedding = model(**text_tokens)["pooler_output"].detach().numpy()

    # One row (the text) against one column per candidate.
    distances = cosine_similarity(text_embedding, candidate_embeddings)

    return [candidates[index] for index in distances.argsort()[0][-top_k:]]


In [None]:

import nltk
nltk.download('punkt')
import string

# Maximum number of word tokens per chunk.
# NOTE(review): this counts NLTK word tokens, not model subword tokens -
# confirm a 512-word chunk fits whatever consumes `chunks`.
max_len = 512

# `full_stories` is only a local variable inside get_npr_stories(); fetch the
# stories here so this cell runs on a fresh kernel.
full_stories = get_npr_stories(p)

# Work on the second fetched story.
sentences = nltk.sent_tokenize(full_stories[1])

# Punctuation-free copy of each sentence. Iterates over `sentences` (not over
# every story), and the translation table is built once rather than per item.
punctuation_table = str.maketrans('', '', string.punctuation)
clean_sentences = [s.translate(punctuation_table).replace('\n', ' ') for s in sentences]

# Flatten the sentences into word tokens, then group them into consecutive
# chunks of at most `max_len` words.
words = [word for s in sentences for word in nltk.word_tokenize(s.replace('\n', ' '))]
chunks = [' '.join(words[i:i + max_len]) for i in range(0, len(words), max_len)]
