<a href="https://colab.research.google.com/github/dcolinmorgan/dots/blob/main/featurize_news.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
## adapted from https://jaketae.github.io/study/keyword-extraction/#candidate-selection
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from transformers import AutoModel, AutoTokenizer
import torch

# Hugging Face checkpoint name shared by the encoder model and its tokenizer.
model_name = "distilroberta-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# spaCy English pipeline object, stored in the notebook-wide name `nlp`.
nlp = spacy.load('en_core_web_sm')
# n-gram range and stop-word setting kept as notebook-wide configuration.
n_gram_range = (1, 2)
stop_words = "english"
# Module-level list for collecting embeddings (shared global state).
embeddings=[]

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

import nltk, string, numpy as np
# Download NLTK's 'punkt' resource.
nltk.download('punkt')

In [None]:
from dotenv import load_dotenv
# Load variables from a .env file into the process environment (python-dotenv).
load_dotenv()

In [None]:
def get_npr_news(p, timeout=30):
    """Query the NPR story API and return the paragraph text of every hit.

    Relies on the notebook-level names ``requests``, ``ET``,
    ``BeautifulSoup`` and ``npr_key`` being defined before the call.

    Args:
        p: Query parameters forwarded to ``requests.get`` as ``params``
            (for example a dict with ``searchTerm`` and ``numResults``).
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A list with one string per story: the text of all ``<p>`` elements
        of the story page, joined with single spaces.
    """
    # NOTE(review): the host previously read "api.=1m.org", which is not a
    # valid hostname; restored to NPR's API host -- confirm.
    r = requests.get(
        "http://api.npr.org/query?apiKey=" + npr_key[0],
        params=p,
        timeout=timeout,
    )

    # Parse the XML response to get the story URLs. Stories without a usable
    # <link> element are skipped instead of raising AttributeError.
    root = ET.fromstring(r.content)
    links = (story.find('link') for story in root.iter('story'))
    story_urls = [link.text for link in links if link is not None and link.text]

    # For each story URL, send a GET request to get the HTML content
    full_stories = []
    for url in story_urls:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')

        # The story text is assumed to live in <p> tags; adjust the selector
        # if the page structure differs.
        paragraphs = soup.find_all('p')
        full_stories.append(' '.join(par.text for par in paragraphs))
    return full_stories

In [None]:
# Ad-hoc probe of the query endpoint with the raw string 'violence' as params.
# NOTE(review): host restored from the invalid "api.=1m.org" -- confirm.
requests.get("http://api.npr.org/query?apiKey="+npr_key[0], params='violence')

In [None]:
from bs4 import BeautifulSoup
import requests,os
import xml.etree.ElementTree as ET
# The API key comes from the environment; it is kept in a one-element list
# because the request code in this notebook reads npr_key[0].
npr_key = [os.environ['npr_key']]
# Other search terms tried: '"natural-disaster"', '"epidemic"', '"shooting"'
for i in ['"extreme-weather"']:
    p = {'searchTerm': i, 'numResults': '50'}
    # Pass the query built for this term rather than a hard-coded string,
    # so searchTerm and numResults actually reach the API.
    fs = get_npr_news(p=p)

In [None]:
# Run the NPR fetch with the raw string "extreme-weather" as the request params.
get_npr_news(p="extreme-weather")

In [None]:
def chunk_text(text, max_len):
    """Split *text* into lists of NLTK word tokens of at most *max_len* each.

    When the trailing remainder would be shorter than half of ``max_len``,
    one extra chunk is added and the tokens are spread as evenly as possible
    over all chunks (sizes differ by at most one). Otherwise the tokens are
    cut into consecutive slices of ``max_len``, the last one being shorter.

    Args:
        text: The string to tokenize with ``nltk.word_tokenize``.
        max_len: Maximum number of tokens per chunk; must be positive.

    Returns:
        A list of token lists. Empty when *text* yields no tokens.

    Raises:
        ValueError: If ``max_len`` is not positive.
    """
    if max_len <= 0:
        raise ValueError("max_len must be a positive integer")

    tokens = nltk.word_tokenize(text)
    if not tokens:
        # Nothing to chunk; avoids handing callers a single empty chunk.
        return []

    full_chunks, remainder = divmod(len(tokens), max_len)

    # A remainder of 0 means the tokens already fill whole chunks exactly, so
    # rebalancing (which would add a needless extra chunk) is skipped.
    if 0 < remainder < max_len / 2:
        num_chunks = full_chunks + 1
        base_size, extra = divmod(len(tokens), num_chunks)
        chunks = []
        start = 0
        for i in range(num_chunks):
            # The first `extra` chunks take one additional token each.
            size = base_size + (1 if i < extra else 0)
            chunks.append(tokens[start:start + size])
            start += size
        return chunks

    return [tokens[i:i + max_len] for i in range(0, len(tokens), max_len)]


In [None]:
from torch.utils.data import DataLoader

def _pooled_embedding(texts):
    """Return the model's detached ``pooler_output`` for a batch of strings."""
    encoded = tokenizer(texts, padding=True, return_tensors="pt")
    encoded = {k: v.to(device) for k, v in encoded.items()}
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        return model(**encoded)["pooler_output"].detach()


def featurize_stories(text, max_len, top_k):
    """Return the ``top_k`` noun candidates most similar to *text*.

    Candidate uni-/bi-grams come from ``CountVectorizer`` and are kept only
    if they also occur as a spaCy noun or noun chunk. Candidates and the
    token chunks of the text are embedded with the module-level model, and
    candidates are ranked by cosine similarity.

    Args:
        text: The document to extract keywords from.
        max_len: Maximum number of word tokens per chunk (see ``chunk_text``).
        top_k: Number of candidates to return.

    Returns:
        Up to ``top_k`` candidate strings, most similar first.
    """
    # Extract candidate words/phrases
    count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([text])
    all_candidates = count.get_feature_names_out()
    doc = nlp(text)
    noun_phrases = {chunk.text.strip().lower() for chunk in doc.noun_chunks}
    # Lower-cased like the noun phrases: CountVectorizer lower-cases its
    # features by default, so capitalised nouns would otherwise never match.
    nouns = {token.text.lower() for token in doc if token.pos_ == "NOUN"}
    all_nouns = nouns | noun_phrases
    candidates = [candidate for candidate in all_candidates if candidate in all_nouns]
    candidate_embeddings = _pooled_embedding(candidates)

    # Balanced chunks need less padding when the chunk embeddings are stacked.
    chunks = chunk_text(text, max_len)

    # Collected in a local list so embeddings from earlier calls cannot leak
    # into this document's result.
    chunk_embeddings = [_pooled_embedding(chunk) for chunk in chunks]

    max_emb_shape = max(embedding.shape[0] for embedding in chunk_embeddings)
    padded_embeddings = [
        np.pad(embedding.cpu().numpy(), ((0, max_emb_shape - embedding.shape[0]), (0, 0)))
        for embedding in chunk_embeddings
    ]
    # NOTE(review): despite the name this is an element-wise minimum across
    # chunks, not a mean -- confirm which reduction is intended.
    avg_embedding = np.min(padded_embeddings, axis=0)
    distances = cosine_similarity(avg_embedding, candidate_embeddings.cpu().numpy())
    torch.cuda.empty_cache()

    # NOTE(review): only the first row of the similarity matrix is ranked --
    # confirm that ignoring the remaining rows is intended.
    # argsort is ascending, so reverse it and take the head to get the most
    # similar candidates (taking the tail would return the least similar).
    ranked = distances.argsort()[0][::-1]
    return [candidates[index] for index in ranked[:top_k]]



In [None]:
import pandas as pd
# Load the input CSV into `data`; the commented line is an alternate input file.
# data=pd.read_csv('/content/drive/MyDrive/consult/florida-hurricane-tweet.csv')
data=pd.read_csv('/content/drive/MyDrive/consult/Louie_california_weather.csv')

In [None]:
# Display the 'text' column of the loaded data.
data['text']

In [None]:
# df = pd.read_csv('/content/drive/MyDrive/consult/florida-hurricane-tweet_features.txt',sep='\t')


In [None]:
import logging

from tqdm import tqdm

# One keyword list per successfully featurized row of data['text'].
rank_articles = []
# for i in tqdm(range(len(df),len(data['text']))):
# Iterate over the texts themselves rather than over a batch count, so that
# every row is processed exactly once.
for text in tqdm(data['text']):
    try:
        cc = featurize_stories(text, max_len=512, top_k=4)
    except IndexError as err:
        # Best effort: skip texts that cannot be featurized, but leave a trace
        # instead of dropping them silently.
        logging.warning("Skipping text that could not be featurized: %s", err)
        continue
    # print(cc)
    rank_articles.append(cc)

In [None]:
from collections import Counter

# Collapse the per-text keyword lists into one flat list of keywords.
flattened_list = []
for sublist in rank_articles:
    flattened_list.extend(sublist)

# Count each keyword and order the table from most to least frequent.
counter = Counter(flattened_list)
df = pd.DataFrame.from_dict(
    counter, orient='index', columns=['Count']
).sort_values(by='Count', ascending=False)

# Write the counts as a tab-separated file.
# df.to_csv('/content/drive/MyDrive/consult/florida-hurricane-tweet_features2.txt',sep='\t')
df.to_csv('/content/drive/MyDrive/consult/california_weather_tweet_features.txt', sep='\t')

print(len(df))
# df[:25]


In [None]:
# Number of keyword lists collected in rank_articles.
len(rank_articles)

In [None]:
# df = pd.read_csv('/content/drive/MyDrive/consult/florida-hurricane-tweet_features.txt',sep='\t')
# df2=pd.read_csv('/content/drive/MyDrive/consult/florida-hurricane-tweet_features2.txt',sep='\t')

# df=pd.concat([df,df2])
# print(df['Unnamed: 0'])
# df = df.groupby('Unnamed: 0').sum().sort_values(by='Count',ascending=False)
# df=df[df['Count']>int(np.round(len(df)*.001))]

In [None]:
import spacy

nlp = spacy.load('en_core_web_sm')

# nouns = ['apple', 'John', 'London', 'dog', 'Mary', 'Paris', 'banana']
# The terms sit in an 'Unnamed: 0' column when `df` was re-read from the saved
# TSV, but in the (unnamed) index when `df` was built in memory, in which case
# reset_index() calls the column 'index'. Support both layouts.
terms = df.reset_index()
term_column = 'Unnamed: 0' if 'Unnamed: 0' in terms.columns else 'index'
# astype(str) keeps ' '.join from failing on non-string entries such as NaN.
nouns = terms[term_column].astype(str).to_list()
doc = nlp(' '.join(nouns))

# Keep the tokens spaCy tags as proper nouns.
proper_nouns = [token.text for token in doc if token.pos_ == 'PROPN']

print(proper_nouns)

In [None]:
# Print how many proper nouns were found, then display the list itself.
print(len(proper_nouns))
proper_nouns

In [None]:
# Number of terms that were passed to spaCy.
len(nouns)

# run from module

In [None]:
# %%time
# !pip install --quiet git+https://github.com/dcolinmorgan/dots.git

In [None]:
import sys,os,argparse,csv
# Overwrite sys.argv with a fixed command line so argparse sees these options
# when run inside the notebook.
sys.argv = ['dots_feat.py', '-n', '100', '-f', '3', '-o', 'dots_feats.csv', '-s', '1']
parser = argparse.ArgumentParser(description='Process OS data for dynamic features.')
parser.add_argument('-n', type=int, default=10, help='Number of data items to get')
parser.add_argument('-f', type=int, default=3, help='Number of features per item to get')
parser.add_argument('-o', type=str, default='dots_feats.csv', help='Output file name')
parser.add_argument('-s', type=int, default=1, help='Parallelize requests')
# parse_known_args returns unrecognized arguments separately in `unknown`
# instead of exiting with an error.
args, unknown = parser.parse_known_args()

from dots.dots_feat import featurize_stories, process_data, get_OS_data, process_hit

In [None]:
from google.colab import userdata

# Read the OS_TOKEN secret from Colab and write it to the .env file inside
# the installed dots package directory.
ost = userdata.get('OS_TOKEN')
# The context manager closes the file even if the write raises.
with open("/usr/local/lib/python3.10/dist-packages/dots/.env", "w") as text_file:
    text_file.write('OS_TOKEN=' + ost)

In [None]:
import logging
from tqdm import tqdm
# Configure the root logger to emit records at INFO level and above.
logging.basicConfig(level=logging.INFO)

In [None]:
# get_OS_data(100): presumably requests 100 items (cf. the -n option above) --
# confirm against dots.dots_feat.
data = get_OS_data(100)
# Pass the fetched data through process_data; the result is iterated below.
articles = process_data(data)

In [None]:
# One row per article: its extracted features followed by the text found
# between the first and second '[' of the article's string form.
rank_articles = []
for article in tqdm(articles):
    article_text = str(article)
    parts = article_text.split('[', 3)
    try:
        cc = featurize_stories(article_text, top_k=args.f, max_len=512)
        cc.append(parts[1])
        rank_articles.append(cc)
    except Exception as e:
        # Log the failure and move on to the next article.
        logging.error(f"Failed to process article: {e}")

# Write all collected rows to the output CSV named on the command line.
with open(args.o, 'w', newline='') as file:
    csv.writer(file).writerows(rank_articles)

In [None]:
# Display the collected feature rows.
rank_articles

In [None]:
from collections import Counter

# Merge every row of rank_articles into one flat list of strings.
flattened_list = []
for sublist in rank_articles:
    flattened_list.extend(sublist)

# Strip square brackets from each entry in a single translate pass.
bracket_table = str.maketrans("", "", "[]")
list_without_brackets = [s.translate(bracket_table) for s in flattened_list]

# Count how often each cleaned entry occurs.
counter = Counter(list_without_brackets)

In [None]:
import pandas as pd

# Turn the counter into a one-column table, most frequent entries first.
df = pd.DataFrame.from_dict(
    counter, orient='index', columns=['Count']
).sort_values(by='Count', ascending=False)

# Drop every entry whose label contains one of these patterns.
for unwanted in ("united states", "United States", "None"):
    df = df[~df.index.str.contains(unwanted)]

df[:25]

In [None]:
# Keep the rows in which at least one entry contains "fire", ignoring case.
filtered_list = [
    row
    for row in rank_articles
    if any('fire' in entry.lower() for entry in row)
]


In [None]:
# Display the rows that mention "fire".
filtered_list

In [None]:
!git clone https://github.com/dcolinmorgan/dots.git
# !python dots/dots/dots_feat.py -n 100 -f 5 -s 1 -o dots_feat.txt

In [None]:
!pytest dots/dots/test_dots_feat.py