In [1]:
import pandas as pd

import os
import requests
import re
import ast

from nltk.corpus import names
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [2]:
def get_news(q, df=True, from_date=None, to_date=None, language=None, searchIn=None, page_size=None, page=None):
    endpoint = 'https://newsapi.org/v2/everything'
    params = {
        'apiKey': os.getenv('NEWS_API_KEY'),
        'pageSize': page_size,
        'page': page
    }

    query_params = {
        'q': q,
        'from': from_date,
        'to': to_date,
        'language': language,
        'searchIn': searchIn
    }

    for key, value in query_params.items():
        if value is not None:
            params[key] = value

    response = requests.get(endpoint, params=params)
    response.raise_for_status()
    print(response.url)

    if df:
        return pd.json_normalize(pd.DataFrame(response.json())['articles'])
    else:
        return response.json()

In [19]:
df = get_news(
    q=f"{'water+climate+change+river+drinking'}", 
    df=True, 
    from_date='2024-09-06', 
    to_date='2024-10-05', 
    language='en',
    page_size=100, 
    page=1
)

display(df)

https://newsapi.org/v2/everything?apiKey=935933a6b0ce488f86af5022580c4f62&pageSize=100&page=1&q=water%2Bclimate%2Bchange%2Briver%2Bdrinking&from=2024-09-06&to=2024-10-05&language=en


Unnamed: 0,author,title,description,url,urlToImage,publishedAt,content,source.id,source.name
0,Lauren Sommer,Hurricanes are dangerous far from the coast. C...,Coastal cities often bear the brunt of hurrica...,https://www.npr.org/2024/10/01/nx-s1-5133530/h...,https://npr.brightspotcdn.com/dims3/default/st...,2024-10-02T09:00:00Z,Hurricane Helenes destructive path tore across...,,NPR
1,,New tool to help decision-makers navigate poss...,The Colorado River is a vital source of water ...,https://www.sciencedaily.com/releases/2024/09/...,https://www.sciencedaily.com/images/scidaily-i...,2024-09-20T20:08:11Z,The Colorado River is a vital source of water ...,,Science Daily
2,Matthew Carroll,New tool to help decision makers navigate poss...,The Colorado River is a vital source of water ...,https://phys.org/news/2024-09-tool-decision-ma...,https://scx2.b-cdn.net/gfx/news/hires/2024/new...,2024-09-21T15:05:35Z,The Colorado River is a vital source of water ...,,Phys.Org
3,"Ariel Wittenberg, E&E News","Hurricanes Helene’s Floods Swamped a Hospital,...",Hurricane Helene forced dozens of medical faci...,https://subscriber.politicopro.com/article/een...,https://static.scientificamerican.com/dam/m/78...,2024-10-02T18:45:00Z,CLIMATEWIRE | A dramatic helicopter evacuation...,,Politicopro.com
4,Al Jazeera,Water levels in major Amazon tributary tumble ...,Climate change and below-average rainfall have...,https://www.aljazeera.com/gallery/2024/10/5/wa...,https://www.aljazeera.com/wp-content/uploads/2...,2024-10-05T01:49:16Z,Its one of the largest rivers in the world. An...,al-jazeera-english,Al Jazeera English
...,...,...,...,...,...,...,...,...,...
86,Dariel Pradas,A Cuban Town Improves Water Quality Through De...,"Overnight, hundreds of people in the rural com...",https://www.ipsnews.net/2024/09/cuban-town-imp...,https://www.ipsnews.net/Library/2024/09/Agua-1...,2024-09-09T15:44:14Z,"Editors' Choice, Featured, Headlines, Health, ...",,Inter Press Service
87,Khabarhub,Langtang National Park: Understanding climate ...,Langtang National Park — Nepal’s High Mountain...,https://english.khabarhub.com/2024/18/397905/,https://english.khabarhub.com/wp-content/uploa...,2024-09-18T06:15:48Z,Langtang National Park — Nepal’s High Mountain...,,Khabarhub.com
88,Oritro Karim,Typhoon Yagi Devastates Southeast Asia,"In early September, Typhoon Yagi, a deadly tro...",https://www.ipsnews.net/2024/09/typhoon-yagi-d...,https://www.ipsnews.net/Library/2024/09/The-af...,2024-09-19T09:57:08Z,"Asia-Pacific, Climate Change, Economy &amp; Tr...",,Inter Press Service
89,Dima Al-Khatib,A Better Tomorrow with South-South Cooperation,The annual United Nations Day for South-South ...,https://www.ipsnews.net/2024/09/better-tomorro...,https://www.ipsnews.net/Library/2024/09/A-Bett...,2024-09-12T06:17:13Z,"Civil Society, Development &amp; Aid, Economy ...",,Inter Press Service


In [22]:
# Function to parse the content of each article
def parse_article_content(url):
    print(url)
    try:
        response = requests.get(url, timeout=10)  # Set a timeout of 10 seconds
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(e)
        return None
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Extract the text from the article
    paragraphs = soup.find_all('p')
    article_text = ' '.join([para.get_text() for para in paragraphs])
    words = re.findall(r'\b\w+\b', article_text)
    return words

# Apply the function to each URL in the dataframe
df['parsed_content'] = df['url'].apply(parse_article_content)

https://www.npr.org/2024/10/01/nx-s1-5133530/hurricane-helene-rain-flooding-climate-change
https://www.sciencedaily.com/releases/2024/09/240920160811.htm
403 Client Error: Forbidden for url: https://www.sciencedaily.com/releases/2024/09/240920160811.htm
https://phys.org/news/2024-09-tool-decision-makers-futures-colorado.html
400 Client Error: Bad request for url: https://phys.org/news/2024-09-tool-decision-makers-futures-colorado.html
https://subscriber.politicopro.com/article/eenews/2024/10/02/a-dangerous-rescue-helene-made-a-hospital-almost-inescapable-00181734
https://www.aljazeera.com/gallery/2024/10/5/water-levels-in-major-amazon-tributary-tumble-to-record-lows-amid-drought
https://phys.org/news/2024-09-amazon-river-dries-hellish-villagers.html
400 Client Error: Bad request for url: https://phys.org/news/2024-09-amazon-river-dries-hellish-villagers.html
https://www.aljazeera.com/news/2024/9/19/storm-boris-floods-northern-italy-as-leaders-to-discuss-eu-aid-in-poland
https://phys.or

In [23]:
df = df.dropna(subset=['parsed_content'])
df = df.drop_duplicates(subset=['title'])
display(df['parsed_content'])

df.to_csv('data/water+climate+change+river+drinking_news.csv', index=False)

0     [Lauren, Sommer, Extreme, rain, is, becoming, ...
3                                                    []
4     [In, Pictures, It, s, one, of, the, largest, r...
6     [Italy, s, Emilia, Romagna, region, sees, thre...
8     [Yves, here, Yours, truly, has, been, under, r...
11    [Longer, Reads, provide, in, depth, analysis, ...
13    [Water, levels, in, many, of, the, rivers, in,...
15    [Read, The, Diplomat, Know, The, Asia, Pacific...
16    [Plus, Adam, Neumann, s, climate, company, is,...
19    [Print, Severe, flooding, in, Chad, since, Jul...
24    [Print, The, death, toll, in, the, aftermath, ...
25    [Only, the, youngest, and, strongest, villager...
27    [HENDERSONVILLE, N, C, AP, Hurricane, Helene, ...
28    [The, completion, of, a, new, 59, million, res...
30    [There, have, been, calls, for, the, Governmen...
31    [Most, Widely, Read, Newspaper, An, aerial, vi...
32    [Imperial, Oil, and, Sahtu, leaders, are, lock...
33    [Artificial, Intelligence, AI, AI, for, Ev

In [24]:
def clean_content(content):
    content = [word.lower() for word in content]
    stopwords_list = stopwords.words('english')
    names_list = [name.lower() for name in names.words()]
    content = [word for word in content if word not in stopwords_list and word not in names_list]
    content = [re.sub(r'[^a-z]', '', word) for word in content if len(word) > 2]
    content = word_tokenize(' '.join(content))
    return content

# Apply the cleaning function to the 'parsed_content' column
df['cleaned_content'] = df['parsed_content'].apply(clean_content)

display(df[['parsed_content', 'cleaned_content']])

Unnamed: 0,parsed_content,cleaned_content
0,"[Lauren, Sommer, Extreme, rain, is, becoming, ...","[sommer, extreme, rain, becoming, increasing, ..."
3,[],[]
4,"[In, Pictures, It, s, one, of, the, largest, r...","[pictures, one, largest, rivers, world, water,..."
6,"[Italy, s, Emilia, Romagna, region, sees, thre...","[italy, romagna, region, sees, three, rivers, ..."
8,"[Yves, here, Yours, truly, has, been, under, r...","[truly, reporting, links, number, severity, ba..."
11,"[Longer, Reads, provide, in, depth, analysis, ...","[longer, reads, provide, depth, analysis, idea..."
13,"[Water, levels, in, many, of, the, rivers, in,...","[water, levels, many, rivers, amazon, basin, r..."
15,"[Read, The, Diplomat, Know, The, Asia, Pacific...","[read, diplomat, know, pacific, following, typ..."
16,"[Plus, Adam, Neumann, s, climate, company, is,...","[plus, neumann, climate, company, issuing, cry..."
19,"[Print, Severe, flooding, in, Chad, since, Jul...","[print, severe, flooding, since, july, claimed..."


In [31]:
df_final = df[['cleaned_content']].copy()
df_final['drop_flag'] = df_final['cleaned_content'].apply(lambda x: 1 if len(x) < 10 else 0)
df_final = df_final[df_final['drop_flag'] == 0]
df_final = df_final.drop(columns=['drop_flag']).reset_index(drop=True)
df_final.to_csv('data/arm_data.csv', index=False)
df_final

Unnamed: 0,cleaned_content
0,"[sommer, extreme, rain, becoming, increasing, ..."
1,"[pictures, one, largest, rivers, world, water,..."
2,"[italy, romagna, region, sees, three, rivers, ..."
3,"[truly, reporting, links, number, severity, ba..."
4,"[longer, reads, provide, depth, analysis, idea..."
5,"[water, levels, many, rivers, amazon, basin, r..."
6,"[read, diplomat, know, pacific, following, typ..."
7,"[plus, neumann, climate, company, issuing, cry..."
8,"[print, severe, flooding, since, july, claimed..."
9,"[print, death, toll, aftermath, typhoon, vietn..."


In [10]:
import pandas as pd
import ast
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

df = pd.read_csv('data/arm_data.csv')

def ARM(chunk, min_support=0.1, min_threshold=0.5):
    # Convert the string representation of lists back into actual lists if needed
    chunk['transactions'] = chunk['cleaned_content'].apply(ast.literal_eval)

    # Prepare the transaction encoder
    te = TransactionEncoder()
    te_ary = te.fit(chunk['transactions']).transform(chunk['transactions'])
    df_chunk = pd.DataFrame(te_ary, columns=te.columns_)

    # Apply the Apriori algorithm to find frequent itemsets
    frequent_itemsets_chunk = apriori(df_chunk, min_support=min_support, use_colnames=True)

    # Generate association rules with a minimum confidence of min_threshold
    rules_chunk = association_rules(frequent_itemsets_chunk, metric="confidence", min_threshold=min_threshold)

    return frequent_itemsets_chunk, rules_chunk

frequent_itemsets, rules = ARM(df, min_support=0.5, min_threshold=0.8)
display(frequent_itemsets)
display(rules)

Unnamed: 0,support,itemsets
0,0.584906,(according)
1,0.603774,(across)
2,0.849057,(also)
3,0.509434,(area)
4,0.679245,(areas)
...,...,...
6741,0.528302,"(people, climate, water, change, river, years,..."
6742,0.509434,"(people, two, water, climate, change, river, y..."
6743,0.509434,"(people, national, climate, water, river, chan..."
6744,0.509434,"(people, two, climate, water, river, change, o..."


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(according),(also),0.584906,0.849057,0.509434,0.870968,1.025806,0.012816,1.169811,0.060606
1,(according),(change),0.584906,0.962264,0.566038,0.967742,1.005693,0.003204,1.169811,0.013636
2,(according),(climate),0.584906,0.943396,0.547170,0.935484,0.991613,-0.004628,0.877358,-0.019969
3,(according),(drinking),0.584906,0.924528,0.566038,0.967742,1.046741,0.025276,2.339623,0.107576
4,(according),(people),0.584906,0.905660,0.547170,0.935484,1.032930,0.017444,1.462264,0.076803
...,...,...,...,...,...,...,...,...,...,...
133044,"(years, since, river)","(people, climate, water, change, one, drinking)",0.584906,0.716981,0.509434,0.870968,1.214771,0.090068,2.193396,0.425926
133045,"(years, since, change)","(people, climate, water, river, one, drinking)",0.584906,0.698113,0.509434,0.870968,1.247602,0.101104,2.339623,0.478114
133046,"(years, one, since)","(people, climate, water, river, change, drinking)",0.547170,0.830189,0.509434,0.931034,1.121473,0.055180,2.462264,0.239198
133047,"(years, drinking, since)","(people, climate, water, river, change, one)",0.584906,0.698113,0.509434,0.870968,1.247602,0.101104,2.339623,0.478114
