In [1]:
import requests
from bs4 import BeautifulSoup

def extract_title_and_text(url):
    """
    Fetches the page at `url` and returns (title, text) of the article.
    If something goes wrong, returns (None, None) or partial content.
    """
    try:
        resp = requests.get(url, headers={
            "User-Agent": "Mozilla/5.0 (compatible; scrapper/1.0)"})
        resp.raise_for_status()
    except Exception as e:
        print("Error fetching:", url, e)
        return None, None

    soup = BeautifulSoup(resp.text, "html.parser")

    title = None
    if soup.title:
        title = soup.title.get_text(strip=True)
    h1 = soup.find("h1")
    if h1:
        title = h1.get_text(strip=True)

    article = soup.find("article")
    if article is None:
        article = soup.find("div", {"role": "article"})
    if article is None:
        article = soup.body

    paragraphs = article.find_all("p")
    text_parts = []
    for p in paragraphs:
        txt = p.get_text(strip=True)
        if txt:
            text_parts.append(txt)

    full_text = "\n\n".join(text_parts)

    return title, full_text


if __name__ == "__main__":
    url = "https://www.yahoo.com/news/tearful-mother-begs-firefighters-let-170134229.html"
    t, text = extract_title_and_text(url)
    print("TITLE:", t)
    print("TEXT:", text)


TITLE: Tearful Mother Begs Firefighters to Let Her onto Scene of California Fireworks Explosion Where Her 3 Sons Worked: ‘I Want to Search for My Kids’
TEXT: Seven people remain missing after an explosion at a fireworks facility in Esparto, Calif., on July 1


“Let us do it,” a tearful Ramos toldToday

A mother who said her three children worked at theCalifornia fireworks facility that was the scene of an explosionis pleading with fire officials to let her help in the search for her loved ones.

Marisol Ramos recently toldTodayon Friday, July 4, that she has not heard from her three sons since the Tuesday, July 1, explosion in Esparto. Seven people were unaccounted for, according to officials.

Authorities said at the time they could not move forward with the search until the scene was safe due to the potential dangers.

Ramos, who immediately went over to the facility on the evening of the disaster, toldTodaythat she hasn’t received any answers since the incident.

“Let us do it,” a t

In [6]:
import pandas as pd
import time 
import tqdm


df = pd.read_csv("news_data/yahoo_strike_6m.csv")
df = df.drop(columns=["Title", "MobileURL"])

df["News_title"] = None
df["News_text"] = None

for  i in tqdm.tqdm(range(len(df))):
    time.sleep(0.5)  
    url = df.loc[i, "URL"]
    title, text = extract_title_and_text(url)
    df.at[i, "News_title"] = title
    df.at[i, "News_text"] = text



df.to_csv("extracted_news/yahoo_strike_6m.csv", index=False)

  0%|          | 1/202 [00:01<04:49,  1.44s/it]

Error fetching: https://www.yahoo.com/news/red-white-blue-festivities-kick-233931820.html 404 Client Error: Not Found for url: https://www.yahoo.com/news/red-white-blue-festivities-kick-233931820.html


  1%|          | 2/202 [00:03<05:28,  1.64s/it]

Error fetching: https://www.yahoo.com/news/private-contractors-offer-trash-removal-233836681.html 404 Client Error: Not Found for url: https://www.yahoo.com/news/private-contractors-offer-trash-removal-233836681.html


  6%|▌         | 12/202 [00:19<04:44,  1.50s/it]

Error fetching: https://www.yahoo.com/news/sixteen-dublin-flights-grounded-air-101743186.html 404 Client Error: Not Found for url: https://www.yahoo.com/news/sixteen-dublin-flights-grounded-air-101743186.html


 13%|█▎        | 27/202 [00:43<04:25,  1.52s/it]

Error fetching: https://www.yahoo.com/news/why-syria-plays-key-role-120028114.html 404 Client Error: Not Found for url: https://www.yahoo.com/news/why-syria-plays-key-role-120028114.html


 18%|█▊        | 36/202 [00:56<04:02,  1.46s/it]

Error fetching: https://www.yahoo.com/news/know-latest-push-gaza-truce-164210717.html 404 Client Error: Not Found for url: https://www.yahoo.com/news/know-latest-push-gaza-truce-164210717.html


 20%|██        | 41/202 [01:03<03:38,  1.36s/it]

Error fetching: https://www.yahoo.com/news/gaza-civil-defence-says-israeli-163055024.html 404 Client Error: Not Found for url: https://www.yahoo.com/news/gaza-civil-defence-says-israeli-163055024.html


 34%|███▎      | 68/202 [01:46<03:21,  1.51s/it]

Error fetching: https://www.yahoo.com/news/ex-ufc-fighter-godofredo-pepey-213318008.html 404 Client Error: Not Found for url: https://www.yahoo.com/news/ex-ufc-fighter-godofredo-pepey-213318008.html


 35%|███▍      | 70/202 [01:49<03:01,  1.37s/it]

Error fetching: https://www.yahoo.com/news/dem-ignites-showdown-declaring-gop-221426282.html 404 Client Error: Not Found for url: https://www.yahoo.com/news/dem-ignites-showdown-declaring-gop-221426282.html


 37%|███▋      | 75/202 [01:57<03:12,  1.51s/it]

Error fetching: https://www.yahoo.com/news/lightning-strikes-tree-causing-topple-223441816.html 404 Client Error: Not Found for url: https://www.yahoo.com/news/lightning-strikes-tree-causing-topple-223441816.html


 38%|███▊      | 77/202 [01:59<02:53,  1.39s/it]

Error fetching: https://www.yahoo.com/news/philly-medical-examiners-office-employees-232041780.html 404 Client Error: Not Found for url: https://www.yahoo.com/news/philly-medical-examiners-office-employees-232041780.html


 39%|███▉      | 79/202 [02:02<02:47,  1.36s/it]

Error fetching: https://www.yahoo.com/news/katy-family-lost-everything-lightning-001750892.html 404 Client Error: Not Found for url: https://www.yahoo.com/news/katy-family-lost-everything-lightning-001750892.html


 46%|████▌     | 92/202 [02:24<02:57,  1.61s/it]

Error fetching: https://www.yahoo.com/news/sudan-war-simple-guide-happening-080012115.html 404 Client Error: Not Found for url: https://www.yahoo.com/news/sudan-war-simple-guide-happening-080012115.html


 50%|█████     | 102/202 [02:41<02:44,  1.64s/it]

Error fetching: https://www.yahoo.com/news/kyiv-hit-barrage-drone-strikes-103604283.html 404 Client Error: Not Found for url: https://www.yahoo.com/news/kyiv-hit-barrage-drone-strikes-103604283.html


 51%|█████▏    | 104/202 [02:45<02:57,  1.81s/it]

Error fetching: https://www.yahoo.com/news/president-trump-honor-iran-strike-100453241.html 404 Client Error: Not Found for url: https://www.yahoo.com/news/president-trump-honor-iran-strike-100453241.html


 52%|█████▏    | 105/202 [02:46<02:31,  1.56s/it]

Error fetching: https://www.yahoo.com/news/woman-says-parents-accused-killing-124509759.html 404 Client Error: Not Found for url: https://www.yahoo.com/news/woman-says-parents-accused-killing-124509759.html


 53%|█████▎    | 108/202 [02:51<02:29,  1.59s/it]

Error fetching: https://www.yahoo.com/news/district-council-33-workers-struck-114253138.html 404 Client Error: Not Found for url: https://www.yahoo.com/news/district-council-33-workers-struck-114253138.html


 54%|█████▍    | 109/202 [02:52<02:27,  1.59s/it]

Error fetching: https://www.yahoo.com/news/philadelphia-strike-ll-cool-j-105507183.html 404 Client Error: Not Found for url: https://www.yahoo.com/news/philadelphia-strike-ll-cool-j-105507183.html


 55%|█████▍    | 111/202 [02:55<02:22,  1.56s/it]

Error fetching: https://www.yahoo.com/news/inside-america-6th-gen-arsenal-140044044.html 404 Client Error: Not Found for url: https://www.yahoo.com/news/inside-america-6th-gen-arsenal-140044044.html


 67%|██████▋   | 135/202 [03:38<01:53,  1.69s/it]

Error fetching: https://www.yahoo.com/news/3-people-hit-lightning-st-172309363.html 404 Client Error: Not Found for url: https://www.yahoo.com/news/3-people-hit-lightning-st-172309363.html


 73%|███████▎  | 148/202 [03:59<01:22,  1.53s/it]

Error fetching: https://www.yahoo.com/news/thousands-turn-celebrate-4th-july-035250625.html 404 Client Error: Not Found for url: https://www.yahoo.com/news/thousands-turn-celebrate-4th-july-035250625.html


 79%|███████▉  | 160/202 [04:20<01:09,  1.65s/it]

Error fetching: https://www.yahoo.com/news/subway-riders-deliver-street-justice-165720543.html 404 Client Error: Not Found for url: https://www.yahoo.com/news/subway-riders-deliver-street-justice-165720543.html


 81%|████████  | 164/202 [04:26<01:01,  1.62s/it]

Error fetching: https://www.yahoo.com/news/trump-using-madman-theory-try-230923920.html 404 Client Error: Not Found for url: https://www.yahoo.com/news/trump-using-madman-theory-try-230923920.html


 88%|████████▊ | 178/202 [04:51<00:39,  1.63s/it]

Error fetching: https://www.yahoo.com/news/iranian-supreme-leader-appears-public-130356927.html 404 Client Error: Not Found for url: https://www.yahoo.com/news/iranian-supreme-leader-appears-public-130356927.html


 91%|█████████ | 184/202 [05:00<00:27,  1.54s/it]

Error fetching: https://www.yahoo.com/news/hamas-security-officer-says-group-161421084.html 404 Client Error: Not Found for url: https://www.yahoo.com/news/hamas-security-officer-says-group-161421084.html


 94%|█████████▍| 190/202 [05:09<00:17,  1.42s/it]

Error fetching: https://www.yahoo.com/news/idf-says-killed-hamas-commander-201333016.html 404 Client Error: Not Found for url: https://www.yahoo.com/news/idf-says-killed-hamas-commander-201333016.html


 96%|█████████▌| 193/202 [05:14<00:14,  1.57s/it]

Error fetching: https://www.yahoo.com/news/dog-two-birds-die-lighting-171851067.html 404 Client Error: Not Found for url: https://www.yahoo.com/news/dog-two-birds-die-lighting-171851067.html


 97%|█████████▋| 195/202 [05:16<00:09,  1.35s/it]

Error fetching: https://www.yahoo.com/news/district-council-33-create-strike-174521846.html 404 Client Error: Not Found for url: https://www.yahoo.com/news/district-council-33-create-strike-174521846.html


 98%|█████████▊| 198/202 [05:22<00:06,  1.60s/it]

Error fetching: https://www.yahoo.com/news/where-trade-talks-stand-rush-010334804.html 404 Client Error: Not Found for url: https://www.yahoo.com/news/where-trade-talks-stand-rush-010334804.html


100%|██████████| 202/202 [05:29<00:00,  1.63s/it]


In [9]:
## read all the csv in extracted_news and remove rows with empty News_text and save back
import os
import pandas as pd
import glob
import tqdm
files = glob.glob("extracted_news/*.csv")
for file in files:
    df = pd.read_csv(file)
    df = df.dropna(subset=["News_text"])
    df.to_csv(file, index=False)
    print(f"Processed {file}, remaining rows: {len(df)}")

Processed extracted_news/yahoo_collapse_6m.csv, remaining rows: 172
Processed extracted_news/yahoo_strike_6m.csv, remaining rows: 174
Processed extracted_news/yahoo_emergency_6m.csv, remaining rows: 174
Processed extracted_news/yahoo_earthquake_6m.csv, remaining rows: 172
Processed extracted_news/yahoo_abrupt_6m_extracted.csv, remaining rows: 183
Processed extracted_news/yahoo_erupt_6m.csv, remaining rows: 38
Processed extracted_news/yahoo_unprecented_6m.csv, remaining rows: 166
Processed extracted_news/yahoo_sinkhole_6m.csv, remaining rows: 50
Processed extracted_news/yahoo_discovery_6m.csv, remaining rows: 175
Processed extracted_news/yahoo_crash_6m.csv, remaining rows: 174
Processed extracted_news/yahoo_outbreak_6m.csv, remaining rows: 172


In [11]:
#randomly sample 3 rows from each csv in extracted_news. combine and save to sampled_news
import os
import pandas as pd
import glob
import tqdm
files = glob.glob("extracted_news/*.csv")
sampled_dfs = []
for file in files:
    df = pd.read_csv(file)
    sampled_df = df.sample(n=3, random_state=42)  # sample 3 rows
    sampled_dfs.append(sampled_df)
combined_sampled_df = pd.concat(sampled_dfs, ignore_index=True)
combined_sampled_df.to_csv("combined_sampled.csv", index=False)