In [1]:
import requests
from bs4 import BeautifulSoup

# Main sitemap URL
main_sitemap_url = "https://www.wired.com/sitemap.xml"
res = requests.get(main_sitemap_url)
soup = BeautifulSoup(res.content, "xml")

# Extract all sitemap <loc> entries
all_sitemaps = [loc.text for loc in soup.find_all("loc") if "sitemap.xml?year=" in loc.text]

# Filter years 2014 to 2024
sitemaps_target = [url for url in all_sitemaps if any(str(y) in url for y in range(2014, 2025))]

print(f"Total yearly sitemap links: {len(sitemaps_target)}")


Total yearly sitemap links: 652


In [11]:
article_links = []

for sitemap_url in sitemaps_target:
    try:
        r = requests.get(sitemap_url, timeout=10)
        xml_soup = BeautifulSoup(r.content, "xml")
        urls = [loc.text for loc in xml_soup.find_all("loc") if "/story/" in loc.text]
        article_links.extend(urls)
        print(f"Extracted {len(urls)} links from {sitemap_url}")
    except Exception as e:
        print(f"Failed: {sitemap_url} - {e}")

# Remove duplicates
article_links = list(set(article_links))

# Save for future use
import pandas as pd
df_links = pd.DataFrame(article_links, columns=["article_url"])
df_links.to_csv("wired_article_links_2014_2024.csv", index=False)
print(f"Saved {len(article_links)} article URLs")


Extracted 13 links from https://www.wired.com/sitemap.xml?year=2024&month=12&week=5
Extracted 37 links from https://www.wired.com/sitemap.xml?year=2024&month=12&week=4
Extracted 60 links from https://www.wired.com/sitemap.xml?year=2024&month=12&week=3
Extracted 68 links from https://www.wired.com/sitemap.xml?year=2024&month=12&week=2
Extracted 114 links from https://www.wired.com/sitemap.xml?year=2024&month=12&week=1
Extracted 10 links from https://www.wired.com/sitemap.xml?year=2024&month=11&week=5
Extracted 54 links from https://www.wired.com/sitemap.xml?year=2024&month=11&week=4
Extracted 55 links from https://www.wired.com/sitemap.xml?year=2024&month=11&week=3
Extracted 51 links from https://www.wired.com/sitemap.xml?year=2024&month=11&week=2
Extracted 73 links from https://www.wired.com/sitemap.xml?year=2024&month=11&week=1
Extracted 33 links from https://www.wired.com/sitemap.xml?year=2024&month=10&week=5
Extracted 46 links from https://www.wired.com/sitemap.xml?year=2024&month=1

In [34]:
import pandas as pd

# read CSV
df = pd.read_csv("wired_article_links_2014_2024.csv")

# demonstrate pre rows data to sure forms
print(df.head())

#total numbers
article_count = len(df)

print(f"Total number of articles: {article_count}")


                                         article_url
0  https://www.wired.com/story/you-need-to-update...
1  https://www.wired.com/story/barclays-finger-sc...
2  https://www.wired.com/story/hey-ev-owners-itd-...
3  https://www.wired.com/story/pretty-good-phone-...
4  https://www.wired.com/story/mastodon-social-ne...
Total number of articles: 42712


In [44]:
pip install openai beautifulsoup4 requests pandas

Note: you may need to restart the kernel to use updated packages.


In [None]:
#test
import pandas as pd
import requests
from bs4 import BeautifulSoup
from openai import OpenAI
import time

# original DashScope 
client = OpenAI(
    api_key="sk-6e0150cc451f495fbc336fc688e0de9a",  # DashScope API Key
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
)

# add url
df = pd.read_csv("wired_article_links_2014_2024.csv")
urls = df["article_url"].dropna().unique()

# extract text
def extract_text(url):
    try:
        res = requests.get(url, timeout=10, headers={"User-Agent": "Mozilla/5.0"})
        soup = BeautifulSoup(res.text, "html.parser")
        article = soup.find("article")
        if article:
            paragraphs = article.find_all("p")
            text = "\n".join(p.get_text(strip=True) for p in paragraphs if len(p.get_text(strip=True)) > 10)
            return text
    except:
        return ""
    return ""

# justify if it is ai topic
def is_ai_related(text):
    if not text or len(text) < 100:
        return False  # skip some short and black content

    prompt = f"""Please answer with "yes" or "no" only. Is the following article about artificial intelligence or computing?

{text}"""

    try:
        completion = client.chat.completions.create(
            model="qwen-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ]
        )
        answer = completion.choices[0].message.content.strip().lower()
        return answer.startswith("yes")
    except Exception as e:
        print(f"Error checking AI relevance: {e}")
        return False

# scramp url and mark if it is related
results = []
for i, url in enumerate(urls):
    print(f"{i+1}/{len(urls)} Fetching: {url}")
    text = extract_text(url)
    ai_related = is_ai_related(text)
    results.append({"article_url": url, "is_ai_related": ai_related})
    time.sleep(1)  # avoiad frequency

# save results
df_result = pd.DataFrame(results)
df_result.to_csv("ai_verified_articles_dashscope.csv", index=False)
print("Saved to ai_verified_articles_dashscope.csv")


1/42712 Fetching: https://www.wired.com/story/you-need-to-update-windows-and-chrome-right-now/
2/42712 Fetching: https://www.wired.com/story/barclays-finger-scanner/
3/42712 Fetching: https://www.wired.com/story/hey-ev-owners-itd-take-a-fraction-of-you-to-prop-up-the-grid/
4/42712 Fetching: https://www.wired.com/story/pretty-good-phone-privacy-android/
5/42712 Fetching: https://www.wired.com/story/mastodon-social-network-what-how-create-account/
6/42712 Fetching: https://www.wired.com/story/california-wildfires-photo-gallery/
7/42712 Fetching: https://www.wired.com/story/raspberry-pi-5-million/
8/42712 Fetching: https://www.wired.com/story/facebook-definers-george-soros-qatar-apple/
9/42712 Fetching: https://www.wired.com/story/space-photos-of-the-week-x-rays-binary-stars-mars-moles/
10/42712 Fetching: https://www.wired.com/story/nyc-l-train-shutdown-plan/
11/42712 Fetching: https://www.wired.com/story/bo-play-m5-wireless-speaker/
12/42712 Fetching: https://www.wired.com/story/hunt-roc

In [4]:
#test 2
import pandas as pd
import requests
from bs4 import BeautifulSoup
from openai import OpenAI
import time
import os
import csv

# Initialize DashScope client
client = OpenAI(
    api_key="sk-6e0150cc451f495fbc336fc688e0de9a",  # Replace with your DashScope key
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
)

input_file = "wired_article_links_2014_2024.csv"
output_file = "ai_verified_articles_dashscope.csv"
error_file = "ai_classification_errors.csv"
log_file = "ai_progress_log.txt"

# Load all URLs
df_all = pd.read_csv(input_file)
all_urls = df_all["article_url"].dropna().unique()

# Load completed results
done_urls = set()
if os.path.exists(output_file):
    df_done = pd.read_csv(output_file)
    done_urls = set(df_done["article_url"].dropna())

# Load error records to avoid retrying bad URLs
error_urls = set()
if os.path.exists(error_file):
    df_error = pd.read_csv(error_file)
    error_urls = set(df_error["article_url"].dropna())

# Filter remaining URLs
urls_remaining = [u for u in all_urls if u not in done_urls and u not in error_urls]
print("Already completed:", len(done_urls), "| Remaining:", len(urls_remaining))

# Create headers if output file does not exist
if not os.path.exists(output_file):
    with open(output_file, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["article_url", "is_ai_related"])
        writer.writeheader()

# Create headers if error file does not exist
if not os.path.exists(error_file):
    with open(error_file, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["article_url", "error"])
        writer.writeheader()

# Function to extract article text
def extract_text(url):
    try:
        res = requests.get(url, timeout=10, headers={"User-Agent": "Mozilla/5.0"})
        soup = BeautifulSoup(res.text, "html.parser")
        article = soup.find("article")
        if article:
            paragraphs = article.find_all("p")
            text = "\n".join(p.get_text(strip=True) for p in paragraphs if len(p.get_text(strip=True)) > 10)
            return text
    except:
        return ""
    return ""

# Function to classify using Qwen
def is_ai_related(text):
    if not text or len(text) < 100:
        return False
    prompt = f"""Please answer with "yes" or "no" only. Is the following article about artificial intelligence?\n\n{text}"""
    completion = client.chat.completions.create(
        model="qwen-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ]
    )
    reply = completion.choices[0].message.content.strip().lower()
    return reply.startswith("yes")

# Loop through all remaining articles
for idx, url in enumerate(urls_remaining, start=len(done_urls) + 1):
    print(f"{idx}/{len(all_urls)} Fetching: {url}")
    success = False
    for attempt in range(3):  # Retry up to 3 times
        try:
            text = extract_text(url)
            ai_related = is_ai_related(text)
            with open(output_file, "a", newline="", encoding="utf-8") as f:
                writer = csv.DictWriter(f, fieldnames=["article_url", "is_ai_related"])
                writer.writerow({"article_url": url, "is_ai_related": ai_related})
            with open(log_file, "a", encoding="utf-8") as log:
                log.write(f"{idx}/{len(all_urls)} {url}\n")
            success = True
            break
        except Exception as e:
            print("Retrying after error:", e)
            time.sleep(3)

    if not success:
        with open(error_file, "a", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=["article_url", "error"])
            writer.writerow({"article_url": url, "error": "Failed after 3 attempts"})

    time.sleep(1)


Already completed: 152 | Remaining: 42559
153/42712 Fetching: https://www.wired.com/story/opinion-the-coronavirus-is-democratizing-knowledge/
154/42712 Fetching: https://www.wired.com/story/mars-moons-origins-phobos-deimos/
155/42712 Fetching: https://www.wired.com/story/harry-potter-cursed-child-final-story-rowling/
156/42712 Fetching: https://www.wired.com/story/artificial-intelligence-cancer-detection/
157/42712 Fetching: https://www.wired.com/story/espn-bet-future-disney/
158/42712 Fetching: https://www.wired.com/story/spotify-ai-dj/
159/42712 Fetching: https://www.wired.com/story/what-boston-dynamics-rolling-handle-robot-really-means/
160/42712 Fetching: https://www.wired.com/story/its-time-for-innovators-to-take-responsibility-for-their-creations/
161/42712 Fetching: https://www.wired.com/story/wired-awake-14-august/
162/42712 Fetching: https://www.wired.com/story/how-to-win-a-hot-wheels-derby-on-a-moving-treadmill/
163/42712 Fetching: https://www.wired.com/story/no-more-dlcs/
16

KeyboardInterrupt: 

In [14]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
import time
import os

# API setting
base_url = ""
api_key = "your keys"
model = "gemini-2.5-pro"

# input/output
input_excel = "/Users/dengqiuyue/Downloads/final project/test.xlsx"
output_excel = "ai_classification_result.xlsx"

# read Excel
df = pd.read_excel(input_excel)
if "article_url" not in df.columns:
    raise ValueError("Excel must contain an 'article_url' column")

# load
results = []
done_urls = set()
if os.path.exists(output_excel):
    df_saved = pd.read_excel(output_excel)
    done_urls = set(df_saved["article_url"])
    results = df_saved.to_dict("records")

# abstract text function
def extract_text(url):
    try:
        res = requests.get(url, timeout=10, headers={"User-Agent": "Mozilla/5.0"})
        soup = BeautifulSoup(res.text, "html.parser")
        article = soup.find("article")
        if article:
            ps = article.find_all("p")
            return "\n".join(p.get_text(strip=True) for p in ps if len(p.get_text(strip=True)) > 10)
    except:
        return ""
    return ""

# justify if ai related
def is_ai_related(text):
    if not text or len(text.strip()) < 50:
        return "no"
    prompt = f"""Please answer with only "yes" or "no". Is the following article related to artificial intelligence?\n\n{text}"""
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ]
    }
    try:
        r = requests.post(f"{base_url}/chat/completions", headers=headers, data=json.dumps(payload), timeout=20)
        reply = r.json()["choices"][0]["message"]["content"].strip().lower()
        return "yes" if reply.startswith("yes") else "no"
    except:
        return "no"

# circle process
for i, row in df.iterrows():
    url = row["article_url"]
    if url in done_urls:
        continue

    print(f"{i+1}/{len(df)} Fetching: {url}")
    text = extract_text(url)
    label = is_ai_related(text)

    results.append({"article_url": url, "is_ai_related": label})
    pd.DataFrame(results).to_excel(output_excel, index=False)
    time.sleep(1)



1/27 Fetching: https://www.wired.com/story/you-need-to-update-windows-and-chrome-right-now/
2/27 Fetching: https://www.wired.com/story/barclays-finger-scanner/
3/27 Fetching: https://www.wired.com/story/hey-ev-owners-itd-take-a-fraction-of-you-to-prop-up-the-grid/
4/27 Fetching: https://www.wired.com/story/pretty-good-phone-privacy-android/
5/27 Fetching: https://www.wired.com/story/mastodon-social-network-what-how-create-account/
6/27 Fetching: https://www.wired.com/story/california-wildfires-photo-gallery/
7/27 Fetching: https://www.wired.com/story/raspberry-pi-5-million/
8/27 Fetching: https://www.wired.com/story/facebook-definers-george-soros-qatar-apple/
9/27 Fetching: https://www.wired.com/story/space-photos-of-the-week-x-rays-binary-stars-mars-moles/
10/27 Fetching: https://www.wired.com/story/nyc-l-train-shutdown-plan/
11/27 Fetching: https://www.wired.com/story/bo-play-m5-wireless-speaker/
12/27 Fetching: https://www.wired.com/story/hunt-rocket-boosters/
13/27 Fetching: https:

In [None]:
#sucess
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import gc
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading

# Use current working directory instead of __file__
script_dir = os.getcwd()
os.chdir(script_dir)  # Optional: ensures relative paths behave consistently


# Load article URLs
csv_file = os.path.join(script_dir, "wired_article_links_2014_2024.csv")
if not os.path.exists(csv_file):
    print(f"Error: Cannot find {csv_file}")
    print(f"Current directory: {os.getcwd()}")
    print(f"Script directory: {script_dir}")
    exit(1)

df = pd.read_csv(csv_file)
urls = df["article_url"].dropna().unique()
print(f"Loaded {len(urls)} unique URLs from CSV file")

# Load previous progress if exists
progress_file = os.path.join(script_dir, "temp_articles_with_images.csv")
final_output = os.path.join(script_dir, "wired_articles_with_images_only.csv")
checked_set = set()

# Check already processed articles
if os.path.exists(final_output):
    df_existing = pd.read_csv(final_output)
    checked_set = set(df_existing["article_url"])
    print(f"Found existing results: {len(checked_set)} URLs already processed.")
elif os.path.exists(progress_file):
    df_prev = pd.read_csv(progress_file)
    checked_set = set(df_prev["article_url"])
    print(f"Resuming from previous session: {len(checked_set)} URLs already checked.")

# Function to check if an article contains an image
def article_contains_image(url):
    try:
        response = requests.get(url, timeout=10, headers={"User-Agent": "Mozilla/5.0"})
        soup = BeautifulSoup(response.text, "html.parser")
        article = soup.find("article")
        if not article:
            return url, False
        # Only check for the first image, return True immediately if found
        image = article.find("img")
        # Explicitly close the response and release memory
        response.close()
        del soup, response
        gc.collect()  # Force garbage collection
        return url, image is not None
    except Exception as e:
        print(f"Error processing {url}: {e}")
        return url, False

# Multithreading parameters
MAX_WORKERS = 20  # Number of concurrent threads
BATCH_SIZE = 100  # Number of URLs per batch
batch_results = []
results_lock = threading.Lock()  # Thread lock to protect shared resources

def process_batch(batch_urls, batch_start_index):
    """Function to process a batch of URLs"""
    batch_results_local = []
    
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Submit all tasks, with index tracking
        future_to_info = {}
        for idx, url in enumerate(batch_urls):
            global_idx = batch_start_index + idx
            future = executor.submit(article_contains_image, url)
            future_to_info[future] = (url, global_idx)
        
        # Collect results
        for future in as_completed(future_to_info):
            url, global_idx = future_to_info[future]
            _, has_image = future.result()
            if has_image:
                batch_results_local.append({"article_url": url})
            print(f"[{global_idx+1}/{len(urls_to_process)}] Processed: {url} - Has image: {has_image}")
    
    return batch_results_local

# Filter URLs that haven't been processed yet
urls_to_process = [url for url in urls if url not in checked_set]
print(f"URLs to process: {len(urls_to_process)}")

# Process URLs in batches
for i in range(0, len(urls_to_process), BATCH_SIZE):
    batch_urls = urls_to_process[i:i + BATCH_SIZE]
    batch_num = i // BATCH_SIZE + 1
    total_batches = (len(urls_to_process) + BATCH_SIZE - 1) // BATCH_SIZE
    
    print(f"\nProcessing batch {batch_num}/{total_batches} ({len(batch_urls)} URLs)")
    
    # Process current batch with multithreading
    batch_start_index = i
    batch_results_local = process_batch(batch_urls, batch_start_index)
    
    # Save batch results
    if batch_results_local:
        with results_lock:
            # Save to temporary progress file
            mode = 'a' if os.path.exists(progress_file) else 'w'
            header = not os.path.exists(progress_file)
            
            df_batch = pd.DataFrame(batch_results_local)
            df_batch.to_csv(progress_file, mode=mode, header=header, index=False)
            
            print(f"Saved {len(batch_results_local)} results from batch {batch_num}")
            
        # Force garbage collection
        gc.collect()
    
    # Short sleep between batches to avoid overloading
    time.sleep(1)

# After processing, rename the temp file to final output
if os.path.exists(progress_file):
    if os.path.exists(final_output):
        os.remove(final_output)  # Delete old final file
    os.rename(progress_file, final_output)
    print(f"Processing completed. Progress file renamed to {final_output}")
else:
    print("No results found.")

print(f"Finished. Results saved to {final_output}")


Loaded 42712 unique URLs from CSV file
URLs to process: 42712

Processing batch 1/428 (100 URLs)
[9/42712] Processed: https://www.wired.com/story/space-photos-of-the-week-x-rays-binary-stars-mars-moles/ - Has image: True
[15/42712] Processed: https://www.wired.com/story/pewdiepie-net-worth-antisemitic-video-disney/ - Has image: True
[3/42712] Processed: https://www.wired.com/story/hey-ev-owners-itd-take-a-fraction-of-you-to-prop-up-the-grid/ - Has image: True
[7/42712] Processed: https://www.wired.com/story/raspberry-pi-5-million/ - Has image: False
[18/42712] Processed: https://www.wired.com/story/mama-pacha-chocolate-oaxaca/ - Has image: True
[11/42712] Processed: https://www.wired.com/story/bo-play-m5-wireless-speaker/ - Has image: False
[2/42712] Processed: https://www.wired.com/story/barclays-finger-scanner/ - Has image: False
[16/42712] Processed: https://www.wired.com/story/nasa-will-send-a-helicopter-to-hunt-for-life-on-saturns-moon-titan/ - Has image: True
[8/42712] Processed:

In [55]:
import pandas as pd

# Load structured article data
df = pd.read_csv("wired_ai_cs_articles_full_with_images.csv")

# condition,at least one pictre
df_with_images = df[df["num_images"] > 0]

# output
print(f"Total articles with images: {len(df_with_images)}")

# Optional: Display the first few URL examples
print("\nSample article URLs with images:")
print(df_with_images["url"].dropna().head(5).to_string(index=False))



Total articles with images: 6856

Sample article URLs with images:
https://www.wired.com/story/tesla-federal-tax-c...
https://www.wired.com/story/chatgpt-plugins-ope...
https://www.wired.com/story/tesla-model-3-more-...
https://www.wired.com/story/bitcoin-fog-roman-s...
https://www.wired.com/story/facebook-new-ai-tea...


In [59]:
import pandas as pd

# Load structured results
df = pd.read_csv("wired_ai_cs_articles_full_with_images.csv")

# Filter: at least 1 image, valid text, and valid date
df_filtered = df[
    (df["num_images"] > 0) &
    (df["text"].notnull()) & (df["text"].str.strip() != "") &
    (df["date"].notnull()) & (df["date"].str.strip() != "")
].reset_index(drop=True)

# Save filtered results
df_filtered.to_csv("wired_ai_cs_articles_with_images_only.csv", index=False)
print(f"Saved {len(df_filtered)} articles with images to wired_ai_cs_articles_with_images_only.csv")


Saved 6856 articles with images to wired_ai_cs_articles_with_images_only.csv
