In [None]:
import requests
from bs4 import BeautifulSoup

# Main sitemap URL
main_sitemap_url = "https://www.wired.com/sitemap.xml"

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as an empty sitemap.
res = requests.get(main_sitemap_url, timeout=10)
res.raise_for_status()
soup = BeautifulSoup(res.content, "xml")

# Extract all sitemap <loc> entries that point at a per-year sitemap page
all_sitemaps = [loc.text for loc in soup.find_all("loc") if "sitemap.xml?year=" in loc.text]

# Filter years 2014 to 2024. Match on the "year=" query parameter itself so that a
# year-like number elsewhere in the URL cannot select the wrong sitemap.
target_years = range(2014, 2025)
sitemaps_target = [url for url in all_sitemaps if any(f"year={y}" in url for y in target_years)]

print(f"Total yearly sitemap links: {len(sitemaps_target)}")


In [None]:
# Collect every "/story/" URL listed in the selected yearly sitemaps.
article_links = []

for sitemap_url in sitemaps_target:
    try:
        r = requests.get(sitemap_url, timeout=10)
        # Treat HTTP errors as failures instead of parsing the error page, which
        # would otherwise be reported as "Extracted 0 links".
        r.raise_for_status()
        xml_soup = BeautifulSoup(r.content, "xml")
        urls = [loc.text for loc in xml_soup.find_all("loc") if "/story/" in loc.text]
        article_links.extend(urls)
        print(f"Extracted {len(urls)} links from {sitemap_url}")
    except Exception as e:
        # Best effort: one unreachable sitemap should not abort the whole crawl.
        print(f"Failed: {sitemap_url} - {e}")

# Remove duplicates; sorting makes the saved CSV identical between runs
# (plain set iteration order is not stable across interpreter sessions).
article_links = sorted(set(article_links))

# Save for future use
import pandas as pd
df_links = pd.DataFrame(article_links, columns=["article_url"])
df_links.to_csv("wired_article_links_2014_2024.csv", index=False)
print(f"Saved {len(article_links)} article URLs")


In [None]:
pip install openai beautifulsoup4 requests pandas

In [None]:
#justify images
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import gc
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading

# Use current working directory instead of __file__
script_dir = os.getcwd()
os.chdir(script_dir)  # Optional: ensures relative paths behave consistently


# Load article URLs. The link-collection cell writes this file with a .csv
# extension, so it must be read back under exactly that name.
csv_file = os.path.join(script_dir, "wired_article_links_2014_2024.csv")
if not os.path.exists(csv_file):
    # Raise instead of calling exit(): exit() is an interactive-shell helper and
    # does not give a usable error message inside a notebook cell.
    raise FileNotFoundError(
        f"Cannot find {csv_file} "
        f"(current directory: {os.getcwd()}, script directory: {script_dir})"
    )

df = pd.read_csv(csv_file)
urls = df["article_url"].dropna().unique()
print(f"Loaded {len(urls)} unique URLs from CSV file")

# Load previous progress if exists.
# NOTE: both files are CSV data (written with to_csv / read with read_csv) despite
# their ".xlxs" extension; the names are kept because another cell reads the final file.
progress_file = os.path.join(script_dir, "temp_articles_with_images.xlxs")
final_output = os.path.join(script_dir, "wired_articles_with_images_only.xlxs")
checked_set = set()

# Check already processed articles. Both files are consulted so that URLs saved by an
# interrupted run are not fetched (and appended) a second time when a final file exists.
if os.path.exists(final_output):
    df_existing = pd.read_csv(final_output)
    checked_set.update(df_existing["article_url"])
    print(f"Found existing results: {len(checked_set)} URLs already processed.")
if os.path.exists(progress_file):
    df_prev = pd.read_csv(progress_file)
    checked_set.update(df_prev["article_url"])
    print(f"Resuming from previous session: {len(checked_set)} URLs already checked.")

# Function to check if an article contains an image
def article_contains_image(url):
    """Fetch ``url`` and report whether its <article> element contains an <img>.

    Args:
        url: Address of the article page to download.

    Returns:
        A ``(url, has_image)`` tuple. ``has_image`` is False when the page has no
        <article> element, when that element has no <img>, or when the request or
        parsing raises (the error is printed, not propagated).
    """
    try:
        # The context manager closes the response on every exit path, including the
        # early "no <article>" return and exceptions raised while parsing.
        with requests.get(url, timeout=10, headers={"User-Agent": "Mozilla/5.0"}) as response:
            soup = BeautifulSoup(response.text, "html.parser")
            article = soup.find("article")
            if article is None:
                return url, False
            # Only the first image matters; find() stops at the first match.
            # No forced gc.collect() here: locals are released when the function
            # returns, and a full collection per URL in every worker thread is costly.
            return url, article.find("img") is not None
    except Exception as e:
        print(f"Error processing {url}: {e}")
        return url, False

# Multithreading parameters
# MAX_WORKERS is the pool size handed to ThreadPoolExecutor in process_batch();
# BATCH_SIZE is the slice length used by the main loop, which appends to the
# progress file after each batch.
MAX_WORKERS = 20  # Number of concurrent threads
BATCH_SIZE = 100  # Number of URLs per batch
# NOTE(review): batch_results is not referenced anywhere else in the visible cells;
# it looks like a leftover - confirm before removing.
batch_results = []
# Held by the main loop while it appends a batch to the progress file.
results_lock = threading.Lock()  # Thread lock to protect shared resources

def process_batch(batch_urls, batch_start_index):
    """Check one batch of URLs for images using a thread pool.

    Args:
        batch_urls: URLs belonging to this batch.
        batch_start_index: Position of the batch's first URL within
            ``urls_to_process``; used only for the progress counter that is printed.

    Returns:
        A list of ``{"article_url": url}`` dicts, one per URL that
        ``article_contains_image`` reported as having an image, in completion order.
    """
    with_images = []

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        # Map every submitted future back to its URL and its global position.
        pending = {
            pool.submit(article_contains_image, link): (link, batch_start_index + offset)
            for offset, link in enumerate(batch_urls)
        }

        # Handle results as soon as each worker finishes.
        for done in as_completed(pending):
            link, position = pending[done]
            has_image = done.result()[1]
            if has_image:
                with_images.append({"article_url": link})
            print(f"[{position+1}/{len(urls_to_process)}] Processed: {link} - Has image: {has_image}")

    return with_images

# Filter URLs that haven't been processed yet
urls_to_process = [url for url in urls if url not in checked_set]
print(f"URLs to process: {len(urls_to_process)}")

# Ceiling division; computed once because it does not change inside the loop.
total_batches = (len(urls_to_process) + BATCH_SIZE - 1) // BATCH_SIZE

# Process URLs in batches
for i in range(0, len(urls_to_process), BATCH_SIZE):
    batch_urls = urls_to_process[i:i + BATCH_SIZE]
    batch_num = i // BATCH_SIZE + 1

    print(f"\nProcessing batch {batch_num}/{total_batches} ({len(batch_urls)} URLs)")

    # Process current batch with multithreading
    batch_start_index = i
    batch_results_local = process_batch(batch_urls, batch_start_index)

    # Save batch results
    if batch_results_local:
        with results_lock:
            # Append to the temporary progress file; mode "a" creates the file when
            # it is missing, and the header is written only on that first write.
            header = not os.path.exists(progress_file)

            df_batch = pd.DataFrame(batch_results_local)
            df_batch.to_csv(progress_file, mode='a', header=header, index=False)

            print(f"Saved {len(batch_results_local)} results from batch {batch_num}")

        # Force garbage collection
        gc.collect()

    # Short sleep between batches to avoid overloading
    time.sleep(1)

# After processing, fold the progress file into the final output.
if os.path.exists(progress_file):
    if os.path.exists(final_output):
        # The progress file only holds URLs found since the final file was written,
        # so replacing the final file with it would discard the earlier results.
        # Merge the two instead and drop any URL that appears in both.
        df_merged = pd.concat(
            [pd.read_csv(final_output), pd.read_csv(progress_file)],
            ignore_index=True,
        ).drop_duplicates(subset="article_url")
        df_merged.to_csv(final_output, index=False)
        os.remove(progress_file)
        print(f"Processing completed. Progress file merged into {final_output}")
    else:
        os.rename(progress_file, final_output)
        print(f"Processing completed. Progress file renamed to {final_output}")
elif not os.path.exists(final_output):
    print("No results found.")

# Only claim a saved result when the file is really there.
if os.path.exists(final_output):
    print(f"Finished. Results saved to {final_output}")

In [None]:
import pandas as pd

# Load structured results
# NOTE: the file is CSV data (read with read_csv) despite its ".xlxs" extension.
df = pd.read_csv("wired_ai_cs_articles_full_with_images.xlxs")

# Filter: at least 1 image, valid text, and valid date.
# Cast to str before using the .str accessor: pandas raises AttributeError on .str
# when a column was not read as strings (for example an all-empty column, which is
# read as float). Missing values are still excluded by the notna() checks.
has_text = df["text"].notna() & (df["text"].astype(str).str.strip() != "")
has_date = df["date"].notna() & (df["date"].astype(str).str.strip() != "")
df_filtered = df[(df["num_images"] > 0) & has_text & has_date].reset_index(drop=True)

# Save filtered results
df_filtered.to_csv("wired_ai_cs_articles_with_images_only.xlxs", index=False)
print(f"Saved {len(df_filtered)} articles with images to wired_ai_cs_articles_with_images_only.xlxs")


In [None]:
#use YOLO
import os
import pandas as pd
import requests
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

# === Step 1: Load Excel ===
# The spreadsheet location can be overridden with the WIRED_METADATA_XLSX environment
# variable so the notebook is not tied to one machine; the original absolute path
# stays as the default, so existing setups keep working unchanged.
input_path = os.environ.get(
    "WIRED_METADATA_XLSX",
    "/Users/dengqiuyue/Downloads/final project/scraping/final_merged_metadata_with_labels.xlsx",
)
df = pd.read_excel(input_path)
df["local_path"] = ""  # filled in with the saved file path (or "download_error") per row

# === Step 2: Prepare base folder ===
base_folder = "final project code/image1"
os.makedirs(base_folder, exist_ok=True)

# === Step 3: Turn an article URL into a string usable as a folder name
_FOLDER_NAME_TABLE = str.maketrans("/:", "__")


def safe_folder_name(article_url):
    """Return ``article_url`` as text with every "/" and ":" replaced by "_".

    The value is converted with ``str()`` first and surrounding whitespace is
    removed, so "https://a/b" becomes "https___a_b".
    """
    as_text = str(article_url)
    # Replacing the two characters one by one yields the same result as first
    # rewriting the "https://" / "http://" prefixes to "https___" / "http___".
    return as_text.translate(_FOLDER_NAME_TABLE).strip()

# === Step 4: Download function
def download_image(row):
    """Download the image described by ``row`` unless it is already on disk.

    Args:
        row: Mapping-like record providing "image_path" (the image URL),
            "article_id" (used for the sub-folder name) and "image_id"
            (used for the file name).

    Returns:
        The path of the saved ``<image_id>.jpg`` file, or the string
        "download_error" when anything goes wrong (the error is printed).
    """
    try:
        url = str(row["image_path"]).strip()
        folder_name = safe_folder_name(row["article_id"])
        image_id = str(row["image_id"]).replace("/", "_")

        folder = os.path.join(base_folder, folder_name)
        os.makedirs(folder, exist_ok=True)
        save_path = os.path.join(folder, f"{image_id}.jpg")

        if not os.path.exists(save_path):
            headers = {"User-Agent": "Mozilla/5.0"}
            # The context manager closes the response even when the status check fails.
            with requests.get(url, headers=headers, timeout=10) as r:
                r.raise_for_status()
                content = r.content

            # Write to a side file and move it into place afterwards. An interrupted
            # write then never leaves a truncated "<id>.jpg" behind that the
            # exists() check above would accept as a finished download.
            tmp_path = save_path + ".part"
            try:
                with open(tmp_path, "wb") as f:
                    f.write(content)
                os.replace(tmp_path, save_path)
            except Exception:
                if os.path.exists(tmp_path):
                    os.remove(tmp_path)
                raise

        return save_path
    except Exception as e:
        # Best effort: report the failure and mark the row instead of aborting the run.
        print(f"Download failed: {e}")
        return "download_error"

# === Step 5: Multithreaded execution ===
def process(index_row):
    """Download the image for one ``(index, row)`` pair.

    Args:
        index_row: Two-item pair of a row label and the row itself, as produced
            by ``DataFrame.iterrows()``.

    Returns:
        ``(index, path)`` where ``path`` is whatever ``download_image`` returned.
    """
    row_label = index_row[0]
    record = index_row[1]
    return row_label, download_image(record)

# Run the downloads on a pool of 20 threads; executor.map yields results in row
# order and tqdm shows how many have been consumed so far.
with ThreadPoolExecutor(max_workers=20) as executor:
    download_jobs = executor.map(process, df.iterrows())
    progress_bar = tqdm(download_jobs, total=len(df), desc="Downloading images")
    results = list(progress_bar)
    # Record where each image ended up (or "download_error") on its row.
    for idx, path in results:
        df.at[idx, "local_path"] = path

# === Step 6: Save updated Excel ===
output_path = "image1_metadata_with_local_paths.xlsx"
df.to_excel(output_path, index=False)
print(f"Saved: {output_path}")


In [None]:
#ai related
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
import os
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import logging

# Configuration
# The endpoint and credential are read from the environment so that a real key never
# has to be typed into (and saved with) the notebook. The previous literal values are
# kept as defaults, so behaviour is unchanged when the variables are not set.
base_url = os.environ.get("QWEN_BASE_URL", "")
api_key = os.environ.get("QWEN_API_KEY", "your api key")
model = "Qwen/Qwen3-8B"

# NOTE: all four files are CSV data (read_csv / to_csv) despite the ".xlxs" extension.
input_file = "wired_articles_with_images_only.xlxs"
output_file = "ai2_classification_qwen1.xlxs"
temp_file = "temp_qwen_batch.xlxs"
fail_log = "fail_log_qwen.xlxs"
batch_size = 10   # results are appended to temp_file every batch_size successes
max_workers = 2   # size of the thread pool used for classification

# Logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

if not base_url:
    # With an empty base URL every request is sent to "/chat/completions" and fails.
    logging.warning("base_url is empty; set QWEN_BASE_URL or edit the configuration cell.")

# Load input
df = pd.read_csv(input_file)
if "article_url" not in df.columns:
    raise ValueError("Input file must contain 'article_url' column")
urls = df["article_url"].dropna().unique().tolist()

# Load existing progress
done_urls = set()   # URLs that already have a classification
results = []        # classification records collected so far (old and new)

if os.path.exists(output_file):
    try:
        df_done = pd.read_csv(output_file)
        if "article_url" in df_done.columns:
            df_done = df_done.drop_duplicates(subset="article_url", keep="last")
            done_urls.update(df_done["article_url"])
            results.extend(df_done.to_dict("records"))
            logging.info(f"Loaded {len(df_done)} from output file.")
    except Exception as e:
        logging.warning(f"Failed to load output_file: {e}")

if os.path.exists(temp_file):
    try:
        df_temp = pd.read_csv(temp_file)
        if "article_url" in df_temp.columns:
            # The temp file is never cleared by the final save, so its rows are
            # normally already in the output file. Keep only rows that are new,
            # otherwise every re-run would duplicate them in `results`.
            df_temp = df_temp.drop_duplicates(subset="article_url", keep="last")
            df_temp = df_temp[~df_temp["article_url"].isin(done_urls)]
            done_urls.update(df_temp["article_url"])
            results.extend(df_temp.to_dict("records"))
            logging.info(f"Loaded {len(df_temp)} from temp file.")
    except Exception as e:
        logging.warning(f"Failed to load temp_file: {e}")

# Load previous failures
failure_records = []
if os.path.exists(fail_log):
    try:
        df_fail = pd.read_csv(fail_log)
        failure_records = df_fail.to_dict("records")
    except Exception as e:
        # Still best effort, but say so: an unreadable log means earlier failures
        # are not carried over into the new log.
        logging.warning(f"Failed to load fail_log: {e}")

# Extract text
def extract_text(url):
    """Download ``url`` and return the text of its paragraphs.

    The first of <article>, <main> or <body> that exists is used as the container.
    Paragraphs whose stripped text is 10 characters or shorter are skipped and the
    rest are joined with newlines.

    Args:
        url: Address of the article page.

    Returns:
        The joined paragraph text, or "" when the request fails, the server answers
        with an HTTP error status, or no container element is found. Failures are
        logged as warnings, not raised.
    """
    try:
        # The context manager closes the response on every path.
        with requests.get(url, timeout=10, headers={"User-Agent": "Mozilla/5.0"}) as res:
            # Without this check the body of a 403/404/429 page would be returned
            # and later classified as if it were the article.
            res.raise_for_status()
            html = res.text

        soup = BeautifulSoup(html, "html.parser")
        article = soup.find("article") or soup.find("main") or soup.find("body")
        if not article:
            return ""

        # Extract each paragraph's text once, then filter on its length.
        texts = (p.get_text(strip=True) for p in article.find_all("p"))
        return "\n".join(t for t in texts if len(t) > 10)
    except Exception as e:
        logging.warning(f"Failed to extract text from {url}: {e}")
        return ""

# AI classification
def is_ai_related(text):
    """Ask the chat-completions endpoint whether ``text`` is about AI.

    Args:
        text: Article text to classify.

    Returns:
        "no_text" when ``text`` is empty or shorter than 50 characters after
        stripping (no request is made); "true" or "false" according to the model's
        answer; "api_error" when the request fails, the reply is not JSON, or it
        has no "choices" entry.
    """
    if not text or len(text.strip()) < 50:
        return "no_text"

    prompt = f"""Only answer in this exact format:

JUDGMENT: [true/false]

Answer "true" if the article is in any way related to artificial intelligence (AI), including:
- AI technologies (e.g., machine learning, neural networks, natural language processing),
- AI-enabled applications (e.g., personalized systems, autonomous vehicles),
- AI companies, people, ethics, research, or market trends.

Answer "false" if the article is not related to AI.

Article text:
{text}"""

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
        "Accept": "application/json"
    }

    payload = json.dumps({
        "model": model,
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ]
    })

    # Bound before the try block so the error handler can tell "request never
    # returned" apart from "request returned something unusable".
    response = None
    try:
        response = requests.post(base_url + "/chat/completions", headers=headers, data=payload, timeout=30)
        if "application/json" not in response.headers.get("Content-Type", ""):
            raise ValueError("Non-JSON response")

        data = response.json()
        if "choices" not in data:
            logging.error(f"Unexpected response: {json.dumps(data, indent=2)}")
            return "api_error"

        full_response = data["choices"][0]["message"]["content"].strip()
        for line in full_response.split('\n'):
            # Tolerate indentation and markdown decoration around the label,
            # e.g. "  **JUDGMENT:** true".
            cleaned = line.strip().lstrip("*#> ").lower()
            if cleaned.startswith("judgment:"):
                # The prompt shows the answer as "[true/false]", so the reply may
                # echo the brackets ("JUDGMENT: [true]"); strip them before testing.
                judgment = cleaned.split(":", 1)[1].strip("*[]` \t")
                return "true" if judgment.startswith("true") else "false"

        # No JUDGMENT line at all: fall back to a bare leading "true".
        return "true" if full_response.lower().startswith("true") else "false"

    except Exception as e:
        logging.error(f"API ERROR for text length {len(text)}: {e}")
        if response is not None:
            logging.error(f"Response text: {response.text[:300]}")
        return "api_error"

# URL processing
def process_url(url):
    """Extract and classify one article.

    URLs already in ``done_urls`` are skipped. When no text can be extracted, or
    the classifier answers "api_error" or "no_text", a record with the reason
    ("no_text", "api_error" or "too_short") is appended to ``failure_records``.

    Returns:
        ``{"article_url", "is_ai_related", "text_length"}`` on success, otherwise None.
    """
    if url in done_urls:
        return None

    body = extract_text(url)
    if body:
        verdict = is_ai_related(body)
        # Classifier outcomes that count as failures, mapped to the logged reason.
        failure_reason = {"api_error": "api_error", "no_text": "too_short"}.get(verdict)
        if failure_reason is None:
            return {"article_url": url, "is_ai_related": verdict, "text_length": len(body)}
    else:
        failure_reason = "no_text"

    failure_records.append({"article_url": url, "reason": failure_reason})
    return None

# Multithreaded processing
new_results = []  # successes not yet appended to temp_file
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = {executor.submit(process_url, url): url for url in urls if url not in done_urls}
    for i, future in enumerate(tqdm(as_completed(futures), total=len(futures), desc="Processing")):
        try:
            result = future.result()
            if result:
                logging.info(f"[{i+1}] {result['is_ai_related'].upper()} <- {result['article_url']}")
                new_results.append(result)
        except Exception as e:
            logging.error(f"Thread error: {e}")

        # Checkpoint every batch_size successes so an interrupted run can resume.
        if len(new_results) >= batch_size:
            pd.DataFrame(new_results).to_csv(temp_file, mode='a', header=not os.path.exists(temp_file), index=False)
            results.extend(new_results)
            new_results = []
            time.sleep(1)

# Final flush
if new_results:
    pd.DataFrame(new_results).to_csv(temp_file, mode='a', header=not os.path.exists(temp_file), index=False)
    results.extend(new_results)

# Final save. One row per article: rows loaded from both the output file and the
# temp checkpoint would otherwise be written twice.
df_results = pd.DataFrame(results)
if "article_url" in df_results.columns:
    df_results = df_results.drop_duplicates(subset="article_url", keep="last")
df_results.to_csv(output_file, index=False)

# Everything in the checkpoint is now in output_file. Removing it keeps the next run
# from loading the same rows again; this is only reached if the save above succeeded.
if os.path.exists(temp_file):
    os.remove(temp_file)

if failure_records:
    df_failures = pd.DataFrame(failure_records)
    if "article_url" in df_failures.columns:
        # Failed URLs are retried on every run, so drop entries that have since
        # succeeded and keep only the latest reason for the rest.
        succeeded = set(df_results["article_url"]) if "article_url" in df_results.columns else set()
        df_failures = (
            df_failures[~df_failures["article_url"].isin(succeeded)]
            .drop_duplicates(subset="article_url", keep="last")
        )
    df_failures.to_csv(fail_log, index=False)

logging.info("All processing completed.")