In [2]:
import csv
import os
from time import sleep
from urllib.parse import unquote

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [3]:
def scrape_wikipedia_articles(article_titles, timeout=10.0, delay=0.1):
    """
    Scrapes a list of Wikipedia articles and records, for each one, which of the
    other listed articles it links to.

    Parameters:
    article_titles (list): A list of Wikipedia article titles to scrape. Titles may be
        percent-encoded (e.g. "Athletics_%28track_and_field%29") or use spaces.
    timeout (float): Seconds to wait for each HTTP request before giving up.
    delay (float): Seconds to pause after every request, to be polite to Wikipedia servers.

    Returns:
    dict: A dictionary where the keys are the titles that were fetched successfully
        (HTTP 200) and the values are lists of linked titles. Targets are spelled
        exactly as in article_titles, never include the source article itself, and
        are de-duplicated in order of first appearance. Titles whose request failed
        are reported on stdout and omitted from the dictionary.
    """
    def _canonical(name):
        # Compare on a decoded form: input titles may be percent-encoded while
        # hrefs on the page can carry the same characters literally (or vice versa).
        return unquote(name).replace(' ', '_')

    # Materialise once: the titles are walked twice (lookup table + crawl loop).
    article_titles = list(article_titles)

    # canonical form -> title as spelled by the caller; dict lookup is O(1) per link.
    known_titles = {}
    for known in article_titles:
        known_titles.setdefault(_canonical(known), known)

    wiki_prefix = "/wiki/"
    article_links = {}

    for title in tqdm(article_titles, desc="Scraping Wikipedia articles", unit="article"):
        url = f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}"
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            # One unreachable page must not throw away the rest of the crawl.
            print(f"Error scraping {title}: {exc}")
            response = None

        sleep(delay)  # Be polite to Wikipedia servers
        if response is None:
            continue

        if response.status_code != 200:
            print(f"Error scraping {title}: {response.status_code}")
            continue

        soup = BeautifulSoup(response.content, "html.parser")
        source_key = _canonical(title)
        targets = {}  # used as an insertion-ordered set
        for link in soup.find_all("a", href=lambda href: href and href.startswith(wiki_prefix)):
            # Drop the "/wiki/" prefix (not just the last path segment, so that
            # "/wiki/X/B" is not mistaken for "B") and any "#section" fragment.
            page = link.get("href")[len(wiki_prefix):].split('#', 1)[0]
            target_key = _canonical(page)
            if target_key != source_key and target_key in known_titles:
                targets.setdefault(known_titles[target_key], None)
        article_links[title] = list(targets)

    return article_links

def export_to_csv(article_links, output_file):
    """
    Exports the links between Wikipedia articles to a CSV file, one row per link.

    The file starts with the header row "linkSource,linkTarget". Sources whose
    target list is empty contribute no rows.

    Parameters:
    article_links (dict): A dictionary where the keys are the source article titles and the values are lists of linked article titles.
    output_file (str): The path to the output CSV file. An existing file is overwritten.
    """
    # Pin the encoding: the platform default is not UTF-8 everywhere, whereas
    # pandas.read_csv (used to load this file back) assumes UTF-8.
    with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["linkSource", "linkTarget"])
        writer.writerows(
            [source, target]
            for source, targets in article_links.items()
            for target in targets
        )

In [4]:
# Fetch and parse one article by hand (presumably a sanity check before the full crawl).
url = "https://en.wikipedia.org/wiki/Zulu"
response = requests.get(url, timeout=10)  # never wait indefinitely on a stalled connection

if response.status_code == 200:
    soup = BeautifulSoup(response.content, "html.parser")
else:
    # Report the failure instead of silently leaving `soup` undefined.
    print(f"Error scraping Zulu: {response.status_code}")

In [5]:
# Location of the Wikispeedia dump, relative to the notebook.
DATA_PATH = 'data/wikispeedia_paths-and-graph/'

# articles.tsv is read as a single unnamed column; lines starting with '#' are skipped.
articles_file = os.path.join(DATA_PATH, 'articles.tsv')
articles = pd.read_csv(
    articles_file,
    sep='\t',
    comment='#',
    names=['article'],
)

# NOTE: despite the name, `urls` holds article titles, not full URLs.
urls = list(articles['article'])

In [None]:
# Crawl every title loaded from articles.tsv (`urls` holds titles, not URLs).
# The result maps each title that answered HTTP 200 to the list of listed
# articles found among its page links; failures are printed and skipped.
# This is a long-running cell: the recorded progress bar shows roughly one
# article per second over 4604 articles.
article_links = scrape_wikipedia_articles(urls)

Scraping Wikipedia articles:   8%|▊         | 375/4604 [04:39<1:10:26,  1.00article/s]

Error scraping Athletics_%28track_and_field%29: 404


Scraping Wikipedia articles:  10%|█         | 461/4604 [06:11<1:05:36,  1.05article/s]

In [None]:
# Persist the scraped links as (linkSource, linkTarget) rows in links2024.csv,
# written relative to the current working directory; the comparison below reads it back.
export_to_csv(article_links, "links2024.csv")

In [None]:
# Reference link graph shipped with the dataset (the "2007" side of the comparison).
links = pd.read_csv(os.path.join(DATA_PATH, 'links.tsv'), sep='\t', comment='#', names=['linkSource', 'linkTarget'])

# Freshly scraped links written by export_to_csv.
links2024 = pd.read_csv('links2024.csv')

# Restrict the reference links to source articles that also appear in links2024.
links = links[links['linkSource'].isin(links2024['linkSource'])]

# Count the number of output links per source article in each table.
links_count = links.groupby(['linkSource']).size().reset_index(name='count')
links2024_count = links2024.groupby(['linkSource']).size().reset_index(name='count')

# Compare per article, aligning on the article title rather than on row position:
# the two tables need not contain the same set of sources, and a source missing
# from one side counts as 0 links there instead of shifting every later row.
count_reference = links_count.set_index('linkSource')['count']
count_2024 = links2024_count.set_index('linkSource')['count']
comparison = count_reference.sub(count_2024, fill_value=0).abs()

In [None]:
# Report the real number of scraped source articles instead of a hard-coded figure.
n_articles = links2024['linkSource'].nunique()

# Summed absolute per-article difference, as a percentage of the reference link count.
relative_diff = comparison.sum() / links.shape[0] * 100
print(f"Difference in number of links by article in total between 2007 and now (subset of {n_articles} articles): {relative_diff:.2f}%")
comparison.describe()