In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
from tqdm import tqdm
import numpy as np
import os
from time import sleep

In [2]:
def scrape_wikipedia_articles(article_titles, timeout=10):
    """
    Scrapes a list of Wikipedia articles and collects, for each one, the links
    that point to other articles of the same list.

    Parameters:
    article_titles (list): Wikipedia article titles to scrape. Spaces are replaced
        by underscores when the URL is built; titles are otherwise used as given.
    timeout (float): Seconds to wait for each HTTP response before giving up on
        that article.

    Returns:
    tuple: (article_links, article_names) where
        - article_links (dict) maps every successfully scraped title to the list of
          other titles from article_titles that the page links to. There is one
          entry per matching anchor, so a target may appear several times.
        - article_names (list) holds the successfully scraped titles in input order.

    Articles that cannot be fetched (network error or non-200 status) are reported
    with print() and left out of both return values.
    """
    article_links = {}
    article_names = []
    # Set membership is O(1); testing every anchor against the list was O(n).
    known_titles = set(article_titles)

    for title in tqdm(article_titles, desc="Scraping Wikipedia articles", unit="article"):
        url = f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}"
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as error:
            # One unreachable article must not abort the whole (long) run.
            print(f"Error scraping {title}: {error}")
            sleep(0.1)
            continue

        sleep(0.1) # Be polite to Wikipedia servers
        if response.status_code != 200:
            print(f"Error scraping {title}: {response.status_code}")
            continue

        soup = BeautifulSoup(response.content, "html.parser")
        targets = []

        # Keep the anchors whose last path segment is another requested title.
        # NOTE(review): the segment is compared verbatim, so hrefs carrying a
        # '#fragment' or a different percent-encoding than the title never match.
        for link in soup.find_all("a", href=lambda href: href and href.startswith("/wiki/")):
            for href in link.get("href").splitlines():
                target = href.split('/')[-1]
                if target in known_titles and target != title:
                    targets.append(target)

        article_links[title] = targets
        article_names.append(title)

    return article_links, article_names


def write_to_csv(data, filename):
    """
    Writes rows to a tab-separated UTF-8 file, preceded by a fixed comment header.

    Parameters:
    data (iterable): Rows to write after the header; each row is an iterable of fields.
    filename (str): The path to the output file (overwritten if it exists).
    """
    # Each header entry is written as its own single-field row.
    header_comments = (
        '# Hierarchical categories of all articles.',
        '# Many articles have more than one category. Some articles have no category.',
        '# Article names are URL-encoded; e.g., in Java they can be decoded using java.net.URLDecoder.decode(articleName, "UTF-8").',
        '# FORMAT:   article   category',
        '#',
    )

    with open(filename, 'w', newline='', encoding='utf-8') as out:
        tsv = csv.writer(out, delimiter='\t')
        tsv.writerows([comment] for comment in header_comments)
        tsv.writerows(data)


def export_links_to_csv(article_links, output_file):
    """
    Exports the links between Wikipedia articles to a CSV file.

    The file starts with the header row "linkSource,linkTarget" followed by one
    row per (source, target) pair. Sources with an empty target list produce no rows.

    Parameters:
    article_links (dict): A dictionary where the keys are the source article titles
        and the values are lists of the article titles they link to.
    output_file (str): The path to the output CSV file (overwritten if it exists).
    """
    # Explicit UTF-8 so the output does not depend on the platform default
    # encoding and matches what pd.read_csv expects when reading it back.
    with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["linkSource", "linkTarget"])
        writer.writerows(
            [source, target]
            for source, targets in article_links.items()
            for target in targets
        )

def export_articles_to_csv(articles, output_file):
    """
    Exports a list of Wikipedia article titles to a single-column CSV file.

    The file starts with the header row "article" followed by one row per title.

    Parameters:
    articles (iterable): The article titles to write, one per row.
    output_file (str): The path to the output CSV file (overwritten if it exists).
    """
    # Explicit UTF-8 so the output does not depend on the platform default
    # encoding and matches what pd.read_csv expects when reading it back.
    with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["article"])
        writer.writerows([article] for article in articles)



# def export_categories_to_csv(article_categories, output_file):
#     """
#     Exports the direct links to the Wikipedia articles to a CSV file.
    
#     Parameters:
#     article_links (dict): A dictionary where the keys are the article titles and the values are the direct links to those articles.
#     output_file (str): The path to the output CSV file.
#     """
#     with open(output_file, "w", newline="") as csvfile:
#         writer = csv.writer(csvfile)
#         writer.writerow(["article", "category"])
        
#         for article, category in article_categories.items():
#             writer.writerow([article, category])
        

In [3]:
# Fetch and parse one article page.
# NOTE(review): presumably a manual sanity check; `soup` is not used by the
# later cells visible here — confirm before removing.
url = "https://en.wikipedia.org/wiki/Zulu"
response = requests.get(url, timeout=10)  # never wait forever on a stalled connection

if response.status_code == 200:
    soup = BeautifulSoup(response.content, "html.parser")
else:
    # Report the failure instead of silently leaving `soup` undefined.
    print(f"Error scraping Zulu: {response.status_code}")

In [5]:
# Directory holding the 2007 data files.
DATA_PATH = 'data/2007/'

# Tab-separated file; lines starting with '#' are skipped as comments and the
# single column is named explicitly.
articles = pd.read_csv(
    os.path.join(DATA_PATH, 'articles.tsv'),
    sep='\t',
    comment='#',
    names=['article'],
)

# Plain Python list of the article titles, used as input of the scraper.
urls = articles['article'].to_list()

In [6]:
# Full scrape of every title in `urls`. In the recorded run this took roughly
# 58 minutes for 4604 titles and reported a 404 for 11 of them.
article_links, article_names = scrape_wikipedia_articles(urls)

Scraping Wikipedia articles:   8%|▊         | 375/4604 [04:31<55:49,  1.26article/s]  

Error scraping Athletics_%28track_and_field%29: 404


Scraping Wikipedia articles:  13%|█▎        | 588/4604 [07:17<53:14,  1.26article/s]  

Error scraping Bionicle__Mask_of_Light: 404


Scraping Wikipedia articles:  26%|██▋       | 1211/4604 [15:13<40:52,  1.38article/s]  

Error scraping Directdebit: 404


Scraping Wikipedia articles:  35%|███▍      | 1601/4604 [20:10<38:57,  1.28article/s]  

Error scraping Friend_Directdebit: 404


Scraping Wikipedia articles:  35%|███▌      | 1628/4604 [20:25<32:53,  1.51article/s]

Error scraping Gallery_of_the_Kings_and_Queens_of_England: 404


Scraping Wikipedia articles:  65%|██████▍   | 2970/4604 [37:35<21:53,  1.24article/s]  

Error scraping Newshounds: 404


Scraping Wikipedia articles:  84%|████████▎ | 3850/4604 [48:22<08:08,  1.54article/s]  

Error scraping Sponsorship_Directdebit: 404


Scraping Wikipedia articles:  84%|████████▍ | 3879/4604 [48:45<12:01,  1.00article/s]

Error scraping Star_Wars_Episode_IV__A_New_Hope: 404


Scraping Wikipedia articles:  97%|█████████▋| 4481/4604 [56:15<01:47,  1.14article/s]

Error scraping Wikipedia_Text_of_the_GNU_Free_Documentation_License: 404


Scraping Wikipedia articles:  99%|█████████▊| 4546/4604 [56:59<00:38,  1.49article/s]

Error scraping Wowpurchase: 404


Scraping Wikipedia articles:  99%|█████████▉| 4553/4604 [57:04<00:35,  1.45article/s]

Error scraping X-Men__The_Last_Stand: 404


Scraping Wikipedia articles: 100%|██████████| 4604/4604 [57:40<00:00,  1.33article/s]


In [7]:
# Persist the scrape results. The output directory is created first so the
# exports do not fail with FileNotFoundError when it does not exist yet.
os.makedirs("data/2024", exist_ok=True)
export_links_to_csv(article_links, "data/2024/links2024.csv")
export_articles_to_csv(article_names, "data/2024/articles2024.csv")

In [8]:
# Link list of the 2007 data set ('#' lines in the file are skipped as comments)
links = pd.read_csv(os.path.join(DATA_PATH, 'links.tsv'), sep='\t', comment='#', names=['linkSource', 'linkTarget'])

# Link list produced by the 2024 scrape
links2024 = pd.read_csv('data/2024/links2024.csv')

# Keep only the 2007 rows whose linkSource also appears as a linkSource in links2024
links = links[links['linkSource'].isin(links2024['linkSource'])]

# Count the number of output links per source article
links_count = links.groupby(['linkSource']).size().reset_index(name='count')
links2024_count = links2024.groupby(['linkSource']).size().reset_index(name='count')

# Absolute difference in the number of output links per article.
# The two count tables are aligned on linkSource rather than on row position:
# a positional subtraction pairs up different articles as soon as one table
# contains a source the other one lacks. A source missing from one table is
# treated as having 0 links there.
comparison = (
    links_count.set_index('linkSource')['count']
    .sub(links2024_count.set_index('linkSource')['count'], fill_value=0)
    .abs()
)

In [9]:
# Sum of the per-article absolute differences, expressed as a percentage of the
# number of 2007 link rows kept in `links`. The article count in the message is
# taken from the data instead of being hard-coded.
print(f"Difference in number of links by article in total between 2007 and now ({len(comparison)} articles): {comparison.sum() / links.shape[0] * 100:.2f}%")
comparison.describe()

Difference in number of links by article in total between 2007 and now (subset of 500 articles): 251.97%


count    4535.000000
mean       66.196913
std        88.924844
min         0.000000
25%        12.000000
50%        33.000000
75%        79.000000
max      1374.000000
Name: count, dtype: float64

In [10]:
# (rows, columns) of the 2024 link table read from links2024.csv
links2024.shape

(377149, 2)

In [12]:
# Number of titles returned by the scraper as successfully scraped
len(article_names)

4593

In [13]:
# (rows, columns) of the article table loaded from articles.tsv
articles.shape

(4604, 1)