In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
from tqdm import tqdm
import numpy as np
import os
from time import sleep
import urllib.parse

In [10]:
# def scrape_wikipedia_articles(article_titles):
#     """
#     Scrapes the content of a list of Wikipedia articles and returns the direct links to those articles.
    
#     Parameters:
#     article_titles (list): A list of Wikipedia article titles to scrape.
    
#     Returns:
#     dict: A dictionary where the keys are the article titles and the values are the direct links to those articles.
#     """
#     article_links = {}
    
#     for title in tqdm(article_titles, desc="Scraping Wikipedia articles", unit="article"):
#         url = f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}"
#         response = requests.get(url)
        
#         sleep(0.1) # Be polite to Wikipedia servers
#         if response.status_code == 200:
#             soup = BeautifulSoup(response.content, "html.parser")
#             article_links[title] = []
#             # Find the direct link to the current article title
#             for link in soup.find_all("a", href=lambda href: href and href.startswith("/wiki/")):
#                 list_href = link.get("href").splitlines()
#                 for href in list_href:
#                     if href.split('/')[-1] in article_titles and href.split('/')[-1] != title:
#                         article_links[title].append(href.split('/')[-1])
#         else:
#             print(f"Error scraping {title}: {response.status_code}")
    
#     return article_links


def scrape_wikipedia_articles(article_titles):
    article_links = {}
    article_names = []
    article_categories = {}
    
    for title in tqdm(article_titles, desc="Scraping Wikipedia articles", unit="article"):
        url = f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}"
        response = requests.get(url)
        
        sleep(0.1)  # Be polite to Wikipedia servers
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "html.parser")
            article_links[title] = []
            article_names.append(title)
            
            # Extract categories
            category_links = soup.find_all("a", {"class": "category-link"})
            article_categories[title] = [link.text for link in category_links]
            
            # Find direct links to other articles
            for link in soup.find_all("a", href=lambda href: href and href.startswith("/wiki/")):
                list_href = link.get("href").splitlines()
                for href in list_href:
                    linked_article = href.split('/')[-1]
                    if linked_article in article_titles and linked_article != title:
                        article_links[title].append(linked_article)
        else:
            print(f"Error scraping {title}: {response.status_code}")
    
    # Create a DataFrame for articles and their categories
    df_categories = pd.DataFrame.from_dict(article_categories, orient='index').reset_index()
    df_categories.columns = ['Article'] + [f'Category_{i+1}' for i in range(df_categories.shape[1]-1)]
    
    return article_links, article_names, df_categories


def write_to_csv(data, filename):
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile, delimiter='\t')
        writer.writerow(['# Hierarchical categories of all articles.'])
        writer.writerow(['# Many articles have more than one category. Some articles have no category.'])
        writer.writerow(['# Article names are URL-encoded; e.g., in Java they can be decoded using java.net.URLDecoder.decode(articleName, "UTF-8").'])
        writer.writerow(['# FORMAT:   article   category'])
        writer.writerow(['#'])
        writer.writerows(data)


def export_links_to_csv(article_links, output_file):
    """
    Exports the direct links to the Wikipedia articles to a CSV file.
    
    Parameters:
    article_links (dict): A dictionary where the keys are the article titles and the values are the direct links to those articles.
    output_file (str): The path to the output CSV file.
    """
    with open(output_file, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["linkSource", "linkTarget"])
        
        for source, targets in article_links.items():
            if len(targets) > 0:
                for target in targets:
                    writer.writerow([source, target])

def export_articles_to_csv(articles, output_file):
    """
    Exports the direct links to the Wikipedia articles to a CSV file.
    
    Parameters:
    article_links (dict): A dictionary where the keys are the article titles and the values are the direct links to those articles.
    output_file (str): The path to the output CSV file.
    """
    with open(output_file, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["article"])
        
        for article in articles:
            writer.writerow([article])

def export_categories_to_csv(article_categories, output_file):
    """
    Exports the direct links to the Wikipedia articles to a CSV file.
    
    Parameters:
    article_links (dict): A dictionary where the keys are the article titles and the values are the direct links to those articles.
    output_file (str): The path to the output CSV file.
    """
    with open(output_file, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["article", "category"])
        
        for article, category in article_categories.items():
            writer.writerow([article, category])
        

In [9]:
# def process_categories(categories):
#     """Process categories into a hierarchical format."""
#     processed = []
#     for category in categories:
#         parts = category.split(':')
#         if len(parts) > 1:
#             hierarchy = ['subject'] + parts
#         else:
#             hierarchy = ['subject', category]
#         processed.append('.'.join(hierarchy))
#     return processed

# def scrape_wikipedia_articles(article_titles):
#     article_data = []
    
#     for title in tqdm(article_titles, desc="Scraping Wikipedia articles", unit="article"):
#         url = f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}"
#         response = requests.get(url)
        
#         sleep(0.1)  # Be polite to Wikipedia servers
#         if response.status_code == 200:
#             soup = BeautifulSoup(response.content, "html.parser")
            
#             # Extract categories
#             category_links = soup.find_all("a", {"class": "category-link"})
#             categories = [link.text for link in category_links]
#             processed_categories = process_categories(categories)
            
#             # URL encode the article title
#             encoded_title = urllib.parse.quote(title)
            
#             # Add data for each category
#             for category in processed_categories:
#                 article_data.append((encoded_title, category))
#         else:
#             print(f"Error scraping {title}: {response.status_code}")
    
#     return article_data

# def write_to_csv(data, filename):
#     with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
#         writer = csv.writer(csvfile, delimiter='\t')
#         writer.writerow(['# Hierarchical categories of all articles.'])
#         writer.writerow(['# Many articles have more than one category. Some articles have no category.'])
#         writer.writerow(['# Article names are URL-encoded; e.g., in Java they can be decoded using java.net.URLDecoder.decode(articleName, "UTF-8").'])
#         writer.writerow(['# FORMAT:   article   category'])
#         writer.writerow(['#'])
#         writer.writerows(data)


# article_titles = ["Python_(programming_language)", "Machine_learning", "Artificial_intelligence"]
# scraped_data = scrape_wikipedia_articles(article_titles)
# write_to_csv(scraped_data, 'wikipedia_categories.csv')

Scraping Wikipedia articles:   0%|          | 0/3 [00:00<?, ?article/s]

Scraping Wikipedia articles: 100%|██████████| 3/3 [00:02<00:00,  1.10article/s]


In [3]:
url = f"https://en.wikipedia.org/wiki/Zulu"
response = requests.get(url)

if response.status_code == 200:
    soup = BeautifulSoup(response.content, "html.parser")

In [4]:
DATA_PATH = 'data/wikispeedia_paths-and-graph/'
articles = pd.read_csv(os.path.join(DATA_PATH, 'articles.tsv'), sep='\t', comment='#', names=['article'])
urls = articles['article'].tolist()[:50]

In [5]:
article_data = scrape_wikipedia_articles(urls)

Scraping Wikipedia articles: 100%|██████████| 50/50 [00:21<00:00,  2.28article/s]


In [6]:
article_data

[]

In [34]:
len(article_links.keys())

50

In [35]:
export_links_to_csv(article_links, "links2024.csv")

In [36]:
export_articles_to_csv(article_names, "articles2024.csv")

In [38]:
export_categories_to_csv(df_categories, "categories2024.csv")

In [14]:
links = pd.read_csv(os.path.join(DATA_PATH, 'links.tsv'), sep='\t', comment='#', names=['linkSource', 'linkTarget'])

# compare the number of links by source target in links2024.csv and links.tsv
links2024 = pd.read_csv('links2024.csv')

# filter to get the same linkSource in links that are in links2024
links = links[links['linkSource'].isin(links2024['linkSource'])]

# Count the number of ouput links by source target
links_count = links.groupby(['linkSource']).size().reset_index(name='count')
links2024_count = links2024.groupby(['linkSource']).size().reset_index(name='count')

# compare the number of links by source target in links2024.csv and links.tsv
comparison = np.abs(links_count['count'] - links2024_count['count'])

In [15]:
print(f"Difference in number of links by article in total between 2007 and now (subset of 500 articles): {comparison.sum() / links.shape[0] * 100:.2f}%")
comparison.describe()

Difference in number of links by article in total between 2007 and now (subset of 500 articles): 62.54%


count     63.000000
mean      21.095238
std       26.698263
min        0.000000
25%        6.000000
50%       14.000000
75%       20.500000
max      136.000000
Name: count, dtype: float64