In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
from tqdm import tqdm
import numpy as np
import os
from time import sleep

In [None]:
import src.scripts.scrapper_and_writters as scr

## Scrapping articles from the articles names of Wikispeedia 2007

In [3]:
url = f"https://en.wikipedia.org/wiki/Zulu"
response = requests.get(url)

if response.status_code == 200:
    soup = BeautifulSoup(response.content, "html.parser")

In [5]:
DATA_PATH = 'data/2007/'
articles = pd.read_csv(os.path.join(DATA_PATH, 'articles.tsv'), sep='\t', comment='#', names=['article'])
urls = articles['article'].tolist()

In [None]:
article_links, article_names = scr.scrape_wikipedia_articles(urls)

Scraping Wikipedia articles:   8%|▊         | 375/4604 [04:31<55:49,  1.26article/s]  

Error scraping Athletics_%28track_and_field%29: 404


Scraping Wikipedia articles:  13%|█▎        | 588/4604 [07:17<53:14,  1.26article/s]  

Error scraping Bionicle__Mask_of_Light: 404


Scraping Wikipedia articles:  26%|██▋       | 1211/4604 [15:13<40:52,  1.38article/s]  

Error scraping Directdebit: 404


Scraping Wikipedia articles:  35%|███▍      | 1601/4604 [20:10<38:57,  1.28article/s]  

Error scraping Friend_Directdebit: 404


Scraping Wikipedia articles:  35%|███▌      | 1628/4604 [20:25<32:53,  1.51article/s]

Error scraping Gallery_of_the_Kings_and_Queens_of_England: 404


Scraping Wikipedia articles:  65%|██████▍   | 2970/4604 [37:35<21:53,  1.24article/s]  

Error scraping Newshounds: 404


Scraping Wikipedia articles:  84%|████████▎ | 3850/4604 [48:22<08:08,  1.54article/s]  

Error scraping Sponsorship_Directdebit: 404


Scraping Wikipedia articles:  84%|████████▍ | 3879/4604 [48:45<12:01,  1.00article/s]

Error scraping Star_Wars_Episode_IV__A_New_Hope: 404


Scraping Wikipedia articles:  97%|█████████▋| 4481/4604 [56:15<01:47,  1.14article/s]

Error scraping Wikipedia_Text_of_the_GNU_Free_Documentation_License: 404


Scraping Wikipedia articles:  99%|█████████▊| 4546/4604 [56:59<00:38,  1.49article/s]

Error scraping Wowpurchase: 404


Scraping Wikipedia articles:  99%|█████████▉| 4553/4604 [57:04<00:35,  1.45article/s]

Error scraping X-Men__The_Last_Stand: 404


Scraping Wikipedia articles: 100%|██████████| 4604/4604 [57:40<00:00,  1.33article/s]


In [None]:
# Removing duplicate links in 2024
article_links = pd.DataFrame.from_dict(article_links).drop_duplicates()
article_names = pd.DataFrame.from_dict(article_names).drop_duplicates()

In [None]:
scr.export_links_to_csv(article_links, "data/2024/raw_links2024.csv")
scr.export_articles_to_csv(article_names, "data/2024/raw_articles2024.csv")

In [None]:
links = pd.read_csv(os.path.join(DATA_PATH, 'links.tsv'), sep='\t', comment='#', names=['linkSource', 'linkTarget'])

# compare the number of links by source target in links2024.csv and links.tsv
links2024 = pd.read_csv('data/2024/raw_links2024.csv')

# filter to get the same linkSource in links that are in links2024
links = links[links['linkSource'].isin(links2024['linkSource'])]

# Count the number of ouput links by source target
links_count = links.groupby(['linkSource']).size().reset_index(name='count')
links2024_count = links2024.groupby(['linkSource']).size().reset_index(name='count')

# compare the number of links by source target in links2024.csv and links.tsv
comparison = np.abs(links_count['count'] - links2024_count['count'])

In [9]:
print(f"Difference in number of links by article in total between 2007 and now (subset of 500 articles): {comparison.sum() / links.shape[0] * 100:.2f}%")
comparison.describe()

Difference in number of links by article in total between 2007 and now (subset of 500 articles): 251.97%


count    4535.000000
mean       66.196913
std        88.924844
min         0.000000
25%        12.000000
50%        33.000000
75%        79.000000
max      1374.000000
Name: count, dtype: float64

In [10]:
links2024.shape

(377149, 2)

In [12]:
len(article_names)

4593

In [13]:
articles.shape

(4604, 1)

## Re-scrapping articles from 2024 that changed name since 2007

In [None]:
# Resetting the list of article names to scrap their links from wikipedia

# Starting with the exact same list as in Wikispeedia 2007
actual_article_names_2024 = articles["article_2007"]

# First we update the name of the seven articles we want to keep
# What was their name in 2007
old_unmatched_names = ["Athletics_%28track_and_field%29", "Bionicle__Mask_of_Light", "Directdebit", "Newshounds", "Star_Wars_Episode_IV__A_New_Hope", "Wikipedia_Text_of_the_GNU_Free_Documentation_License", "X-Men__The_Last_Stand"]
# Where were they in the data
unmatched_index = [i for i, article in enumerate(articles['article_2007']) if article in old_unmatched_names]
# What are the articles names now
new_names = ["Track_and_field", "Bionicle:_Mask_of_Light", "Direct_debit", "News_Hounds", "Star_Wars_(film)", "Wikipedia:Text_of_the_GNU_Free_Documentation_License", "X-Men:_The_Last_Stand"]
# Update de list of article names with the seven new names
for i, new_name in enumerate(new_names):
    actual_article_names_2024[unmatched_index[i]] = new_name

# We remove the four articles that do not have an equivalent in 2024
missing_articles_names = ["Friend_Directdebit", "Gallery_of_the_Kings_and_Queens_of_England", "Sponsorship_Directdebit", "Wowpurchase"]
missing_index = [i for i, article in enumerate(articles['article_2007']) if article in missing_articles_names]
actual_articles_names_2024 = actual_article_names_2024.drop(missing_index)

In [None]:
# import helpers for scrapping 2024 wikipedia articles
import src.scripts.scrapper_and_writters as scr

# Specifying which articles to scrap
urls = actual_article_names_2024.tolist()

# Scrapping
missing_articles_links, _ = scr.scrape_wikipedia_articles(urls)

In [None]:
# Removing duplicate links in 2024
missing_articles_links = pd.DataFrame.from_dict(missing_articles_links).drop_duplicates()

In [None]:
scr.export_links_to_csv(missing_articles_links, "data/2024/links2024.csv")
scr.export_articles_to_csv(articles["article_2007"], "data/2024/articles2024.csv")