In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
from tqdm import tqdm
import numpy as np
import os
from time import sleep
import sys

In [None]:
# Notebooks have no `__file__` of their own, so the launch directory is stored
# under that name; the following cells build all their paths from it.
# The directory is captured only once per kernel session: a later cell calls
# os.chdir(), so re-reading os.getcwd() when this cell is re-run would pick up
# the already-changed directory and shift every derived path one level up.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
__file__ = NOTEBOOK_DIR
__file__

In [3]:
# Switch the working directory to the parent of the directory stored in
# `__file__` (this changes the cwd, it does not touch sys.path).
# Presumably this is what makes the project-local `utils` package importable
# below -- confirm against the repository layout.
parent_dir = os.path.abspath(os.path.dirname(__file__))
os.chdir(parent_dir)

# Project-local helpers used for scraping and for the CSV exports
import utils.scrapper_and_writters as scr

In [4]:
# Folder holding the Wikispeedia 2007 files, resolved relative to `__file__`
data_dir_2007 = os.path.join(os.path.dirname(__file__), '../data/2007/')
DATA_PATH = os.path.abspath(data_dir_2007)

# articles.tsv is read as a single unnamed column which we call 'article';
# '#' marks comments in the file.
articles = pd.read_csv(
    os.path.join(DATA_PATH, 'articles.tsv'),
    sep='\t',
    comment='#',
    names=['article'],
)

# Article names as a plain list: this is the input of the scraper
urls = articles['article'].tolist()

## Scraping article links from the article names of Wikispeedia 2007

NB: each scraping step takes at least 30 minutes to run, i.e. at least 1h30min in total.

In [None]:
# Scrape every article of the 2007 list (long-running, see the note above).
# The scraper returns two objects: the links and the article names.
scrape_output_2007_names = scr.scrape_wikipedia_articles(urls)
article_links, article_names = scrape_output_2007_names

In [None]:
# Turn both scraper outputs into DataFrames, then drop exact duplicate rows
article_links = pd.DataFrame.from_dict(article_links)
article_links = article_links.drop_duplicates()

article_names = pd.DataFrame.from_dict(article_names)
article_names = article_names.drop_duplicates()

In [None]:
# Set to True to write the scraped links and article names to CSV.
# Left off by default so that re-running the notebook does not rewrite the files.
exporting = False
if exporting:
    base_dir = os.path.dirname(__file__)
    links_csv_path = os.path.abspath(os.path.join(base_dir, "../../data/2024/raw_links2024.csv"))
    articles_csv_path = os.path.abspath(os.path.join(base_dir, "../../data/2024/raw_articles2024.csv"))
    scr.export_df_links_to_csv(article_links, links_csv_path)
    scr.export_articles_to_csv(article_names, articles_csv_path)

In [None]:
# Links of the Wikispeedia 2007 snapshot (source article -> target article)
links = pd.read_csv(os.path.join(DATA_PATH, 'links.tsv'), sep='\t', comment='#', names=['linkSource', 'linkTarget'])

# Links scraped from present-day Wikipedia.
# NOTE(review): this path is relative to the current working directory, while
# the export cells write to dirname(__file__)/../../data/2024/ -- confirm that
# both point to the same file.
links2024 = pd.read_csv('data/2024/raw_links2024.csv')

# Keep only the 2007 source articles that are also present in the 2024 scrape
links = links[links['linkSource'].isin(links2024['linkSource'])]

# Number of outgoing links per source article in each snapshot
links_count = links.groupby(['linkSource']).size().reset_index(name='count')
links2024_count = links2024.groupby(['linkSource']).size().reset_index(name='count')

# Absolute difference in outgoing-link count per article.
# The two count tables must be aligned on the article name: subtracting their
# 'count' columns directly pairs rows by position, which compares unrelated
# articles as soon as one snapshot contains a source the other lacks.
# A source missing from one snapshot is counted as having 0 links there.
count_2007 = links_count.set_index('linkSource')['count']
count_2024 = links2024_count.set_index('linkSource')['count']
comparison = count_2007.sub(count_2024, fill_value=0).abs()

In [None]:
# Summed per-article differences as a percentage of the number of 2007 links kept
pct_changed = comparison.sum() / links.shape[0] * 100
print(f"Difference in number of links by article in total between 2007 and now (subset of 500 articles): {pct_changed:.2f}%")

# Distribution of the per-article differences
comparison.describe()

In [None]:
# (rows, columns) of the 2024 links table read from raw_links2024.csv
links2024.shape

In [None]:
# Number of rows left in article_names after duplicates were dropped
len(article_names)

In [None]:
# (rows, columns) of the 2007 article list, for comparison with the counts above
articles.shape

## Re-scraping links for the 2024 articles whose name changed since 2007

In [None]:
# Rebuild the list of article names whose links are scraped from Wikipedia.

# Names, as of 2007, of the seven renamed articles we want to keep
old_unmatched_names = ["Athletics_%28track_and_field%29",
                       "Bionicle__Mask_of_Light",
                       "Directdebit",
                       "Newshounds",
                       "Star_Wars_Episode_IV__A_New_Hope",
                       "Wikipedia_Text_of_the_GNU_Free_Documentation_License",
                       "X-Men__The_Last_Stand"]
# Their present-day names, in the same order as old_unmatched_names
new_names = ["Track_and_field",
             "Bionicle:_Mask_of_Light",
             "Direct_debit",
             "News_Hounds",
             "Star_Wars_(film)",
             "Wikipedia:Text_of_the_GNU_Free_Documentation_License",
             "X-Men:_The_Last_Stand"]
# Explicit old -> new mapping, so each rename is applied by value and does not
# depend on the order in which the articles appear in the data.
renamed_articles = dict(zip(old_unmatched_names, new_names))

# Start from the exact Wikispeedia 2007 list. The column is called 'article'
# (that is the name given when articles.tsv is loaded). Work on a copy so the
# `articles` frame itself is never modified by the renames below.
actual_article_names_2024 = articles["article"].copy()

# Index labels of the renamed articles (kept for inspection)
unmatched_index = articles.index[articles["article"].isin(old_unmatched_names)].tolist()

# Update the list of article names with the seven new names
actual_article_names_2024 = actual_article_names_2024.replace(renamed_articles)

# Remove the four articles that do not have an equivalent in 2024
missing_articles_names = ["Friend_Directdebit", "Gallery_of_the_Kings_and_Queens_of_England", "Sponsorship_Directdebit", "Wowpurchase"]
missing_index = articles.index[articles["article"].isin(missing_articles_names)].tolist()
actual_article_names_2024 = actual_article_names_2024.drop(missing_index)

In [None]:
# Articles to scrape: the corrected list of 2024 names
urls = actual_article_names_2024.tolist()

# Scrape again; only the links (first returned object) are kept, the second
# returned object is discarded.
rescrape_output = scr.scrape_wikipedia_articles(urls)
missing_articles_links, _ = rescrape_output

In [None]:
# Convert the scraped links to a DataFrame and rename every present-day name
# back to its 2007 name (new_names[i] -> old_unmatched_names[i]).
missing_articles_links = pd.DataFrame.from_dict(missing_articles_links).replace(
    to_replace=new_names,
    value=old_unmatched_names,
)

In [None]:
# Keep only the first occurrence of every row of the 2024 links
is_repeated_row = missing_articles_links.duplicated()
missing_articles_links = missing_articles_links[~is_repeated_row]

In [None]:
# Set to True to write the re-scraped links and the article list to CSV.
# NOTE(review): the target files are the same ones written by the first export
# cell, so enabling this overwrites them -- confirm that is intended.
exporting = False
if exporting:
    base_dir = os.path.dirname(__file__)
    scr.export_df_links_to_csv(missing_articles_links, os.path.abspath(os.path.join(base_dir, "../../data/2024/raw_links2024.csv")))
    # The article column is named 'article' (name given when articles.tsv is
    # loaded); there is no 'article_2007' column, so that lookup raised KeyError.
    scr.export_articles_to_csv(articles["article"], os.path.abspath(os.path.join(base_dir, "../../data/2024/raw_articles2024.csv")))

## Re-scraping disambiguation pages


In [None]:
# Articles of the 2007 list that are dropped before scraping
deleted_articles = [
    "Friend_Directdebit",
    "Gallery_of_the_Kings_and_Queens_of_England",
    "Sponsorship_Directdebit",
    "Wowpurchase",
]

# 2007 name -> present-day name, for the articles that were simply renamed.
# The two parallel lists used by the following cells are derived from it.
unmatched_renames = {
    "Athletics_%28track_and_field%29": "Track_and_field",
    "Bionicle__Mask_of_Light": "Bionicle:_Mask_of_Light",
    "Directdebit": "Direct_debit",
    "Newshounds": "News_Hounds",
    "Star_Wars_Episode_IV__A_New_Hope": "Star_Wars_(film)",
    "Wikipedia_Text_of_the_GNU_Free_Documentation_License": "Wikipedia:Text_of_the_GNU_Free_Documentation_License",
    "X-Men__The_Last_Stand": "X-Men:_The_Last_Stand",
}
old_unmatched_names = list(unmatched_renames.keys())
new_names = list(unmatched_renames.values())

# 2007 name -> disambiguated present-day name.
# Dicts keep insertion order, so both derived lists stay position-aligned.
ambiguous_renames = {
    "Aggregator": "News_aggregator",
    "Anne_of_Great_Britain": "Anne,_Queen_of_Great_Britain",
    "Bantu": "Bantu_peoples",
    "Battle_of_Amiens": "Battle_of_Amiens_(1918)",
    "Blackbird": "Common_blackbird",
    "Bj%C3%B8rn%C3%B8ya": "Bear_Island_(Svalbard)",
    "Boa": "Boa_(genus)",
    "Boston_RFC": "Boston_RFC_(United_States)",
    "Brabantian": "Brabantian_Dutch",
    "Dark_Ages": "Dark_Ages_(historiography)",
    "David_Heymann": "David_Heymann_(architect)",
    "Defaka": "Defaka_people",
    "Doom": "Doom_(1993_video_game)",
    "Firecrest": "Common_firecrest",
    "Forth": "Forth_(programming_language)",
    "Garage_%28dance_music%29": "Garage_house",
    "Herring_Gull": "American_herring_gull",
    "Industry": "Industry_(economics)",
    "Lake_Albert": "Lake_Albert_(Africa)",
    "Mark_Webber": "Mark_Webber_(racing_driver)",
    "Market": "Market_(economics)",
    "Nagorno-Karabakh_War": "First_Nagorno-Karabakh_War",
    "Newmarket": "Newmarket,_Suffolk",
    "Pochard": "Common_pochard",
    "Prehistoric_man": "Prehistory",
    "Recorder": "Recorder_(musical_instrument)",
    "Red_Panda": "Red_panda",
    "Sandur": "Outwash_plain",
    "Scent_of_a_Woman": "Scent_of_a_Woman_(1992_film)",
    "Sequoia": "Sequoia_(genus)",
    "Serenity_%28film%29": "Serenity_(2005_film)",
    "Sparrowhawk": "Eurasian_sparrowhawk",
    "Swift": "Swift_(bird)",
    "Terik": "Terik_people",
    "Tooth_development": "Human_tooth_development",
    "Tripoli": "Tripoli,_Libya",
    "Underground_%28stories%29": "Underground_(Murakami_book)",
    "Weymouth": "Weymouth,_Dorset",
    "Whitethroat": "Common_whitethroat",
    "William_Gilbert": "William_Gilbert_(physicist)",
    "Winfield_Scott_%28ship%29": "SS_Winfield_Scott",
    "Woodruff": "Galium_odoratum",
    "Zulu": "Zulu_people",
}
old_ambiguous_names = list(ambiguous_renames.keys())
new_disambiguous_names = list(ambiguous_renames.values())

# Visual check that both lists have the same length
print(len(old_ambiguous_names), len(new_disambiguous_names))

In [None]:
# Apply both rename tables to the 2007 list: first the disambiguated names,
# then the plain renames.
updated_names = (
    articles
    .replace(old_ambiguous_names, new_disambiguous_names)
    .replace(old_unmatched_names, new_names)
)

# Positions (in the original 2007 list) of the four articles that have no
# equivalent in 2024; they are dropped from the list to scrape.
missing_index = [
    position
    for position, name in enumerate(articles['article'])
    if name in deleted_articles
]
actual_article_names_2024 = updated_names.drop(missing_index)

actual_article_names_2024

In [None]:
# Names to scrape, taken from the 'article' column of the corrected list
urls = actual_article_names_2024['article'].tolist()

# Scraping
# TODO rescrape with changed name for article names variable
disamb_scrape_output = scr.scrape_wikipedia_articles(urls)
disamb_articles_links, disamb_names = disamb_scrape_output

In [40]:
# Flatten the links dictionary (key -> iterable of values) into a list of
# [key, value] pairs, one pair per link.
list_disamb_articles_links = [
    [source, target]
    for source, targets in disamb_articles_links.items()
    for target in targets
]

In [48]:
# Build the links table, rename every present-day name back to its 2007 name
# (disambiguated names first, then the plain renames) and finally drop the
# duplicate links.
df_disamb_articles_links = (
    pd.DataFrame(list_disamb_articles_links)
    .replace(to_replace=new_disambiguous_names, value=old_ambiguous_names)
    .replace(to_replace=new_names, value=old_unmatched_names)
    .drop_duplicates()
)

In [49]:
# Set to True to write the final links table and the scraped article names to
# CSV; off by default so a re-run does not rewrite the files.
exporting = False
if exporting:
    base_dir = os.path.dirname(__file__)
    links_csv_path = os.path.abspath(os.path.join(base_dir, "../../data/2024/raw_links2024.csv"))
    articles_csv_path = os.path.abspath(os.path.join(base_dir, "../../data/2024/raw_articles2024.csv"))
    scr.export_df_links_to_csv(df_disamb_articles_links, links_csv_path)
    scr.export_articles_to_csv(disamb_names, articles_csv_path)
    