In [1]:
# trump --> "https://www.wikishark.com/getdata/daily.php?value=4628128_179985?datefrom=01/01/2008&dateto=11/04/2024&view=2&scale=0&normalized=0&loglog=0&log=0&zerofix=0"
# biden --> "https://www.wikishark.com/getdata/daily.php?value=4633610_181444?datefrom=01/01/2008&dateto=11/04/2024&view=2&scale=0&normalized=0&loglog=0&log=0&zerofix=0"
# 
# 
# Locate the following element on the article page to extract the keyword id needed for the data request:
# <iframe class="box-halo" src="/frame2.php?values=4633610&amp;datefrom=01/01/2008&amp;dateto=11/04/2024&amp;view=2&amp;normalized=0&amp;scale=0&amp;peak=0&amp;log=0&amp;zerofix=0&amp;sumall=0" frameborder="no" style="margin-top: 7px;background:url(/img/loader.gif) center center no-repeat;width: 90%; height: CALC(100vh - 270px) ;overflow: hidden;border: 0px solid" scrolling="yes"></iframe>

In [None]:
import numpy as np
import pandas as pd

import requests
from bs4 import BeautifulSoup

import datetime

In [None]:
def get_log_data(keywords, timeout=30):
    """
    Fetch daily Wikipedia view counts from wikishark and average them per month.

    For every keyword the article page is scraped to find wikishark's internal
    id (taken from the ``values=`` query parameter of the ``box-halo`` iframe),
    then the daily series for that id is downloaded and cut to the window
    2008-08-18 .. 2014-01-15 (inclusive).

    Fetching is best-effort: if anything goes wrong for a keyword (network
    error, unknown title, malformed or too-short series) the error is printed
    and that keyword's column is filled with NaN, so one bad title never
    discards the data already collected for the others.

    Args:
        keywords (list): article titles to fetch data for, as they appear in
            the wikishark URL
        timeout (float): seconds to wait for each HTTP request

    Returns:
        A pandas DataFrame with one column per keyword, indexed by
        (year, month), holding the mean of the daily values in that month
    """
    start_date_dataset = datetime.datetime(year=2008, month=8, day=18)
    end_date_dataset = datetime.datetime(year=2014, month=1, day=15)
    start_date_website = datetime.datetime(year=2008, month=1, day=1)

    # Position of the first wanted day inside the downloaded series.
    # NOTE(review): assumes the series returned by daily.php starts on
    # start_date_website - confirm against the site.
    offset = (start_date_dataset - start_date_website).days
    length_history = (end_date_dataset - start_date_dataset).days + 1

    keywords = list(keywords)
    columns = []
    for i, keyword in enumerate(keywords):
        print("({:03d}/{}) Fetching data for {:<30} ".format(i+1, len(keywords), keyword), end="")

        try:
            # Step 1: resolve the article title to wikishark's numeric id.
            url = f"https://www.wikishark.com/title/en/{keyword}?text_search="
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            iframe = soup.find("iframe", class_="box-halo")
            if iframe is None:
                raise LookupError("no 'box-halo' iframe on the article page (unknown title?)")
            keyword_id = iframe["src"].split("values=")[1].split("&")[0]
            print(f"| id found {keyword_id} ", end="")

            # Step 2: download the daily series for that id.
            url = f"https://www.wikishark.com/getdata/daily.php?value={keyword_id}"
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            k_data = response.json()

            # Validate here, inside the try, so that a short or non-numeric
            # series becomes a NaN column instead of producing a ragged array
            # that would crash after every keyword has been fetched.
            series = np.asarray(k_data[offset:offset + length_history], dtype=float)
            if series.shape != (length_history,):
                raise ValueError(
                    f"expected {length_history} daily values, got shape {series.shape}"
                )
            print("| SUCCESS")

        except Exception as e:
            # Deliberately broad: one failing keyword must not abort the batch.
            print(f" > Error fetching data for {keyword}: {e}")
            series = np.full(length_history, np.nan)

        columns.append(series)

    # Build the daily index from the same constants used for slicing.
    indices = pd.date_range(start_date_dataset, end_date_dataset, freq="D")
    # The explicit reshape keeps the array 2-D even when `keywords` is empty.
    values = np.array(columns, dtype=float).reshape(len(keywords), length_history).T

    df = pd.DataFrame(values, index=indices, columns=keywords)
    df = df.groupby(by=[df.index.year, df.index.month]).mean()
    return df

In [5]:
def get_errors(df):
    """
    List the columns of ``df`` that contain at least one missing value.

    Args:
        df (pandas.DataFrame): table to inspect

    Returns:
        list: labels of the columns holding one or more nulls, in column order
    """
    return [column for column in df.columns if df[column].isnull().any()]

In [None]:
# Load the hub article titles saved as a NumPy array and convert them to a plain list.
# NOTE(review): allow_pickle=True makes np.load unpickle the file's contents; only
# load this file if it comes from a trusted source.
hubs = np.load("data/processed/hubs.npy", allow_pickle=True).tolist()

# Fetch the monthly view data for every hub title (two HTTP requests per title).
df = get_log_data(hubs)

# Save the result; the two index levels are written out as "year" and "month".
df.to_csv("data/processed/hubs_log_data.csv", index_label=["year", "month"])

(001/833) Fetching data for Europe                         | id found 4632177 | SUCCESS
(002/833) Fetching data for London                         | id found 4633173 | SUCCESS
(003/833) Fetching data for Natural_gas                    | id found 4601325 | SUCCESS
(004/833) Fetching data for Christianity                   | id found 4633076 | SUCCESS
(005/833) Fetching data for Agriculture                    | id found 4630918 | SUCCESS
(006/833) Fetching data for England                        | id found 4633703 | SUCCESS
(007/833) Fetching data for World_War_II                   | id found 4634039 | SUCCESS
(008/833) Fetching data for Human                          | id found 4632523 | SUCCESS
(009/833) Fetching data for India                          | id found 4633162 | SUCCESS
(010/833) Fetching data for Mathematics                    | id found 4606219 | SUCCESS
(011/833) Fetching data for Science                        | id found 4621807 | SUCCESS
(012/833) Fetching data for Dino

In [None]:
# Columns that contain missing values: these article titles need to be looked up
# on wikishark to find the title the site actually uses.
get_errors(df)

['Elizabeth_II_of_the_United_Kingdom',
 'Earth%27s_atmosphere',
 'Football_%28soccer%29',
 'British_monarchy',
 'List_of_elements_by_name',
 'Mythology',
 'British_East_India_Company',
 'Computer_and_video_games',
 'Victoria_of_the_United_Kingdom',
 'Edible_salt',
 'British_House_of_Commons',
 'Temperate',
 'Old_English_language',
 'The_Lord_of_the_Rings_film_trilogy',
 'September_11%2C_2001_attacks',
 'Star_Wars_Episode_IV__A_New_Hope']

In [None]:
# Replacement titles for the columns reported by get_errors(df).
# The pairing is positional (fix_errors zips the two lists), so the order here
# must match the order of the error list exactly. The trailing comment on each
# line names the failed title it replaces.
# NOTE(review): some replacements point to a broader or different article
# (e.g. "Computer" for Computer_and_video_games) - confirm these are intended.
new_keywords = [
    "Elizabeth_II",                         # Elizabeth_II_of_the_United_Kingdom
    "Atmosphere_of_Earth",                  # Earth%27s_atmosphere
    "Football",                             # Football_%28soccer%29
    "Monarchy_of_the_United_Kingdom",       # British_monarchy
    "List_of_elements",                     # List_of_elements_by_name
    "Greek_mythology",                      # Mythology
    "East_India_Company",                   # British_East_India_Company
    "Computer",                             # Computer_and_video_games
    "Queen_Victoria",                       # Victoria_of_the_United_Kingdom
    "Salt",                                 # Edible_salt
    "Parliament_of_the_United_Kingdom",     # British_House_of_Commons
    "Temperate_climate",                    # Temperate
    "Old_English",                          # Old_English_language
    "The_Lord_of_the_Rings_(film_series)",  # The_Lord_of_the_Rings_film_trilogy
    "September_11_attacks",                 # September_11%2C_2001_attacks
    "Star_Wars_(film)",                     # Star_Wars_Episode_IV__A_New_Hope
]

In [9]:
def fix_errors(path, new_keywords):
    """
    Re-fetch the columns of a saved dataset that contain missing values.

    The CSV at ``path`` is loaded, its columns with missing values are listed
    with ``get_errors``, and each of them is overwritten with the data fetched
    for the replacement title at the same position in ``new_keywords``. The
    column keeps its original name. The patched table is written back to
    ``path``.

    Args:
        path (str): CSV file written with ``index_label=["year", "month"]``;
            it is read from and written back to this same location
        new_keywords (list): replacement article titles, one per column with
            missing values, in the same order as ``get_errors`` reports them

    Raises:
        AssertionError: if the number of replacement titles differs from the
            number of columns with missing values
    """
    df = pd.read_csv(path, index_col=["year", "month"])
    errors = get_errors(df)

    assert len(new_keywords) == len(errors), "Number of new keywords must match number of errors"

    if not errors:
        # Nothing to patch: skip the fetch and leave the file untouched.
        return

    new_df = get_log_data(new_keywords)
    for error, new_keyword in zip(errors, new_keywords):
        # Assignment aligns on the (year, month) index values.
        df[error] = new_df[new_keyword]

    # Write back to the file that was read, not to a hard-coded location.
    df.to_csv(path, index_label=["year", "month"])

In [10]:
# Re-fetch the columns with missing values under their replacement titles and
# patch the saved CSV.
path = "data/processed/hubs_log_data.csv"
fix_errors(path, new_keywords)

# Re-read the patched file and list any columns that still contain missing values.
get_errors(pd.read_csv(path, index_col=["year", "month"]))

(001/16) Fetching data for Elizabeth_II                   | id found 4625886 | SUCCESS
(002/16) Fetching data for Atmosphere_of_Earth            | id found 4576004 | SUCCESS
(003/16) Fetching data for Football                       | id found 4624706 | SUCCESS
(004/16) Fetching data for Monarchy_of_the_United_Kingdom | id found 4620959 | SUCCESS
(005/16) Fetching data for List_of_elements               | id found 4536558 | SUCCESS
(006/16) Fetching data for Greek_mythology                | id found 4623100 | SUCCESS
(007/16) Fetching data for East_India_Company             | id found 4616688 | SUCCESS
(008/16) Fetching data for Computer                       | id found 4621533 | SUCCESS
(009/16) Fetching data for Queen_Victoria                 | id found 4623331 | SUCCESS
(010/16) Fetching data for salt                            > Error fetching data for salt: 'NoneType' object is not subscriptable
(011/16) Fetching data for Parliament_of_the_United_Kingdom | id found 4607059 | SUCCES

['Edible_salt']

In [11]:
# Second pass for the one column still reported as missing, using the title "Salt".
path = "data/processed/hubs_log_data.csv"
fix_errors(path, ["Salt"])

# Confirm that no column with missing values remains.
get_errors(pd.read_csv(path, index_col=["year", "month"]))

(001/1) Fetching data for Salt                           | id found 4586766 | SUCCESS


[]

In [12]:
# Load the patched dataset, indexed by (year, month), and display it.
path = "data/processed/hubs_log_data.csv"
df = pd.read_csv(path, index_col=["year", "month"])

df

Unnamed: 0_level_0,Unnamed: 1_level_0,Europe,London,Natural_gas,Christianity,Agriculture,England,World_War_II,Human,India,Mathematics,...,Bermuda,Smallpox,Mohs_scale_of_mineral_hardness,Rwanda,Munich,Mauritius,Oceania,Gone_with_the_Wind_%28film%29,Whale,Pompeii
year,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2008,8,11132.071429,12461.214286,2808.928571,4791.642857,2358.857143,11007.500000,11040.142857,4278.142857,17403.071429,3663.571429,...,3118.857143,2211.642857,563.500000,1630.714286,2781.000000,3086.357143,2374.142857,2034.571429,2108.428571,2844.428571
2008,9,12498.366667,13051.600000,3470.200000,6934.500000,3172.833333,12710.966667,17497.400000,5848.300000,21387.500000,4653.766667,...,3167.166667,2901.266667,903.966667,2372.300000,3061.833333,3178.533333,2301.933333,1608.000000,1838.300000,3045.300000
2008,10,11848.741935,12813.612903,3175.612903,7532.709677,3326.483871,13112.129032,19662.612903,6263.838710,22349.935484,4517.129032,...,2549.225806,3055.129032,903.193548,3089.612903,2816.032258,3452.677419,2155.806452,1377.903226,2126.193548,3637.000000
2008,11,11797.700000,13176.366667,3079.666667,7539.766667,3224.866667,13773.400000,25930.833333,6433.000000,26366.666667,4529.866667,...,2423.600000,3280.600000,898.100000,3748.066667,2627.500000,3555.433333,2171.966667,1690.966667,2191.266667,3693.466667
2008,12,9659.967742,11149.096774,2370.548387,6639.548387,2395.000000,11439.451613,20950.709677,5741.161290,21469.322581,3706.645161,...,2158.193548,2828.290323,801.516129,3496.064516,2517.225806,3499.741935,1814.548387,1930.967742,1691.677419,2957.870968
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013,9,8528.766667,12381.900000,2651.266667,4945.866667,3908.133333,8489.633333,33415.900000,6701.333333,22523.666667,4867.400000,...,3181.266667,3354.366667,1496.033333,2244.133333,3374.300000,5580.866667,2300.166667,2479.033333,2085.333333,5357.833333
2013,10,9978.225806,13473.322581,3094.612903,5651.677419,4723.838710,9632.387097,51003.645161,7173.451613,27272.193548,5606.322581,...,3409.387097,4759.935484,1717.838710,2379.419355,3440.161290,6847.741935,2303.161290,2843.322581,2560.774194,5801.838710
2013,11,9216.400000,13554.000000,3404.533333,5092.966667,4107.133333,9496.800000,51381.800000,7477.666667,36408.333333,5499.833333,...,3162.266667,4260.400000,1732.500000,2386.466667,3251.566667,6653.133333,2329.433333,4359.300000,2622.533333,5715.500000
2013,12,6823.967742,11505.935484,2888.806452,5151.580645,3004.870968,7705.129032,29324.387097,5829.419355,31316.193548,4074.645161,...,2689.903226,4524.548387,1513.129032,2174.322581,2680.548387,5166.451613,1916.354839,4158.354839,1866.354839,5944.612903
