In [3]:
import requests
from datetime import datetime, timezone
from json import loads
from time import sleep

import pandas as pd
from fake_useragent import UserAgent

In [4]:
# Lookup tables that drive the download loop below. In each table the key is
# the path segment substituted into the request URL and the value is the
# display label written to the output CSV. Insertion order matters: it is the
# iteration order of the requests and, for the access-method and agent-type
# tables, the order of the categories used when sorting.

# Wikipedia editions to fetch: subdomain language code -> language name.
language_map = dict(
    (
        ("en", "English"),
        ("ja", "Japanese"),
        ("de", "German"),
        ("ru", "Russian"),
        ("es", "Spanish"),
        ("fr", "French"),
        ("zh", "Chinese"),
        ("it", "Italian"),
        ("fa", "Persian"),
        ("pt", "Portuguese"),
    )
)

# Access-method URL segment -> display label.
access_method_map = dict(
    (
        ("desktop", "Desktop"),
        ("mobile-app", "Mobile App"),
        ("mobile-web", "Mobile Web"),
    )
)

# Agent-type URL segment -> display label.
agent_type_map = dict(
    (
        ("user", "User"),
        ("spider", "Spider"),
        ("automated", "Automated"),
    )
)

# Download hourly pageview counts for every (language, access method, agent
# type) combination and write one CSV per language to ../data/<code>.csv.

# Requested time range, formatted YYYYMMDDHH as substituted into the URL.
start_datetime = "2016010100"
end_datetime = "2024123123"
# Column order of the exported CSV files.
cols = ["timestamp", "language", "code", "access_method", "agent_type", "views"]

# (connect, read) timeout in seconds. Without a timeout requests.get() can
# block forever on a stalled connection and hang the whole run.
request_timeout = (10, 120)
# Pause in seconds after each API call so requests are spread out.
request_delay = 10

# NOTE(review): every request is sent with a random browser User-Agent.
# Confirm this is acceptable under the Wikimedia User-Agent policy; a fixed,
# descriptive User-Agent with contact details may be required instead.
ua = UserAgent()

for lang in language_map:
    df_list = []
    for access_method in access_method_map:
        for agent_type in agent_type_map:
            url = (
                "https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/"
                f"{lang}.wikipedia.org/{access_method}/{agent_type}/hourly/"
                f"{start_datetime}/{end_datetime}"
            )
            headers = {"User-Agent": ua.random}
            response = requests.get(url, headers=headers, timeout=request_timeout)
            # Surface HTTP errors (4xx/5xx) as a clear requests.HTTPError
            # instead of an opaque KeyError on the missing "items" key below.
            response.raise_for_status()

            df = pd.DataFrame(response.json()["items"])
            df_list.append(df)
            sleep(request_delay)

    # Concatenate and create final columns
    df_lang = (
        pd.concat(df_list, axis=0, ignore_index=True)
        .rename(columns={"access": "access_method", "agent": "agent_type"})
        .drop(columns=["granularity", "project"])
    )
    df_lang["language"] = language_map[lang]
    df_lang["code"] = lang
    # Ordered-by-definition categoricals: sorting follows the order of the
    # lookup tables rather than alphabetical order of the labels.
    df_lang["access_method"] = pd.Categorical(
        df_lang["access_method"].map(access_method_map),
        categories=list(access_method_map.values()),
    )
    df_lang["agent_type"] = pd.Categorical(
        df_lang["agent_type"].map(agent_type_map),
        categories=list(agent_type_map.values()),
    )
    df_lang["timestamp"] = pd.to_datetime(df_lang["timestamp"], format="%Y%m%d%H")

    # Sort by timestamp, access_method, agent_type
    df_lang = df_lang.sort_values(
        by=["timestamp", "access_method", "agent_type"]
    ).reset_index(drop=True)

    # Reorder columns
    df_lang = df_lang[cols]

    # Save to CSV
    df_lang.to_csv(f"../data/{lang}.csv", index=False)