## Extração dos dumps

In [1]:
%%time
# Install the third-party packages imported by this notebook (requests, tqdm)
%pip install -q requests tqdm

Note: you may need to restart the kernel to use updated packages.
CPU times: user 182 ms, sys: 37.1 ms, total: 219 ms
Wall time: 7.54 s


In [2]:
import hashlib
import os
import requests
import sys
import shutil
import subprocess
from tqdm import tqdm

from typing import Tuple



def create_multistream_dir() -> None:
    """Create the working directories used for the dump archives.

    Makes sure both './multistream/compressed' and
    './multistream/decompressed' exist. Directories that are already
    present are left untouched.
    """
    for subdir in ('compressed', 'decompressed'):
        # Recursive creation; exist_ok=True turns repeated calls into a no-op
        # https://docs.python.org/3/library/os.html#os.makedirs
        os.makedirs('./multistream/' + subdir, exist_ok=True)


def get_articles_details(dump_url: str) -> dict:
    """Fetch the dump status JSON and return the multistream articles job.

    Args:
        dump_url: URL of a 'dumpstatus.json' document
            (ex) https://dumps.wikimedia.org/ptwiki/20240720/dumpstatus.json

    Returns:
        The data['jobs']['articlesmultistreamdumprecombine'] object of the
        status document.

    Raises:
        requests.HTTPError: If the server answers with an error status code.
        KeyError: If the expected keys are missing from the JSON document.
        Exception: If the job status is not 'done' (raised by
            check_articles_status).
    """
    print('[INFO] Fetching dump info')

    # https://requests.readthedocs.io/en/latest/user/quickstart/#make-a-request
    resp = requests.get(url=dump_url)
    # Fail with a clear HTTP error (e.g. 404 for an unknown wikiname/date)
    # instead of a confusing JSON/KeyError further down
    resp.raise_for_status()
    # https://requests.readthedocs.io/en/latest/user/quickstart/#json-response-content
    data = resp.json()

    # (ex) See https://dumps.wikimedia.org/ptwiki/20240720/dumpstatus.json
    articles_details = data['jobs']['articlesmultistreamdumprecombine']

    # Check if 'dumps.wikimedia.org' (API) file's status is set to 'done'
    check_articles_status(articles_details)

    return articles_details


def check_articles_status(articles_details: dict) -> None:
    """Raise unless the job's 'status' entry equals 'done'.

    Args:
        articles_details: Job object taken from the dump status JSON; only
            its 'status' key is read.

    Raises:
        Exception: If the status is anything other than 'done'. The current
            status is included in the message.
    """
    status = articles_details['status']

    # Nothing to report when 'dumps.wikimedia.org' (API) marks the job as finished
    if status == 'done':
        return

    raise Exception("'Article Multistream Dump Precombine' is not 'done' (" + status + ")")


def get_file(dump_url: str) -> Tuple[dict, str]:
    """Locate the multistream XML archive entry in the dump status.

    Args:
        dump_url: URL of the 'dumpstatus.json' document.

    Returns:
        A tuple (file, filename): the file object of the first entry whose
        key ends with '-pages-articles-multistream.xml.bz2', and the bare
        filename extracted from its 'url'.

    Raises:
        IndexError: If no entry carries that suffix.
    """
    # Retrieve information about article dump from 'dumps.wikimedia.org' (API)
    files = get_articles_details(dump_url)['files']

    # The job also lists a '-pages-articles-multistream-index.txt.bz2' file,
    # so the XML archive is selected by its exact suffix
    # (ex) See https://dumps.wikimedia.org/ptwiki/20240720/dumpstatus.json
    suffix = '-pages-articles-multistream.xml.bz2'
    matching = [name for name in files if name.endswith(suffix)]
    entry = files[matching[0]]

    return entry, get_filename(entry)


def get_filename(file: dict) -> str:
    """Return the last path component of file['url'].

    (ex) /ptwiki/20240720/ptwiki-20240720-pages-articles-multistream.xml.bz2
      -> ptwiki-20240720-pages-articles-multistream.xml.bz2
    """
    # rpartition yields (head, sep, tail); tail is everything after the last '/'
    _, _, basename = file['url'].rpartition('/')
    return basename


def check_sha1(path: str, filename: str) -> str:
    """Compute the SHA1 checksum of the file at *path*.

    Despite its name, this function does not compare anything: it returns
    the digest and leaves the comparison to the caller.

    Args:
        path: Location of the file to hash.
        filename: Name shown in the progress message (not used for I/O).

    Returns:
        The SHA1 digest as a hexadecimal string.
    """
    print('[INFO] Checking SHA1 Checksum of \'' + filename + '\'')

    # https://docs.python.org/3/library/hashlib.html#hashlib.sha1
    sha1 = hashlib.sha1()

    # Read in fixed-size chunks so large archives are never loaded whole
    # into memory; iter() stops at the empty bytes returned on EOF
    # https://stackoverflow.com/a/22058673
    with open(path, 'rb') as f:
        for data in iter(lambda: f.read(65536), b''):  # BUFF_SIZE (arbitrary value)
            sha1.update(data)

    # Returns SHA1 in hex format
    # https://docs.python.org/3/library/hashlib.html#hashlib.hash.hexdigest
    return sha1.hexdigest()


def check_disk_space(file: dict) -> None:
    """Ensure the disk has enough free space for the .bz2 archive.

    Args:
        file: File object from the dump status JSON; its 'size' value
            (presumably in bytes, as it is compared with the free byte
            count) is the space required.

    Raises:
        Exception: If the free space reported for "/" is smaller than
            file['size'].
    """
    # NOTE: free space is measured on "/", which is only meaningful when the
    # download directory lives on that same volume
    free = shutil.disk_usage("/").free
    required = file['size']

    if free < required:
        # format() converts the integers; plain '+' concatenation of str and
        # int would raise TypeError and hide the real problem
        raise Exception("Not enough disk space (Needed {} / {})".format(required, free))


def download_multistream(file: dict, filename: str, path: str) -> None:
    """Download the archive described by *file* to *path* and verify it.

    Args:
        file: File object from the dump status JSON; 'url', 'size' (through
            check_disk_space) and 'sha1' are read.
        filename: Archive name, used only in progress and log messages.
        path: Destination path of the downloaded archive.

    Raises:
        Exception: If check_disk_space rejects the download, or if the SHA1
            of the downloaded file differs from file['sha1'].
        requests.HTTPError: If the server answers with an error status code.
    """
    # Abort early when the disk cannot hold the .bz2 archive
    check_disk_space(file)

    # (ex) 'https://dumps.wikimedia.org' + '/ptwiki/20240720/ptwiki-20240720-pages-articles-multistream.xml.bz2'
    url = 'https://dumps.wikimedia.org' + file['url']

    # Stream the body so the archive is written chunk by chunk
    with requests.get(url, stream=True) as response:
        # https://3.python-requests.org/api/#requests.Response.raise_for_status
        response.raise_for_status()
        # Falls back to 0 when the server sends no content-length header
        # https://stackoverflow.com/a/44299915
        expected_bytes = int(response.headers.get('content-length', 0))

        # Progress bar
        # https://tqdm.github.io/docs/tqdm/
        progress = tqdm(
            total=expected_bytes,
            unit='B',
            unit_scale=True,
            desc="Downloading '" + filename + "'",
            initial=0,
            file=sys.stdout,
        )
        with progress, open(path, mode="wb") as archive:
            for block in response.iter_content(chunk_size=10 * 1024):
                archive.write(block)
                progress.update(len(block))

    # Compare downloaded file's SHA1 to expected SHA1 from dumps.wikimedia.org (API)
    actual_sha1 = check_sha1(path, filename)
    expected_sha1 = file['sha1']
    if actual_sha1 != expected_sha1:
        raise Exception("SHA1 Checksum did not match (" + actual_sha1 + "!=" + expected_sha1 + ")")

    print('[INFO] Download completed')


def get_multistream_file(wikinamedate: str) -> str:
    """Make sure the decompressed multistream XML for *wikinamedate* exists.

    An already decompressed file is reused as is. Otherwise the compressed
    archive is downloaded, or re-downloaded when its SHA1 does not match,
    and then extracted.

    Args:
        wikinamedate: Reference in the form 'wikiname/date'
            (ex) 'ptwiki/20240720'

    Returns:
        The archive filename without its '.bz2' suffix
        (ex) ptwiki-20240720-pages-articles-multistream.xml
    """
    # (ex) 'https://dumps.wikimedia.org/' + 'ptwiki/20240720' + '/dumpstatus.json'
    dump_url = 'https://dumps.wikimedia.org/' + wikinamedate + '/dumpstatus.json'

    # Get file JSON object from dumps.wikimedia.org API and its filename
    file, filename = get_file(dump_url)

    # Remove .bz2 suffix from archive filename
    # (ex) ptwiki-20240720-pages-articles-multistream.xml.bz2 -> ptwiki-20240720-pages-articles-multistream.xml
    xml_name = filename[:-4]
    archive_path = os.path.join('./multistream/compressed', filename)

    # Already downloaded and decompressed: nothing left to do
    if os.path.isfile(os.path.join('./multistream/decompressed', xml_name)):
        print('[INFO] Valid matching multistream found in multistream folder')
        return xml_name

    if not os.path.isfile(archive_path):
        # No local archive yet: download it
        download_multistream(file, filename, archive_path)
    elif check_sha1(archive_path, filename) == file['sha1']:
        # Archive was already downloaded and is intact (waiting to be decompressed)
        print('[INFO] Valid matching multistream found in multistream folder')
    else:
        # Local archive is corrupted: drop it and download it again
        print('[INFO] Invalid matching multistream found in multistream folder')
        os.remove(archive_path)
        download_multistream(file, filename, archive_path)

    # Extract .bz2 archive
    extract_dump(filename)

    return xml_name


def delete_corrupted_xmls() -> None:
    """Remove every '.xml' file from './multistream/compressed'.

    An '.xml' file in that folder is treated as a corrupted or unfinished
    extraction and is deleted.
    """
    leftovers = [name for name in os.listdir('./multistream/compressed') if name.endswith('.xml')]

    for name in leftovers:
        os.remove(os.path.join('./multistream/compressed/', name))


def extract_dump(filename: str) -> None:
    """Decompress a downloaded archive and move the result into place.

    Runs 'bzip2 -dk' on './multistream/compressed/<filename>' and then moves
    the resulting file (the filename without its last 4 characters, i.e.
    '.bz2') to './multistream/decompressed/'.

    Args:
        filename: Name of the .bz2 archive inside './multistream/compressed'.

    Raises:
        subprocess.CalledProcessError: If bzip2 exits with a non-zero status.
        FileNotFoundError: If the bzip2 executable cannot be found.
    """
    print('[INFO] Extracting multistream .bz2 file (this might take several minutes)')

    # (ex) ptwiki-20240720-pages-articles-multistream.xml.bz2 -> ptwiki-20240720-pages-articles-multistream.xml
    xml_name = filename[:-4]

    # bzip2 -dk ./multistream/compressed/...
    # check=True stops here when the extraction fails, instead of letting
    # os.replace below fail with a misleading "file not found" error
    # https://superuser.com/a/480951
    subprocess.run(["bzip2", "-dk", './multistream/compressed/' + filename], check=True)

    # https://docs.python.org/3/library/os.html#os.replace
    # https://stackoverflow.com/a/8858026
    os.replace(
        os.path.join('./multistream/compressed/', xml_name),
        os.path.join('./multistream/decompressed/', xml_name),
    )


def select_dump() -> Tuple[str, str]:
    """Ask which dump to use and make it available locally.

    Prepares the working directories, clears leftover extractions, prompts
    for a 'wikiname/date' reference and then finds, downloads, checks and
    extracts the matching .xml file.

    Returns:
        A tuple (wikinamedate, filename): the reference typed by the user and
        the value returned by get_multistream_file for it.
    """
    # Housekeeping: required directories first, then uncompleted extractions
    create_multistream_dir()
    delete_corrupted_xmls()

    # Request user's input for a wikiname/date reference
    wikinamedate = input("Fill with wikiname/date [ex: 'ptwiki/20240720']")
    print()

    # Find, download, check and extract .xml file
    filename = get_multistream_file(wikinamedate)

    print("[INFO] '{}' selected".format(wikinamedate))

    return wikinamedate, filename

In [3]:
%%time
# Find, download, check and extract .xml file
# (select_dump prompts for a 'wikiname/date' reference, e.g. 'ptwiki/20240720')
wikinamedate, filename = select_dump()

# Save both variables with the %store magic so other Jupyter notebooks can reuse them
%store filename wikinamedate

Fill with wikiname/date [ex: 'ptwiki/20240720'] ptwiki/20240720



[INFO] Fetching dump info
[INFO] Valid matching multistream found in multistream folder
[INFO] 'ptwiki/20240720' selected
Stored 'filename' (str)
Stored 'wikinamedate' (str)
CPU times: user 138 ms, sys: 28.3 ms, total: 166 ms
Wall time: 4.99 s
