In [2]:
import requests
# Library for parsing HTML
from bs4 import BeautifulSoup
# Root of the Wikimedia dump mirror and the English-Wikipedia listing under it.
base_url = 'https://dumps.wikimedia.org'
enwiki_url = base_url + '/enwiki'
index_response = requests.get(enwiki_url)
# Fail loudly on a 4xx/5xx answer instead of scraping links out of an error page.
index_response.raise_for_status()
index = index_response.text
soup_index = BeautifulSoup(index, 'html.parser')
# Collect the href of every anchor that carries one; `href=True` makes
# BeautifulSoup skip anchors without the attribute.
dumps = [a['href'] for a in soup_index.find_all('a', href=True)]
dumps

['../',
 '20210420/',
 '20210501/',
 '20210520/',
 '20210601/',
 '20210620/',
 '20210701/',
 '20210720/',
 'latest/']

In [3]:
# Listing page of the specific dump run to download.
dump_url = enwiki_url + '/20210720'
# Retrieve the html; stop on an HTTP error rather than parsing the error page.
dump_response = requests.get(dump_url)
dump_response.raise_for_status()
dump_html = dump_response.text
# Convert to a soup
soup_dump = BeautifulSoup(dump_html, 'html.parser')

In [4]:
import requests
from pathlib import Path
from tqdm import tqdm

# Local download directory: a `wikipedia` folder inside the user's home.
wikipedia_dir = Path.home().joinpath('wikipedia')
# Create it (and any missing parents); an already existing folder is fine.
wikipedia_dir.mkdir(exist_ok=True, parents=True)

In [5]:
# From the <li class="file"> entries of the dump page, keep those whose markup
# mentions "multistream" and take the href of each entry's first link.
targets = [
    item.a.attrs["href"]
    for item in soup_dump.find_all('li', {'class': 'file'})
    if "multistream" in str(item)
]
# The local file name is the last path component of each link.
destinations = [href.rsplit('/', 1)[-1] for href in targets]

In [4]:
# Stream every selected remote file into `wikipedia_dir`, one at a time,
# showing a byte-based progress bar per file.
for target, destination in zip(targets, destinations):
    print('target: ', base_url + target)
    print('destination: ', wikipedia_dir / destination)
    # The `with` block releases the connection even if the transfer fails midway.
    with requests.get(base_url + target, stream=True) as response:
        # Abort on 4xx/5xx instead of saving the error page as if it were a dump file.
        response.raise_for_status()
        # 0 means the server did not announce a size; the size check below is then skipped.
        total_size_in_bytes = int(response.headers.get('content-length', 0))
        # 1 MiB chunks: far fewer Python-level iterations than 1 KiB for multi-GB files.
        block_size = 1024 * 1024
        with tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True) as progress_bar, \
                (wikipedia_dir / destination).open('wb') as f:
            for data in response.iter_content(block_size):
                progress_bar.update(len(data))
                f.write(data)
    # The bar's counter holds the number of bytes actually written.
    if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
        print("ERROR, something went wrong")

https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream.xml.bz2


100%|██████████| 19.9G/19.9G [2:02:17<00:00, 2.71MiB/s]  


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index.txt.bz2


100%|██████████| 235M/235M [01:25<00:00, 2.73MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream1.xml-p1p41242.bz2


100%|██████████| 251M/251M [01:30<00:00, 2.78MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index1.txt-p1p41242.bz2


100%|██████████| 227k/227k [00:00<00:00, 313kiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream2.xml-p41243p151573.bz2


100%|██████████| 339M/339M [01:49<00:00, 3.10MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index2.txt-p41243p151573.bz2


100%|██████████| 653k/653k [00:01<00:00, 394kiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream3.xml-p151574p311329.bz2


100%|██████████| 367M/367M [02:08<00:00, 2.86MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index3.txt-p151574p311329.bz2


100%|██████████| 841k/841k [00:03<00:00, 274kiB/s]  


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream4.xml-p311330p558391.bz2


100%|██████████| 408M/408M [02:33<00:00, 2.66MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index4.txt-p311330p558391.bz2


100%|██████████| 1.35M/1.35M [00:01<00:00, 817kiB/s]


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream5.xml-p558392p958045.bz2


100%|██████████| 440M/440M [02:24<00:00, 3.05MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index5.txt-p558392p958045.bz2


100%|██████████| 2.16M/2.16M [00:01<00:00, 1.71MiB/s]


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream6.xml-p958046p1483661.bz2


100%|██████████| 472M/472M [02:39<00:00, 2.97MiB/s]  


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index6.txt-p958046p1483661.bz2


100%|██████████| 2.62M/2.62M [00:04<00:00, 546kiB/s]


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream7.xml-p1483662p2134111.bz2


100%|██████████| 487M/487M [02:55<00:00, 2.78MiB/s]   


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index7.txt-p1483662p2134111.bz2


100%|██████████| 3.15M/3.15M [00:06<00:00, 486kiB/s]


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream8.xml-p2134112p2936260.bz2


100%|██████████| 500M/500M [02:52<00:00, 2.89MiB/s]  


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index8.txt-p2134112p2936260.bz2


100%|██████████| 3.78M/3.78M [00:05<00:00, 708kiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream9.xml-p2936261p4045402.bz2


100%|██████████| 545M/545M [03:20<00:00, 2.72MiB/s]  


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index9.txt-p2936261p4045402.bz2


100%|██████████| 4.45M/4.45M [00:02<00:00, 2.14MiB/s]


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream10.xml-p4045403p5399366.bz2


100%|██████████| 538M/538M [03:10<00:00, 2.82MiB/s]  


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index10.txt-p4045403p5399366.bz2


100%|██████████| 4.90M/4.90M [00:04<00:00, 1.22MiB/s]


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream11.xml-p5399367p6899366.bz2


100%|██████████| 525M/525M [03:07<00:00, 2.80MiB/s]  


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index11.txt-p5399367p6899366.bz2


100%|██████████| 5.11M/5.11M [00:06<00:00, 799kiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream11.xml-p6899367p7054859.bz2


100%|██████████| 50.3M/50.3M [00:22<00:00, 2.28MiB/s]


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index11.txt-p6899367p7054859.bz2


100%|██████████| 494k/494k [00:00<00:00, 610kiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream12.xml-p7054860p8554859.bz2


100%|██████████| 437M/437M [02:36<00:00, 2.79MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index12.txt-p7054860p8554859.bz2


100%|██████████| 4.66M/4.66M [00:03<00:00, 1.49MiB/s]


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream12.xml-p8554860p9172788.bz2


100%|██████████| 177M/177M [01:03<00:00, 2.79MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index12.txt-p8554860p9172788.bz2


100%|██████████| 1.88M/1.88M [00:05<00:00, 371kiB/s]


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream13.xml-p9172789p10672788.bz2


100%|██████████| 356M/356M [02:04<00:00, 2.86MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index13.txt-p9172789p10672788.bz2


100%|██████████| 3.94M/3.94M [00:05<00:00, 760kiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream13.xml-p10672789p11659682.bz2


100%|██████████| 248M/248M [01:39<00:00, 2.49MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index13.txt-p10672789p11659682.bz2


100%|██████████| 2.73M/2.73M [00:03<00:00, 716kiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream14.xml-p11659683p13159682.bz2


100%|██████████| 428M/428M [02:36<00:00, 2.75MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index14.txt-p11659683p13159682.bz2


100%|██████████| 5.14M/5.14M [00:03<00:00, 1.60MiB/s]


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream14.xml-p13159683p14324602.bz2


100%|██████████| 296M/296M [01:49<00:00, 2.70MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index14.txt-p13159683p14324602.bz2


100%|██████████| 3.63M/3.63M [00:04<00:00, 791kiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream15.xml-p14324603p15824602.bz2


100%|██████████| 387M/387M [02:20<00:00, 2.75MiB/s]   


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index15.txt-p14324603p15824602.bz2


100%|██████████| 5.07M/5.07M [00:04<00:00, 1.22MiB/s]


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream15.xml-p15824603p17324602.bz2


100%|██████████| 337M/337M [02:00<00:00, 2.79MiB/s]   


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index15.txt-p15824603p17324602.bz2


100%|██████████| 5.39M/5.39M [00:02<00:00, 1.99MiB/s]


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream15.xml-p17324603p17460152.bz2


100%|██████████| 30.4M/30.4M [00:16<00:00, 1.80MiB/s]


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index15.txt-p17324603p17460152.bz2


100%|██████████| 372k/372k [00:00<00:00, 624kiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream16.xml-p17460153p18960152.bz2


100%|██████████| 365M/365M [02:20<00:00, 2.61MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index16.txt-p17460153p18960152.bz2


100%|██████████| 4.58M/4.58M [00:03<00:00, 1.16MiB/s]


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream16.xml-p18960153p20460152.bz2


100%|██████████| 341M/341M [02:14<00:00, 2.53MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index16.txt-p18960153p20460152.bz2


100%|██████████| 4.54M/4.54M [00:02<00:00, 1.56MiB/s]


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream16.xml-p20460153p20570392.bz2


100%|██████████| 24.8M/24.8M [00:12<00:00, 2.02MiB/s]


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index16.txt-p20460153p20570392.bz2


100%|██████████| 296k/296k [00:01<00:00, 294kiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream17.xml-p20570393p22070392.bz2


100%|██████████| 380M/380M [02:02<00:00, 3.11MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index17.txt-p20570393p22070392.bz2


100%|██████████| 4.77M/4.77M [00:06<00:00, 768kiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream17.xml-p22070393p23570392.bz2


100%|██████████| 392M/392M [02:11<00:00, 2.98MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index17.txt-p22070393p23570392.bz2


100%|██████████| 5.32M/5.32M [00:08<00:00, 641kiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream17.xml-p23570393p23716197.bz2


100%|██████████| 43.5M/43.5M [00:20<00:00, 2.15MiB/s]


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index17.txt-p23570393p23716197.bz2


100%|██████████| 579k/579k [00:01<00:00, 537kiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream18.xml-p23716198p25216197.bz2


100%|██████████| 405M/405M [02:11<00:00, 3.08MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index18.txt-p23716198p25216197.bz2


100%|██████████| 5.12M/5.12M [00:04<00:00, 1.23MiB/s]


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream18.xml-p25216198p26716197.bz2


100%|██████████| 374M/374M [02:06<00:00, 2.96MiB/s]   


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index18.txt-p25216198p26716197.bz2


100%|██████████| 4.75M/4.75M [00:03<00:00, 1.38MiB/s]


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream18.xml-p26716198p27121850.bz2


100%|██████████| 94.8M/94.8M [00:47<00:00, 2.00MiB/s]


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index18.txt-p26716198p27121850.bz2


100%|██████████| 1.42M/1.42M [00:03<00:00, 367kiB/s]


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream19.xml-p27121851p28621850.bz2


100%|██████████| 367M/367M [02:38<00:00, 2.31MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index19.txt-p27121851p28621850.bz2


100%|██████████| 5.43M/5.43M [00:05<00:00, 939kiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream19.xml-p28621851p30121850.bz2


100%|██████████| 323M/323M [01:59<00:00, 2.69MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index19.txt-p28621851p30121850.bz2


100%|██████████| 5.07M/5.07M [00:06<00:00, 764kiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream19.xml-p30121851p31308442.bz2


100%|██████████| 302M/302M [01:57<00:00, 2.56MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index19.txt-p30121851p31308442.bz2


100%|██████████| 3.69M/3.69M [00:05<00:00, 714kiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream20.xml-p31308443p32808442.bz2


100%|██████████| 410M/410M [02:44<00:00, 2.50MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index20.txt-p31308443p32808442.bz2


100%|██████████| 5.29M/5.29M [00:07<00:00, 732kiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream20.xml-p32808443p34308442.bz2


100%|██████████| 378M/378M [02:17<00:00, 2.75MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index20.txt-p32808443p34308442.bz2


100%|██████████| 5.04M/5.04M [00:07<00:00, 708kiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream20.xml-p34308443p35522432.bz2


100%|██████████| 279M/279M [01:55<00:00, 2.41MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index20.txt-p34308443p35522432.bz2


100%|██████████| 3.91M/3.91M [00:02<00:00, 1.62MiB/s]


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream21.xml-p35522433p37022432.bz2


100%|██████████| 382M/382M [02:26<00:00, 2.61MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index21.txt-p35522433p37022432.bz2


100%|██████████| 5.29M/5.29M [00:02<00:00, 2.37MiB/s]


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream21.xml-p37022433p38522432.bz2


100%|██████████| 370M/370M [02:37<00:00, 2.35MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index21.txt-p37022433p38522432.bz2


100%|██████████| 5.17M/5.17M [00:04<00:00, 1.17MiB/s]


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream21.xml-p38522433p39996245.bz2


100%|██████████| 374M/374M [02:32<00:00, 2.45MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index21.txt-p38522433p39996245.bz2


100%|██████████| 5.15M/5.15M [00:05<00:00, 896kiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream22.xml-p39996246p41496245.bz2


100%|██████████| 368M/368M [02:15<00:00, 2.72MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index22.txt-p39996246p41496245.bz2


100%|██████████| 5.02M/5.02M [00:04<00:00, 1.07MiB/s]


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream22.xml-p41496246p42996245.bz2


100%|██████████| 378M/378M [02:09<00:00, 2.93MiB/s]   


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index22.txt-p41496246p42996245.bz2


100%|██████████| 5.13M/5.13M [00:06<00:00, 794kiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream22.xml-p42996246p44496245.bz2


100%|██████████| 384M/384M [02:17<00:00, 2.80MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index22.txt-p42996246p44496245.bz2


100%|██████████| 5.58M/5.58M [00:06<00:00, 858kiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream22.xml-p44496246p44788941.bz2


100%|██████████| 60.5M/60.5M [00:40<00:00, 1.50MiB/s]


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index22.txt-p44496246p44788941.bz2


100%|██████████| 861k/861k [00:02<00:00, 327kiB/s]  


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream23.xml-p44788942p46288941.bz2


100%|██████████| 246M/246M [01:34<00:00, 2.61MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index23.txt-p44788942p46288941.bz2


100%|██████████| 3.33M/3.33M [00:04<00:00, 735kiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream23.xml-p46288942p47788941.bz2


100%|██████████| 393M/393M [02:45<00:00, 2.38MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index23.txt-p46288942p47788941.bz2


100%|██████████| 5.57M/5.57M [00:07<00:00, 794kiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream23.xml-p47788942p49288941.bz2


100%|██████████| 330M/330M [02:00<00:00, 2.74MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index23.txt-p47788942p49288941.bz2


100%|██████████| 4.54M/4.54M [00:05<00:00, 772kiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream23.xml-p49288942p50564553.bz2


100%|██████████| 259M/259M [01:53<00:00, 2.29MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index23.txt-p49288942p50564553.bz2


100%|██████████| 4.39M/4.39M [00:06<00:00, 628kiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream24.xml-p50564554p52064553.bz2


100%|██████████| 351M/351M [02:14<00:00, 2.61MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index24.txt-p50564554p52064553.bz2


100%|██████████| 5.27M/5.27M [00:06<00:00, 799kiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream24.xml-p52064554p53564553.bz2


100%|██████████| 351M/351M [02:03<00:00, 2.84MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index24.txt-p52064554p53564553.bz2


100%|██████████| 5.14M/5.14M [00:04<00:00, 1.27MiB/s]


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream24.xml-p53564554p55064553.bz2


100%|██████████| 337M/337M [02:10<00:00, 2.58MiB/s]   


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index24.txt-p53564554p55064553.bz2


100%|██████████| 4.99M/4.99M [00:05<00:00, 863kiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream24.xml-p55064554p56564553.bz2


100%|██████████| 354M/354M [02:04<00:00, 2.83MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index24.txt-p55064554p56564553.bz2


100%|██████████| 5.41M/5.41M [00:02<00:00, 1.83MiB/s]


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream24.xml-p56564554p57025655.bz2


100%|██████████| 111M/111M [00:45<00:00, 2.43MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index24.txt-p56564554p57025655.bz2


100%|██████████| 1.63M/1.63M [00:02<00:00, 549kiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream25.xml-p57025656p58525655.bz2


100%|██████████| 365M/365M [03:01<00:00, 2.01MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index25.txt-p57025656p58525655.bz2


100%|██████████| 5.13M/5.13M [00:04<00:00, 1.18MiB/s]


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream25.xml-p58525656p60025655.bz2


100%|██████████| 325M/325M [02:24<00:00, 2.26MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index25.txt-p58525656p60025655.bz2


100%|██████████| 5.19M/5.19M [00:05<00:00, 1.01MiB/s]


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream25.xml-p60025656p61525655.bz2


100%|██████████| 360M/360M [02:48<00:00, 2.13MiB/s]   


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index25.txt-p60025656p61525655.bz2


100%|██████████| 5.25M/5.25M [00:13<00:00, 396kiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream25.xml-p61525656p62585850.bz2


100%|██████████| 250M/250M [01:54<00:00, 2.18MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index25.txt-p61525656p62585850.bz2


100%|██████████| 3.53M/3.53M [00:04<00:00, 728kiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream26.xml-p62585851p63975909.bz2


100%|██████████| 365M/365M [02:39<00:00, 2.29MiB/s]   


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index26.txt-p62585851p63975909.bz2


100%|██████████| 5.14M/5.14M [00:08<00:00, 594kiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream27.xml-p63975910p65475909.bz2


100%|██████████| 332M/332M [02:23<00:00, 2.31MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index27.txt-p63975910p65475909.bz2


100%|██████████| 4.78M/4.78M [00:04<00:00, 1.06MiB/s]


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream27.xml-p65475910p66975909.bz2


100%|██████████| 367M/367M [02:47<00:00, 2.19MiB/s]   


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index27.txt-p65475910p66975909.bz2


100%|██████████| 5.18M/5.18M [00:05<00:00, 888kiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream27.xml-p66975910p68286200.bz2


100%|██████████| 341M/341M [02:33<00:00, 2.23MiB/s] 


https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-pages-articles-multistream-index27.txt-p66975910p68286200.bz2


100%|██████████| 5.19M/5.19M [00:02<00:00, 2.21MiB/s]


In [8]:
from typing import Union
from pathlib import Path
from pprint import pprint
from hashlib import md5



def pairwise(iterable):
    """Yield consecutive, non-overlapping pairs: (s0, s1), (s2, s3), ...

    A trailing element without a partner is dropped.
    """
    # Both zip arguments are the SAME iterator, so each tuple consumes two items.
    shared = iter(iterable)
    return zip(shared, shared)

def check_downloads(wiki_dir, destinations, md5sum_url):
    """Verify downloaded files against the MD5 manifest published at ``md5sum_url``.

    Args:
        wiki_dir: Directory containing the downloaded files (``str`` or ``Path``).
        destinations: File names, relative to ``wiki_dir``, to verify.
        md5sum_url: URL of a text manifest; each line is read as
            ``<md5-hex> <file-name>``.

    Raises:
        AssertionError: A file is missing, or its MD5 digest differs from the
            manifest entry.
        KeyError: A file name has no entry in the manifest.
        requests.HTTPError: The manifest request returned an error status.

    Prints a bold ``OK`` line for every file that verifies.
    """
    response = requests.get(md5sum_url)
    # Do not parse an HTTP error page as if it were the manifest.
    response.raise_for_status()

    # Map file name -> expected digest, one manifest line at a time.
    md5sum_dict = {}
    for line in response.text.splitlines():
        fields = line.split(maxsplit=1)
        if len(fields) == 2:
            digest, name = fields
            md5sum_dict[name.strip()] = digest

    for dest in destinations:
        # Path(...) first so a plain string directory works as well as a Path.
        dest_path = Path(wiki_dir) / dest
        # Explicit raises (not `assert`) so the checks survive `python -O`.
        if not dest_path.is_file():
            raise AssertionError(f'{dest_path} is missing or not a regular file')
        # Look the entry up before hashing, so a missing entry fails fast.
        if dest not in md5sum_dict:
            raise KeyError(f'{dest} is not listed in {md5sum_url}')
        expected_md5sum = md5sum_dict[dest]

        # Hash in fixed-size chunks so multi-GB files never sit in memory at once.
        file_hash = md5()
        with dest_path.open('rb') as f:
            while chunk := f.read(8192):
                file_hash.update(chunk)
        actual_md5sum = file_hash.hexdigest()

        if actual_md5sum != expected_md5sum:
            raise AssertionError(
                f'MD5 mismatch for {dest_path}: '
                f'expected {expected_md5sum}, got {actual_md5sum}'
            )
        print('\033[1m' + 'OK' + '\033[0m', dest_path)
        


In [9]:
# Checksum manifest published alongside the 20210720 dump files.
md5sums_url = 'https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-md5sums.txt'
check_downloads(wikipedia_dir, destinations, md5sums_url)

[1mOK[0m /home/dnk8n/wikipedia/enwiki-20210720-pages-articles-multistream.xml.bz2
[1mOK[0m /home/dnk8n/wikipedia/enwiki-20210720-pages-articles-multistream-index.txt.bz2
[1mOK[0m /home/dnk8n/wikipedia/enwiki-20210720-pages-articles-multistream1.xml-p1p41242.bz2
[1mOK[0m /home/dnk8n/wikipedia/enwiki-20210720-pages-articles-multistream-index1.txt-p1p41242.bz2
[1mOK[0m /home/dnk8n/wikipedia/enwiki-20210720-pages-articles-multistream2.xml-p41243p151573.bz2
[1mOK[0m /home/dnk8n/wikipedia/enwiki-20210720-pages-articles-multistream-index2.txt-p41243p151573.bz2
[1mOK[0m /home/dnk8n/wikipedia/enwiki-20210720-pages-articles-multistream3.xml-p151574p311329.bz2
[1mOK[0m /home/dnk8n/wikipedia/enwiki-20210720-pages-articles-multistream-index3.txt-p151574p311329.bz2
[1mOK[0m /home/dnk8n/wikipedia/enwiki-20210720-pages-articles-multistream4.xml-p311330p558391.bz2
[1mOK[0m /home/dnk8n/wikipedia/enwiki-20210720-pages-articles-multistream-index4.txt-p311330p558391.bz2
[1mOK[0m /home

[1mOK[0m /home/dnk8n/wikipedia/enwiki-20210720-pages-articles-multistream21.xml-p38522433p39996245.bz2
[1mOK[0m /home/dnk8n/wikipedia/enwiki-20210720-pages-articles-multistream-index21.txt-p38522433p39996245.bz2
[1mOK[0m /home/dnk8n/wikipedia/enwiki-20210720-pages-articles-multistream22.xml-p39996246p41496245.bz2
[1mOK[0m /home/dnk8n/wikipedia/enwiki-20210720-pages-articles-multistream-index22.txt-p39996246p41496245.bz2
[1mOK[0m /home/dnk8n/wikipedia/enwiki-20210720-pages-articles-multistream22.xml-p41496246p42996245.bz2
[1mOK[0m /home/dnk8n/wikipedia/enwiki-20210720-pages-articles-multistream-index22.txt-p41496246p42996245.bz2
[1mOK[0m /home/dnk8n/wikipedia/enwiki-20210720-pages-articles-multistream22.xml-p42996246p44496245.bz2
[1mOK[0m /home/dnk8n/wikipedia/enwiki-20210720-pages-articles-multistream-index22.txt-p42996246p44496245.bz2
[1mOK[0m /home/dnk8n/wikipedia/enwiki-20210720-pages-articles-multistream22.xml-p44496246p44788941.bz2
[1mOK[0m /home/dnk8n/wikipedi