# Mixing Secrets data download

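This notebook was used to download the Mixing Secrets data: it scrapes the multitrack index at https://www.cambridge-mt.com/ms/mtk/, then downloads every "Full Multitrack" zip archive and extracts it into `download/<track_id>/`.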

Copyright 2020 InterDigital R&D and Télécom Paris.  
Author: Ondřej Cífka

In [2]:
import bs4
import concurrent.futures as cf
import io
import json
import os
import re
import sys
import time
import traceback
import urllib.request
import warnings
import zipfile

from tqdm.auto import tqdm

In [2]:
# Headers attached to every urllib request made in this notebook. The user
# agent imitates a desktop Chrome browser (presumably because the server
# rejects urllib's default user agent -- TODO confirm).
HTTP_HEADERS = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36"
    ),
}

In [3]:
# Fetch the Mixing Secrets multitrack index page once and cache it on disk as
# index.html; when the file is already present the download is skipped and the
# cached copy is parsed instead.
if os.path.exists('index.html'):
    warnings.warn('index.html already exists, skipping download')
else:
    req = urllib.request.Request('https://www.cambridge-mt.com/ms/mtk/', headers=HTTP_HEADERS)
    with urllib.request.urlopen(req) as response:
        index_bytes = response.read()  # raw bytes; written to disk undecoded
    with open('index.html', 'wb') as f:
        f.write(index_bytes)

with open('index.html', 'rb') as f:
    # Name the parser explicitly. Without it Beautiful Soup picks whichever
    # parser happens to be installed (and warns about it), so the parse tree
    # could differ between machines. 'html.parser' ships with Python.
    soup = bs4.BeautifulSoup(f.read(), 'html.parser')

  


In [4]:
# Walk the index page (category -> artist -> track) and build one metadata
# record for every track that offers a "Full Multitrack" zip download.
metadata = []
for genre_e in soup.find_all(class_='c-mtk__genre'):
    for artist_e in genre_e.find_all(class_='c-mtk__artist'):
        for track_e in artist_e.find_all(class_='m-mtk-track'):
            # Reset for every track: otherwise a track without a full
            # multitrack entry would silently reuse the previous track's URLs
            # (or raise NameError if it is the very first track).
            urls = []
            for download_e in track_e.find_all(class_='m-mtk-download'):
                if re.search(r'Full\s+Multitrack', download_e.find(class_='m-mtk-download__type').text):
                    urls = [a['href'] for a in download_e.find(class_='m-mtk-download__links').find_all('a')]
                    # Keep only the zip archives among the download links.
                    urls = [url for url in urls if url.endswith('.zip')]
                    break

            if not urls:
                # Nothing to download for this track; skip it but keep
                # scanning the artist's remaining tracks.
                continue

            metadata.append({
                # The ID is the archive's base name without the "_Full",
                # "_Full_2" or "_Full_Pt2"-style suffix, taken from the first URL.
                'track_id': re.match(r'^.*/([^/.]+)_Full(_(Pt)?[0-9]+)?\.zip$', urls[0]).group(1),
                'track_name': track_e.find(class_='m-mtk-track__name').text,
                'artist': artist_e.find('h4').text,
                # The genre label is the last child node of the artist's title bar.
                'genre': list(artist_e.find(class_='m-container__title-bar').children)[-1].text,
                'category': genre_e.find('h3').text,
                'urls': urls
            })

In [5]:
# Show the scraped records as the cell output for a visual sanity check.
metadata

[{'track_id': 'AbletonesBigBand_CorineCorine',
  'track_name': "'Corine, Corine'",
  'artist': 'The Abletones Big Band',
  'genre': 'Big Band Jazz',
  'category': 'Acoustic / Jazz / Country / Orchestral',
  'urls': ['https://mtkdata.cambridgemusictechnology.co.uk/Telefunken/AbletonesBigBand_CorineCorine_Full.zip']},
 {'track_id': 'AbletonesBigBand_SongOfIndia',
  'track_name': "'Song Of India'",
  'artist': 'The Abletones Big Band',
  'genre': 'Big Band Jazz',
  'category': 'Acoustic / Jazz / Country / Orchestral',
  'urls': ['https://mtkdata.cambridgemusictechnology.co.uk/Telefunken/AbletonesBigBand_SongOfIndia_Full.zip']},
 {'track_id': 'AlanEvansTrio_ImComingHome',
  'track_name': "'I'm Coming Home'",
  'artist': 'Alan Evans Trio',
  'genre': 'Live Funk Rock',
  'category': 'Acoustic / Jazz / Country / Orchestral',
  'urls': ['https://mtkdata.cambridgemusictechnology.co.uk/Telefunken/AlanEvansTrio_ImComingHome_Full.zip']},
 {'track_id': 'AlejoGranados_RumbaChonta',
  'track_name': "

In [6]:
# Write the scraped metadata list to metadata.json as a single JSON document.
with open('metadata.json', 'w') as out_file:
    out_file.write(json.dumps(metadata))

In [7]:
NUM_RETRIES = 3           # attempts per archive before giving up
RETRY_DELAY_SECONDS = 60  # pause between two attempts for the same archive


def download_file(item, num_retries=None, retry_delay=RETRY_DELAY_SECONDS):
    """Download one zip archive and unpack it into ``download/<track_id>/``.

    Args:
        item: ``(track_id, url)`` tuple; ``url`` must point to a zip archive.
        num_retries: Maximum number of attempts. ``None`` (the default) uses
            the module-level ``NUM_RETRIES``.
        retry_delay: Seconds to sleep between two failed attempts.

    Returns:
        ``(item, success)`` where ``success`` is ``True`` if the archive was
        downloaded and extracted, ``False`` if every attempt failed.

    Exceptions raised while downloading or extracting are printed and retried
    rather than propagated, so one bad URL does not abort a whole
    ``pool.map`` run. ``KeyboardInterrupt``/``SystemExit`` are not intercepted.
    """
    track_id, url = item
    if num_retries is None:
        num_retries = NUM_RETRIES
    extract_dir = os.path.join('download', track_id)
    os.makedirs(extract_dir, exist_ok=True)

    for attempt in range(1, num_retries + 1):
        try:
            req = urllib.request.Request(url, headers=HTTP_HEADERS)
            with urllib.request.urlopen(req) as response:
                # The whole archive is buffered in memory before extraction.
                with zipfile.ZipFile(io.BytesIO(response.read())) as zf:
                    for name in zf.namelist():
                        if os.path.exists(os.path.join(extract_dir, name)):
                            warnings.warn(f'Overwriting {name} from {track_id}')
                    zf.extractall(path=extract_dir)
            return (item, True)
        except Exception:
            # Not a bare ``except``: interrupts must still be able to stop the run.
            traceback.print_exc()
            if attempt < num_retries:
                # Only wait when another attempt actually follows.
                time.sleep(retry_delay)

    warnings.warn(f'Failed url: {url} track_id: {track_id}')
    return (item, False)

# One (track_id, url) work item per archive; a track may be split over several zips.
items = [(track['track_id'], url) for track in metadata for url in track['urls']]

# First pass: download and extract everything with 15 worker processes,
# showing progress as results come back in input order.
with cf.ProcessPoolExecutor(max_workers=15) as executor:
    outcomes = executor.map(download_file, items)
    result = list(tqdm(outcomes, total=len(items)))

num_ok = sum(ok for _, ok in result)
print('{} / {} files downloaded and extracted successfully'.format(num_ok, len(items)),
      file=sys.stderr)

HBox(children=(FloatProgress(value=0.0, max=438.0), HTML(value='')))

Traceback (most recent call last):
  File "<ipython-input-7-dc8c99107ec1>", line 12, in download_file
    with zipfile.ZipFile(io.BytesIO(response.read())) as zf:
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/http/client.py", line 470, in read
    s = self._safe_read(self.length)
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/http/client.py", line 622, in _safe_read
    raise IncompleteRead(b''.join(s), amt)
http.client.IncompleteRead: IncompleteRead(278313814 bytes read, 168986639 more expected)
Traceback (most recent call last):
  File "<ipython-input-7-dc8c99107ec1>", line 12, in download_file
    with zipfile.ZipFile(io.BytesIO(response.read())) as zf:
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/http/client.py", line 470, in read
    s = self._safe_read(self.length)
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/http/client.py", line 622, in _safe_read
    raise IncompleteRead(b''.join(s), amt)
http.client.IncompleteRead: IncompleteRead(10539844 byte

Traceback (most recent call last):
  File "<ipython-input-7-dc8c99107ec1>", line 12, in download_file
    with zipfile.ZipFile(io.BytesIO(response.read())) as zf:
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/http/client.py", line 470, in read
    s = self._safe_read(self.length)
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/http/client.py", line 622, in _safe_read
    raise IncompleteRead(b''.join(s), amt)
http.client.IncompleteRead: IncompleteRead(341410634 bytes read, 298077245 more expected)
Traceback (most recent call last):
  File "<ipython-input-7-dc8c99107ec1>", line 12, in download_file
    with zipfile.ZipFile(io.BytesIO(response.read())) as zf:
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/http/client.py", line 470, in read
    s = self._safe_read(self.length)
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/http/client.py", line 622, in _safe_read
    raise IncompleteRead(b''.join(s), amt)
http.client.IncompleteRead: IncompleteRead(217478394 byt

  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/urllib/request.py", line 641, in http_response
    'http', request, response, code, msg, hdrs)
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/urllib/request.py", line 569, in error
    return self._call_chain(*args)
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/urllib/request.py", line 503, in _call_chain
    result = func(*args)
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/urllib/request.py", line 649, in http_error_default
    raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 522: Origin Connection Time-out
Traceback (most recent call last):
Traceback (most recent call last):
  File "<ipython-input-7-dc8c99107ec1>", line 11, in download_file
    with urllib.request.urlopen(req) as response:
  File "<ipython-input-7-dc8c99107ec1>", line 11, in download_file
    with urllib.request.urlopen(req) as response:
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/urllib/reques

  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/urllib/request.py", line 531, in open
    response = meth(req, response)
Traceback (most recent call last):
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/urllib/request.py", line 641, in http_response
    'http', request, response, code, msg, hdrs)
  File "<ipython-input-7-dc8c99107ec1>", line 11, in download_file
    with urllib.request.urlopen(req) as response:
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/urllib/request.py", line 569, in error
    return self._call_chain(*args)
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/urllib/request.py", line 222, in urlopen
    return opener.open(url, data, timeout)
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/urllib/request.py", line 503, in _call_chain
    result = func(*args)
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/urllib/request.py", line 531, in open
    response = meth(req, response)
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/u

  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/urllib/request.py", line 222, in urlopen
    return opener.open(url, data, timeout)
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/urllib/request.py", line 531, in open
    response = meth(req, response)
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/urllib/request.py", line 641, in http_response
    'http', request, response, code, msg, hdrs)
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/urllib/request.py", line 569, in error
    return self._call_chain(*args)
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/urllib/request.py", line 503, in _call_chain
    result = func(*args)
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/urllib/request.py", line 649, in http_error_default
    raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 522: Origin Connection Time-out
Traceback (most recent call last):
  File "<ipython-input-7-dc8c99107ec1>", line 11, in download_file
    

  File "<ipython-input-7-dc8c99107ec1>", line 11, in download_file
    with urllib.request.urlopen(req) as response:
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/urllib/request.py", line 222, in urlopen
    return opener.open(url, data, timeout)
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/urllib/request.py", line 531, in open
    response = meth(req, response)
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/urllib/request.py", line 641, in http_response
    'http', request, response, code, msg, hdrs)
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/urllib/request.py", line 569, in error
    return self._call_chain(*args)
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/urllib/request.py", line 503, in _call_chain
    result = func(*args)
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/urllib/request.py", line 649, in http_error_default
    raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 522: Origin Connectio

  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/urllib/request.py", line 503, in _call_chain
    result = func(*args)
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/urllib/request.py", line 649, in http_error_default
    raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 522: Origin Connection Time-out
Traceback (most recent call last):
  File "<ipython-input-7-dc8c99107ec1>", line 11, in download_file
    with urllib.request.urlopen(req) as response:
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/urllib/request.py", line 222, in urlopen
    return opener.open(url, data, timeout)
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/urllib/request.py", line 531, in open
    response = meth(req, response)
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/urllib/request.py", line 641, in http_response
    'http', request, response, code, msg, hdrs)
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/urllib/request.py", line 569

  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/urllib/request.py", line 641, in http_response
    'http', request, response, code, msg, hdrs)
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/urllib/request.py", line 569, in error
    return self._call_chain(*args)
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/urllib/request.py", line 503, in _call_chain
    result = func(*args)
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/urllib/request.py", line 649, in http_error_default
    raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 521: Origin Down
Traceback (most recent call last):
  File "<ipython-input-7-dc8c99107ec1>", line 11, in download_file
    with urllib.request.urlopen(req) as response:
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/urllib/request.py", line 222, in urlopen
    return opener.open(url, data, timeout)
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/urllib/request.py", line 531, in open
    




417 / 438 files downloaded and extracted successfully


In [11]:
# Second pass: re-run only the work items whose first download attempt failed.

items = [work_item for work_item, ok in result if not ok]
print(f'Retrying {len(items)} files', file=sys.stderr)
with cf.ProcessPoolExecutor(max_workers=15) as executor:
    outcomes = executor.map(download_file, items)
    result2 = list(tqdm(outcomes, total=len(items)))

num_ok = sum(ok for _, ok in result2)
print('{} / {} files downloaded and extracted successfully'.format(num_ok, len(items)),
      file=sys.stderr)

Retrying 21 files


HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))

  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel

  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
Traceback (most recent call last):
  File "<ipython-input-7-dc8c99107ec1>", line 12, in download_file
    with zipfile.ZipFile(io.BytesIO(response.read())) as zf:
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/http/client.py", line 470, in read
    s = self._safe_read(self.length)
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/http/client.py", line 622, in _safe_read
    raise IncompleteRead(b''.join(s), amt)
http.client.IncompleteRead: IncompleteRead(450580674 bytes read, 420142232 more e

  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/urllib/request.py", line 649, in http_error_default
    raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 522: Origin Connection Time-out
Traceback (most recent call last):
  File "<ipython-input-7-dc8c99107ec1>", line 12, in download_file
    with zipfile.ZipFile(io.BytesIO(response.read())) as zf:
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/http/client.py", line 470, in read
    s = self._safe_read(self.length)
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/http/client.py", line 622, in _safe_read
    raise IncompleteRead(b''.join(s), amt)
http.client.IncompleteRead: IncompleteRead(40685674 bytes read, 830037232 more expected)
Traceback (most recent call last):
  File "<ipython-input-7-dc8c99107ec1>", line 12, in download_file
    with zipfile.ZipFile(io.BytesIO(response.read())) as zf:
  File "/tsi/doctorants/ocifka/envs/nst/lib/python3.7/http/client.py", line 470, in read
    




19 / 21 files downloaded and extracted successfully


In [13]:
# Third pass: re-run the work items that also failed during the second pass.

items = [work_item for work_item, ok in result2 if not ok]
print(f'Retrying {len(items)} files', file=sys.stderr)
with cf.ProcessPoolExecutor(max_workers=15) as executor:
    outcomes = executor.map(download_file, items)
    result3 = list(tqdm(outcomes, total=len(items)))

num_ok = sum(ok for _, ok in result3)
print('{} / {} files downloaded and extracted successfully'.format(num_ok, len(items)),
      file=sys.stderr)

Retrying 2 files


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




2 / 2 files downloaded and extracted successfully


In [None]:
# Now we should have all 438 files