In [1]:
import csv
import getpass
import hashlib
import io
import os
from pathlib import Path
import requests
import sys
import shlex
import traceback

import pandas as pd
import regex as re
from slugify import slugify
from tqdm.auto import tqdm

In [None]:
# Ask for the Movebank credentials interactively rather than hard-coding them.
username = input("Movebank username: ")
password = getpass.getpass("Movebank password: ")

# Command prefix reused by the `!curl` cells; the "user:password" pair is
# shell-quoted so it is passed to curl as a single argument.
# NOTE(review): the password is still placed on curl's command line -- confirm
# that this is acceptable on the machine running the notebook.
curl_cmd = f"curl -u {shlex.quote(username + ':' + password)}"

In [3]:
!{curl_cmd} -o studies.csv 'https://www.movebank.org/movebank/service/direct-read?entity_type=study'

studies_all = pd.read_csv("studies.csv", dtype=str)
studies_all.sort_values(by="id", key=pd.to_numeric, inplace=True)
studies_all.to_csv("studies.csv", index=False)

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 3491k    0 3491k    0     0  2725k      0 --:--:--  0:00:01 --:--:-- 2723k


In [4]:
!{curl_cmd} -o sensor_types.csv 'https://www.movebank.org/movebank/service/direct-read?entity_type=tag_type'

sensor_types = pd.read_csv("sensor_types.csv")
sensor_type_map = dict(zip(sensor_types["id"], sensor_types["external_id"]))
location_sensor_type_ids = set(sensor_types[sensor_types["is_location_sensor"]]["id"])

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   831  100   831    0     0   2533      0 --:--:-- --:--:-- --:--:--  2541


In [5]:
# Keep the studies this account may download and in which no data is hidden
# from it (both flags are the strings "true"/"false" because the table was
# read with dtype=str).
has_download_access = studies_all["i_have_download_access"].eq("true")
fully_visible = studies_all["there_are_data_which_i_cannot_see"].ne("true")

studies_downloadable = studies_all.loc[has_download_access & fully_visible]
studies_downloadable.to_csv("studies_downloadable.csv", index=False)
len(studies_downloadable)

603

In [6]:
# Drop the studies whose license type is "CUSTOM"; everything else is kept.
has_custom_license = studies_downloadable["license_type"].eq("CUSTOM")

studies_permissive = studies_downloadable.loc[~has_custom_license]
studies_permissive.to_csv("studies_permissive.csv", index=False)
len(studies_permissive)

476

In [7]:
# List the distinct license types that remain after filtering.
sorted({*studies_permissive["license_type"]})

['CC_0', 'CC_BY', 'CC_BY_NC']

In [8]:
def get_study_dir_name(study):
    """Build the directory name used to store a study's files.

    The name is "<id>-<slug>", where the slug is derived from the principal
    investigator's name:

    * if the name contains "(", the slug is the slugified text before the
      first "(";
    * otherwise the name is cut at the first run of characters that are not
      word characters, whitespace or ".", the remainder is slugified, and only
      the last hyphen-separated token is kept (presumably the surname --
      TODO confirm).

    When the name is missing, or no usable slug can be derived from it, the
    bare study id is returned so the directory name never ends in a dangling
    hyphen.

    Args:
        study: Mapping (e.g. a DataFrame row) providing "id" and
            "principal_investigator_name".

    Returns:
        The directory name as a string.
    """
    study_id = str(study["id"])
    pi_name = study["principal_investigator_name"]
    if pd.isna(pi_name):
        return study_id

    if "(" in pi_name:
        slug = slugify(pi_name.split("(")[0])
    else:
        slug = slugify(re.split(r"[^\w\s.]+", pi_name)[0]).split("-")[-1]

    # A name such as "(group)" or one starting with punctuation yields an
    # empty slug; fall back to the id alone, like the missing-name case.
    if not slug:
        return study_id
    return "{}-{}".format(study_id, slug)

In [9]:
def get_data(cookies=None, stream=False, headers=None, **params):
    """Send an authenticated GET request to the Movebank direct-read service.

    Args:
        cookies: Cookies to send with the request, if any.
        stream: Passed through to ``requests.get``.
        headers: Extra HTTP headers, if any.
        **params: Sent as the query-string parameters of the request
            (e.g. ``entity_type``, ``study_id``).

    Returns:
        The ``requests`` response object; its status is not checked here.
    """
    endpoint = 'https://www.movebank.org/movebank/service/direct-read'
    credentials = (username, password)
    return requests.get(
        endpoint,
        params=params,
        auth=credentials,
        cookies=cookies,
        stream=stream,
        headers=headers,
    )

In [10]:
def validate_response(response):
    """Check that a response is a successful CSV payload.

    The checks are written as explicit raises rather than ``assert``
    statements so that they still run when Python is started with ``-O`` and
    so that a failure explains what went wrong.

    Args:
        response: Response object exposing ``raise_for_status()`` and a
            ``headers`` mapping.

    Raises:
        AssertionError: If the response carries ``accept-license: true`` or
            its content type is not exactly ``text/csv`` (including when the
            header is absent).
        Exception: Whatever ``response.raise_for_status()`` raises for an
            unsuccessful HTTP status.
    """
    response.raise_for_status()

    headers = response.headers
    if headers.get("accept-license") == "true":
        raise AssertionError(
            "Response asks for license acceptance (accept-license: true) instead of returning data")

    content_type = headers.get("content-type")
    if content_type != "text/csv":
        raise AssertionError(f"Expected content-type 'text/csv', got {content_type!r}")

In [11]:
def save_data(study_dir, entity_type, response):
    """Store a metadata response as ``<entity_type>.csv`` inside ``study_dir``.

    A response whose status equals ``requests.codes.server_error`` is not
    treated as fatal: a message is logged, the body is kept as
    ``<entity_type>.error.html`` and no CSV is written. Any other response
    must pass ``validate_response``; its body is then written to a temporary
    file which is renamed over the final path.

    Args:
        study_dir: Directory (a ``Path``) of the study.
        entity_type: Name of the entity; used as the file stem.
        response: Response returned by the direct-read service.
    """
    # Server error: keep the body for inspection and give up on this entity
    if response.status_code == requests.codes.server_error:
        tqdm.write(f"Server error for {study_dir}, {entity_type}")
        (study_dir / f"{entity_type}.error.html").write_bytes(response.content)
        return None

    validate_response(response)

    # Write to a temporary file first so the final file only ever appears complete
    target = study_dir / f"{entity_type}.csv"
    staging = target.with_suffix(".csv.tmp")
    staging.write_bytes(response.content)
    staging.replace(target)
    return None

In [None]:
# Metadata tables fetched for every study, mapped to the name of the query
# parameter that carries the study id for that table.
_METADATA_ENTITIES = {
    "tag": "study_id",
    "individual": "study_id",
    "deployment": "study_id",
    "sensor": "tag_study_id",
}

# Event attributes requested for every location sensor.
_EVENT_ATTRIBUTES = "timestamp,location_lat,location_long,individual_id,deployment_id,tag_id,visible"


def _accept_license(study_dir, study_id):
    """Accept the study's license if required and return the extra request arguments.

    Returns an empty dict when the service does not ask for license
    acceptance; otherwise saves the license text as ``license.html`` and
    returns the cookies and license hash to pass to later ``get_data`` calls.
    """
    response = get_data(entity_type="tag", study_id=study_id)
    if response.headers.get("accept-license") != "true":
        return {}

    with open(study_dir / "license.html", "wb") as f:
        f.write(response.content)
    license_hash = hashlib.md5(response.content).hexdigest()
    params = {"cookies": response.cookies, "license-md5": license_hash}

    response = get_data(entity_type="tag", study_id=study_id, **params)
    if response.status_code != requests.codes.ok:
        raise RuntimeError(f"Failed to accept license for {study_dir}: error {response.status_code}")
    return params


def _download_locations(study_dir, study_id, params):
    """Stream the events of all location sensors of a study into ``location.csv``."""
    path = study_dir / "location.csv"
    tmp_path = path.with_suffix(".csv.tmp")
    sensors = pd.read_csv(study_dir / "sensor.csv")
    writer = None
    # The rows are decoded as UTF-8 below, so write them back as UTF-8 instead
    # of relying on the platform's default encoding.
    with open(tmp_path, "w", newline="", encoding="utf-8") as f:
        for sensor_type_id in tqdm(set(sensors["sensor_type_id"]) & location_sensor_type_ids, leave=False):
            response = get_data(
                entity_type="event", study_id=study_id, sensor_type_id=sensor_type_id,
                attributes=_EVENT_ATTRIBUTES, stream=True, **params)
            try:
                validate_response(response)

                # Read the response row by row and write it to the file
                lines = (line.decode("utf-8") for line in response.iter_lines())
                reader = csv.DictReader(lines)
                if reader.fieldnames is None:
                    # Completely empty body: there is no header and no rows to copy
                    continue
                if writer is None:
                    writer = csv.DictWriter(f, fieldnames=reader.fieldnames + ["sensor_type"])
                    writer.writeheader()
                sensor_type = sensor_type_map[sensor_type_id]
                for row in tqdm(reader, leave=False):
                    row["sensor_type"] = sensor_type
                    writer.writerow(row)
            finally:
                # Release the streamed connection even if validation or parsing failed
                response.close()
    # Only expose the file under its final name once it is complete
    tmp_path.replace(path)


pbar = tqdm(studies_permissive.iterrows(), total=len(studies_permissive))
for _, study in pbar:
    study_id = int(study["id"])
    study_dir = Path("studies") / get_study_dir_name(study)
    os.makedirs(study_dir, exist_ok=True)
    pbar.set_postfix_str(study_dir)

    need_metadata = not all((study_dir / f"{e}.csv").exists() for e in _METADATA_ENTITIES)
    need_locations = not (study_dir / "location.csv").exists()
    if not (need_metadata or need_locations):
        continue

    # Always negotiate the license for *this* study before downloading anything,
    # so a resumed run (metadata present, locations missing) never reuses the
    # previous study's cookies or hits an undefined `params`.
    params = _accept_license(study_dir, study_id)

    # Download metadata
    if need_metadata:
        for entity_type, id_param in _METADATA_ENTITIES.items():
            save_data(study_dir, entity_type,
                      get_data(entity_type=entity_type, **{id_param: study_id}, **params))

    # Get data from all location sensors and save it
    if need_locations:
        if not (study_dir / "sensor.csv").exists():
            # save_data skips the CSV on a server error; without the sensor
            # table there is nothing to iterate over, so move on.
            tqdm.write(f"Skipping location data for {study_dir}: sensor.csv is missing")
            continue
        _download_locations(study_dir, study_id, params)

In [13]:
# Report the total disk usage of the `studies` directory in human-readable form.
!du -sh studies

20G	studies
