## Xview Dataset Download 

This component downloads a labeled overhead-imagery dataset (xView) to a specified location. It drives a Chrome browser through Selenium, so a local chromedriver executable must be provided.


In [None]:
# Install Selenium (imported below as `selenium.webdriver`) into the notebook's
# environment. The leading "!" runs the line as a shell command from the notebook.
!pip install selenium

In [None]:

import os
import shutil
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from urllib.parse import urlparse


In [None]:

# Username used to log in to the xView challenge website.
# NOTE(review): environment variable names are case-insensitive on Windows, so
# 'username' may resolve to the built-in USERNAME (OS account name) there - confirm.
username = os.environ.get('username')

# Password used to log in to the xView challenge website.
password = os.environ.get('password')

# Directory where the downloaded dataset should be saved.
move_to_dir = os.environ.get('move_to_dir')

# Path to the local copy of the chromedriver executable.
chromedriver_path = os.environ.get('chromedriver_path')

# Maximum time in seconds to wait for the download before giving up; must be
# adjusted according to the file size and internet speed.
# Environment values are always strings, so convert to a number here: the
# download loop compares this value with an elapsed time in seconds.
# Falls back to one hour when the variable is not set.
max_download_time = float(os.environ.get('max_download_time', '3600'))

# The label of the file to download.
# Choose from "TI.zip", "TL.zip", "VI.zip", "TI.tgz", "TL.tgz", "VI.tgz",
# where TI = Training Images, TL = Training Labels, VI = Validation Images.
label = os.environ.get('label')


In [None]:

def login_and_download(username, password, move_to_dir, chromedriver_path, max_download_time, label):
    """Log in to the xView challenge website and download one dataset archive.

    A Chrome browser is driven through Selenium: it logs in with the given
    credentials, opens the download page, follows the link that matches
    ``label`` and then polls ``move_to_dir`` until the file shows up or
    ``max_download_time`` seconds have passed.

    Args:
        username: Value typed into the login form's ``email`` field.
        password: Value typed into the login form's ``password`` field.
        move_to_dir: Directory Chrome is configured to download into. It is
            created if it does not exist.
        chromedriver_path: Path to the chromedriver executable.
        max_download_time: Maximum number of seconds to wait for the download
            (a number or a numeric string).
        label: Which archive to download: one of "TI.zip", "TL.zip", "VI.zip",
            "TI.tgz", "TL.tgz", "VI.tgz".

    Returns:
        None. Progress and failures of the download step are reported with
        ``print``.

    Raises:
        ValueError: If ``label`` is not a supported option or
            ``max_download_time`` is not numeric. Both are checked before the
            browser is started.
    """
    # Link text shown on the download page for every supported label.
    link_texts = {
        "TI.zip": "Download Training Images (zip)",
        "TL.zip": "Download Training Labels (zip)",
        "VI.zip": "Download Validation Images (zip)",
        "TI.tgz": "Download Training Images (tgz)",
        "TL.tgz": "Download Training Labels (tgz)",
        "VI.tgz": "Download Validation Images (tgz)",
    }

    # Validate the arguments before launching a browser, so that a bad value
    # fails fast and never leaves a Chrome process running.
    link_text = link_texts.get(label) if isinstance(label, str) else None
    if link_text is None:
        raise ValueError("Error: This is an invalid download option")
    search_text = f'//a[contains(text(), "{link_text}")]'

    # The timeout usually comes from an environment variable (a string), but
    # it is compared with an elapsed time below, so it has to be a number.
    try:
        timeout_seconds = float(max_download_time)
    except (TypeError, ValueError) as err:
        raise ValueError("Error: max_download_time must be a number of seconds") from err

    # Make sure the target directory exists before the download starts.
    os.makedirs(move_to_dir, exist_ok=True)

    # Imported locally so this function is self-contained; the Service object
    # is how the chromedriver path is passed to Selenium 4.
    from selenium.webdriver.chrome.service import Service

    # Set Chrome options to automatically download files to the specified directory
    options = webdriver.ChromeOptions()
    options.add_experimental_option("prefs", {
        "download.default_directory": move_to_dir,
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "safebrowsing.enabled": True,
    })

    # Start a new instance of Chrome web browser
    driver = webdriver.Chrome(service=Service(executable_path=chromedriver_path), options=options)

    # Everything that uses the browser runs inside try/finally so the browser
    # is always closed, even when a wait times out or an element is missing.
    try:
        # Open the login page
        driver.get(r'https://challenge.xviewdataset.org/login')

        # Find the username and password fields and enter credentials
        wait = WebDriverWait(driver, 10)
        username_field = wait.until(EC.presence_of_element_located((By.NAME, 'email')))
        password_field = wait.until(EC.presence_of_element_located((By.NAME, 'password')))
        username_field.send_keys(username)
        password_field.send_keys(password)

        # Find and click the login button. The button carries both the "btn"
        # and "primary" classes, which is expressed as an explicit CSS selector.
        login_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.btn.primary')))
        login_button.click()

        # Give the page a moment to process the login
        time.sleep(1)

        # Open the Download page
        driver.get(r'https://challenge.xviewdataset.org/download-links')

        # Wait for the automatic pop-up overlay to be present ...
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'overlay--active')))

        # ... and dismiss it by clicking on the page body
        driver.find_element(By.TAG_NAME, 'body').click()
        time.sleep(1)

        # Wait for the download link to be present
        download_link_element = WebDriverWait(driver, 100).until(
            EC.presence_of_element_located((By.XPATH, search_text))
        )

        # Get the dynamic download link from the href attribute
        download_link = download_link_element.get_attribute('href')
        if not download_link:
            print("Failed to get the download link.")
            return

        # Download the dataset using the obtained link
        driver.get(download_link)
        print("Dataset download started successfully.")

        # Extract the filename from the download link URL
        filename = urlparse(download_link).path.split('/')[-1]
        downloaded_file = os.path.join(move_to_dir, filename)
        print(downloaded_file)

        # Wait for the file to appear under its final name.
        # NOTE(review): this relies on the browser only creating the final
        # file name once the download has finished, and on no older file with
        # the same name already being present in move_to_dir - confirm.
        start_time = time.monotonic()
        while True:
            if os.path.exists(downloaded_file) and os.path.getsize(downloaded_file) > 0:
                print("File downloaded successfully.")
                break
            if time.monotonic() - start_time > timeout_seconds:
                print("Error: Maximum wait time exceeded.")
                break
            time.sleep(5)
    finally:
        # Close the browser
        driver.quit()


In [None]:
# Run the download with the settings read from the environment variables.
login_and_download(
    username=username,
    password=password,
    move_to_dir=move_to_dir,
    chromedriver_path=chromedriver_path,
    max_download_time=max_download_time,
    label=label,
)