# Download data from Cloud Storage

In [None]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [None]:
import os
import re
import time
from glob import glob
from zipfile import ZipFile

from azure.storage.blob import BlobServiceClient
from prefect import Flow, context as prefect_context, task, unmapped

<a id="table-of-contents"></a>

## [Table of Contents](#table-of-contents)
0. [About](#about)
1. [User Inputs](#user-inputs)
2. [Download combined archive from Blob Storage and Extract raw data](#download-combined-archive-from-blob-storage-and-extract-raw-data)
   - 2.1. [Retrieve Selenium-based Listings and Search Results from Cloud Storage](#retrieve-selenium-based-listings-and-search-results-from-cloud-storage)
   - 2.2. [Retrieve Requests-based Listings and Search Results from Cloud Storage](#retrieve-requests-based-listings-and-search-results-from-cloud-storage)

<a id="about"></a>

## 0. [About](#about)

Download all scraped listings and search results from [Azure blob storage](https://azure.microsoft.com/en-us/services/storage/blobs/).

**Requirements**

The following four environment variables should be exported before running this notebook
- `BLOB_NAME_PREFIX`
  - for each combined archive to be downloaded (listings and search results), a unique number is appended to this value to form the name of the blob
- `AZURE_STORAGE_ACCOUNT`
- `AZURE_STORAGE_KEY`
- `ENDPOINT_SUFFIX`

**Notes**

1. The download-and-extract workflow in this notebook is executed with a workflow management tool ([Prefect](https://docs.prefect.io/)). Using a workflow management tool was optional, but one has been used here.

<a id="user-inputs"></a>

## 1. [User Inputs](#user-inputs)

In [None]:
# Treat the directory the notebook is started from as the project root.
# Run this cell before the os.chdir() cells further down, since those change
# what os.getcwd() returns.
PROJ_ROOT_DIR = os.getcwd()

In [None]:
# Numeric suffix appended to the blob name prefix to form the name of each
# combined archive (listings and search results, scraped with selenium or
# with requests) stored in Azure blob storage
blob_name_suffixes = dict(
    listings=80,
    search_results=81,
    listings_requests=82,
    search_results_requests=83,
)

In [None]:
# Local directory layout: <project root>/data/raw/{requests,selenium}
data_dir = os.path.join(PROJ_ROOT_DIR, "data")
raw_data_dir = os.path.join(data_dir, "raw")
requests_files_dir = os.path.join(raw_data_dir, "requests")
selenium_files_dir = os.path.join(raw_data_dir, "selenium")

# Azure storage connection string, assembled from the exported environment
# variables (a variable that is not set is rendered as the text "None")
conn_str = (
    "DefaultEndpointsProtocol=https;"
    f"AccountName={os.getenv('AZURE_STORAGE_ACCOUNT')};"
    f"AccountKey={os.getenv('AZURE_STORAGE_KEY')};"
    f"EndpointSuffix={os.getenv('ENDPOINT_SUFFIX')}"
)
blob_name_prefix = os.getenv("BLOB_NAME_PREFIX")

# (For downloading previously scraped data from Azure cloud storage)
# Create paths to data/raw/selenium and/or data/raw/requests
# os.makedirs also creates the missing parents (data/ and data/raw/), which
# os.mkdir would fail on, and exist_ok=True makes the call safe to re-run
os.makedirs(selenium_files_dir, exist_ok=True)
os.makedirs(requests_files_dir, exist_ok=True)

In [None]:
@task
def download_az_file_blobs(blob_names_dict, conn_str, az_container_name="myconedesx7"):
    """Download blobs from Azure blob storage unless they already exist locally.

    Parameters
    ----------
    blob_names_dict : dict
        Maps the name of each blob in the container to the local file path
        the blob should be written to.
    conn_str : str
        Connection string used to create the ``BlobServiceClient``.
    az_container_name : str
        Name of the container holding the blobs.

    Returns
    -------
    list
        Local file paths of the blobs downloaded by this call. Blobs whose
        local file already existed are skipped and are not included.

    Raises
    ------
    Exception
        Any error raised while downloading is propagated after the partially
        written temporary file has been removed.
    """
    logger = prefect_context.get("logger")
    downloaded_blobs = []
    blob_service_client = BlobServiceClient.from_connection_string(conn_str=conn_str)
    for az_blob_name, local_file_path in blob_names_dict.items():
        local_filename = os.path.basename(local_file_path)
        if os.path.exists(local_file_path):
            logger.info(
                f"Blob {az_blob_name} found at {local_filename}. Did not download to {local_filename}."
            )
            continue

        blob_client = blob_service_client.get_blob_client(
            container=az_container_name, blob=az_blob_name
        )
        # Download to a temporary file first and rename it only on success.
        # Writing straight to local_file_path would leave an empty or partial
        # file behind after a failed download, and the existence check above
        # would then skip the blob on every later run.
        partial_file_path = f"{local_file_path}.part"
        try:
            with open(partial_file_path, "wb") as download_file:
                # Stream the blob into the file instead of holding the whole
                # archive in memory
                blob_client.download_blob().readinto(download_file)
            os.replace(partial_file_path, local_file_path)
        finally:
            # Only present here if the download or the rename failed
            if os.path.exists(partial_file_path):
                os.remove(partial_file_path)
        downloaded_blobs.append(local_file_path)
        logger.info(
            f"Blob {az_blob_name} not found at {local_filename}. Downloaded to {local_filename}."
        )
    return downloaded_blobs


@task
def unarchive(file_name, data_dir, search_str, flatten_filepaths=True):
    """Extract a zip archive into a directory and list the matching files.

    Parameters
    ----------
    file_name : str or list
        Path to the archive, or (when ``flatten_filepaths`` is true) a
        sequence whose first element is the path. A falsy value, such as an
        empty list, skips the extraction.
    data_dir : str
        Directory the archive contents are extracted into.
    search_str : str
        Glob pattern, relative to ``data_dir``, of the files to return.
    flatten_filepaths : bool
        If true, use ``file_name[0]`` as the archive path.

    Returns
    -------
    list
        Paths in ``data_dir`` matching ``search_str``, whether or not an
        archive was extracted by this call.
    """
    logger = prefect_context.get("logger")
    if not file_name:
        logger.info("Got empty archive name. Did not unarchive")
    else:
        archive_path = file_name[0] if flatten_filepaths else file_name
        with ZipFile(archive_path) as archive:
            archive.extractall(data_dir)
        archive_label = os.path.basename(archive_path)
        target_label = os.path.split(data_dir)[-1]
        logger.info(f"Unarchived contents of {archive_label} to {target_label}")
    return glob(os.path.join(data_dir, search_str))

<a id="download-combined-archive-from-blob-storage-and-extract-raw-data"></a>

## 2. [Download combined archive from Blob Storage and Extract raw data](#download-combined-archive-from-blob-storage-and-extract-raw-data)

<a id="retrieve-selenium-based-listings-and-search-results-from-cloud-storage"></a>

### 2.1. [Retrieve Selenium-based Listings and Search Results from Cloud Storage](#retrieve-selenium-based-listings-and-search-results-from-cloud-storage)

Change into the sub-directory where listings and search results scraped with `selenium` will be downloaded from the cloud

In [None]:
# Make the selenium download directory the working directory and print it
# to confirm the change
os.chdir(selenium_files_dir)
print(f"Current working directory is {os.getcwd()}")

Download the combined archives for batched listings and search results from Azure blob storage and extract the contents

In [None]:
%%time
# Build a flow that, for each file type, (1) downloads the combined archive
# unless a local copy already exists, (2) extracts the batched archives it
# contains and (3) extracts every batched archive through a mapped task
with Flow("Download and Unarchive listings and search results scraped with selenium") as flow:
    for file_type in ["listings", "search_results"]:
        # Blob name = shared prefix + numeric suffix for this file type
        combo_archive = download_az_file_blobs(
            {f"{blob_name_prefix}{blob_name_suffixes[file_type]}": os.path.join(selenium_files_dir, f"combo_batched_{file_type}.zip")},
            conn_str,
        )
        # The download task returns a list of paths, so flatten_filepaths=True
        # makes unarchive use its first element (an empty list skips the
        # extraction and only lists the batched archives already on disk)
        unarchived_archives = unarchive(
            combo_archive, selenium_files_dir, f"batched_{file_type}_*.zip", True
        )
        # One mapped run per batched archive; each mapped input is a single
        # path, hence flatten_filepaths=False
        unarchived_files = unarchive.map(
            unarchived_archives,
            unmapped(selenium_files_dir),
            unmapped(f"{file_type}_*.parquet.gzip"),
            unmapped(False),
        )

# Run the flow and keep the returned state for inspection
state_query = flow.run()

<a id="retrieve-requests-based-listings-and-search-results-from-cloud-storage"></a>

### 2.2. [Retrieve Requests-based Listings and Search Results from Cloud Storage](#retrieve-requests-based-listings-and-search-results-from-cloud-storage)

Change into the sub-directory where listings and search results scraped with `requests` will be downloaded from the cloud

In [None]:
# Make the requests download directory the working directory and print it
# to confirm the change
os.chdir(requests_files_dir)
print(f"Current working directory is {os.getcwd()}")

Download the combined archives for batched listings and search results from Azure blob storage and extract the contents

In [None]:
%%time
# Build a flow that, for each file type, (1) downloads the combined archive
# unless a local copy already exists, (2) extracts the batched archives it
# contains and (3) extracts every batched archive through a mapped task
with Flow("Download and Unarchive listings and search results scraped with requests") as flow:
    for file_type in ["listings", "search_results"]:
        # Blob name = shared prefix + numeric suffix stored under the
        # "<file_type>_requests" key
        combo_archive = download_az_file_blobs(
            {f"{blob_name_prefix}{blob_name_suffixes[file_type+'_requests']}": os.path.join(requests_files_dir, f"combo_batched_{file_type}.zip")},
            conn_str,
        )
        # The download task returns a list of paths, so flatten_filepaths=True
        # makes unarchive use its first element (an empty list skips the
        # extraction and only lists the batched archives already on disk)
        unarchived_archives = unarchive(
            combo_archive, requests_files_dir, f"batched_{file_type}_*.zip", True
        )
        # One mapped run per batched archive; each mapped input is a single
        # path, hence flatten_filepaths=False
        unarchived_files = unarchive.map(
            unarchived_archives,
            unmapped(requests_files_dir),
            unmapped(f"{file_type}_*.parquet.gzip"),
            unmapped(False),
        )

# Run the flow and keep the returned state for inspection
state_query = flow.run()

---

<span style="float:left">
    <a href="./8_upload_cloud.ipynb"><< 8 - Upload all scraped data (with requests and selenium) to cloud storage</a>
</span>

<span style="float:right">
    <a href="./10_selenium_to_db.ipynb">10 - Clean and merge selenium search results and listings and append to MySQL database>></a>
</span>