# Upload data to Cloud Storage

In [1]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [2]:
import os
import re
import time
from glob import glob
from zipfile import ZipFile

from azure.storage.blob import BlobServiceClient
from prefect import Flow, context as prefect_context, task, unmapped

<a id="table-of-contents"></a>

## [Table of Contents](#table-of-contents)
0. [About](#about)
1. [User Inputs](#user-inputs)
2. [Create archives from scraped files and Upload to Blob Storage](#create-archives-from-scraped-files-and-upload-to-blob-storage)
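   - 2.1. [Selenium Listings](#selenium-listings)
   - 2.2. [Requests Listings](#requests-listings)
   - 2.3. [Selenium Search Results](#selenium-search-results)
   - 2.4. [Requests Search Results](#requests-search-results)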

<a id="about"></a>

## 0. [About](#about)

All scraped single-row CSV files of listings and up to 25-row CSV files of search results are combined into batches of a desired size and uploaded to [Azure blob storage](https://azure.microsoft.com/en-us/services/storage/blobs/).

A batch of size 12 for the search results means that every 12 pages (each page has up to 25 rows of search results) of scraped files will be archived together. A batch of size 1 for the listings means that every single-row scraped listings file will be archived on its own.

**Requirements**

The following four environment variables should be exported before running this notebook
- `BLOB_NAME_PREFIX`
  - for each combined archive to be uploaded (listings and search results), a unique number will be appended to this value
- `AZURE_STORAGE_ACCOUNT`
- `AZURE_STORAGE_KEY`
- `ENDPOINT_SUFFIX`

**Notes**
1. This should be done for the raw scraped data only. In this way, subsequent runs of the exploratory data analysis phase of this project can use the following workflow that does not require web-scraping to access any previously downloaded data
   - download this previously scraped data from cloud storage (`9_*.ipynb`)
   - cleaning/merging using `6_merge_searches_listings.ipynb`
   - exploring the data with `7_eda.ipynb`

   In this way, the notebooks responsible for web-scraping (`0_*.ipynb` to `5_*.ipynb`) will not be run.
2. The workflow used here is executed using a workflow management tool. This was optional, but has been used here.

<a id="user-inputs"></a>

## 1. [User Inputs](#user-inputs)

In [3]:
# The working directory at the time this cell runs is used as the project root.
# NOTE(review): later cells call os.chdir() into the data sub-directories, so
# re-running this cell after them would capture the wrong directory.
PROJ_ROOT_DIR = os.getcwd()

In [4]:
# Exclusive upper bound of the Selenium page range: the batch lists built in
# the next cell use this value as a ``range`` stop, so with 50 + 1 the last
# page considered is page 50
max_page_num_to_check = 50 + 1

# Page range of the files scraped with requests: first page (inclusive),
# followed by the ``range`` stops (exclusive) used for the listings and for
# the search results
min_page_num_to_check_requests = 50
max_page_num_to_check_requests = 525
max_page_num_to_check_sr_requests = 2235 + 1

# Number of search-results pages whose listings files go into one batch
# (Selenium-scraped and requests-scraped listings respectively)
page_batch_size = 5
page_batch_size_requests = 25

# Number of search-results pages that go into one batch
# (Selenium-scraped and requests-scraped search results respectively)
page_batch_size_sr = 10
page_batch_size_sr_requests = 25

# Suffixes appended to the BLOB_NAME_PREFIX environment variable to build the
# name of the blob that each combined archive (listings and search results,
# per scraping tool) is uploaded to in Azure blob storage
blob_name_suffixes = {
    "listings": 80,
    "search_results": 81,
    "listings_requests": 82,
    "search_results_requests": 83,
}

In [5]:
def _batch_end_pages(first_page, stop_page, batch_size):
    """Return the last page number of each consecutive batch of pages.

    A new batch starts at ``first_page`` and then every ``batch_size`` pages,
    for as long as the first page of the batch is below ``stop_page``.
    """
    return [
        batch_start + batch_size - 1
        for batch_start in range(first_page, stop_page, batch_size)
    ]


# Local directories that hold the raw scraped files, one per scraping tool
data_dir = os.path.join(PROJ_ROOT_DIR, "data")
raw_data_dir = os.path.join(data_dir, "raw")
selenium_files_dir = os.path.join(raw_data_dir, "selenium")
requests_files_dir = os.path.join(raw_data_dir, "requests")

# Azure settings are read from the environment (see the Requirements section)
blob_name_prefix = os.getenv("BLOB_NAME_PREFIX")
conn_str = (
    "DefaultEndpointsProtocol=https;"
    f"AccountName={os.getenv('AZURE_STORAGE_ACCOUNT')};"
    f"AccountKey={os.getenv('AZURE_STORAGE_KEY')};"
    f"EndpointSuffix={os.getenv('ENDPOINT_SUFFIX')}"
)

# Each list below holds the LAST page number of every batch. The first batch
# starts at the first page given and later batches follow back-to-back.

# Selenium - LISTINGS
# - batches of `page_batch_size` pages (25 listings per page), starting at
#   page 1: the batches end at pages 5, 10, 15, 20, etc.
d = _batch_end_pages(1, max_page_num_to_check, page_batch_size)
print(d)

# Selenium - SEARCH RESULTS
# - batches of `page_batch_size_sr` pages, starting at page 1: the first batch
#   covers pages 1 to 10, the second batch covers pages 11 to 20, etc.
d_sr = _batch_end_pages(1, max_page_num_to_check, page_batch_size_sr)
print(d_sr)

# Requests - LISTINGS
# - batches of `page_batch_size_requests` pages, starting at page
#   `min_page_num_to_check_requests`: the batches end at pages 74, 99, 124, etc.
d_requests = _batch_end_pages(
    min_page_num_to_check_requests,
    max_page_num_to_check_requests,
    page_batch_size_requests,
)
print(d_requests)

# Requests - SEARCH RESULTS
# - batches of `page_batch_size_sr_requests` pages, starting at page
#   `min_page_num_to_check_requests`: the first batch covers pages 50 to
#   (50 + 25) - 1 = 74, the second batch covers pages 75 to 99, etc.
d_sr_requests = _batch_end_pages(
    min_page_num_to_check_requests,
    max_page_num_to_check_sr_requests,
    page_batch_size_sr_requests,
)
print(d_sr_requests)

[5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
[10, 20, 30, 40, 50]
[74, 99, 124, 149, 174, 199, 224, 249, 274, 299, 324, 349, 374, 399, 424, 449, 474, 499, 524]
[74, 99, 124, 149, 174, 199, 224, 249, 274, 299, 324, 349, 374, 399, 424, 449, 474, 499, 524, 549, 574, 599, 624, 649, 674, 699, 724, 749, 774, 799, 824, 849, 874, 899, 924, 949, 974, 999, 1024, 1049, 1074, 1099, 1124, 1149, 1174, 1199, 1224, 1249, 1274, 1299, 1324, 1349, 1374, 1399, 1424, 1449, 1474, 1499, 1524, 1549, 1574, 1599, 1624, 1649, 1674, 1699, 1724, 1749, 1774, 1799, 1824, 1849, 1874, 1899, 1924, 1949, 1974, 1999, 2024, 2049, 2074, 2099, 2124, 2149, 2174, 2199, 2224, 2249]


In [8]:
@task
def get_files_single_page(
    page_num_max,
    search_results_file_list,
    prefix="p",
    file_type="listings",
    step_size=100,
):
    """Select the scraped files that belong to one batch of pages.

    A batch covers the inclusive page range
    ``[page_num_max - step_size + 1, page_num_max]``. The page number of a
    file is the run of digits found between ``prefix`` and the following
    underscore in its path.

    Parameters
    ----------
    page_num_max : int
        Last page number (inclusive) of the batch.
    search_results_file_list : list of str
        Paths of the candidate files.
    prefix : str
        Text that immediately precedes the page number in a file path. It is
        interpolated into a regular expression as-is.
    file_type : str
        Label embedded in the name of the archive for this batch.
    step_size : int
        Number of pages per batch.

    Returns
    -------
    dict
        ``{archive_filename: [matching file paths]}``. The list is empty when
        no file falls in the batch, or when a file named
        ``archive_filename`` already exists.
    """
    logger = prefect_context.get("logger")
    page_num_min = page_num_max - (step_size - 1)
    matching_filepaths = []
    unparsable_filepaths = []
    fname = (
        f"batched_{file_type}__{page_num_min}_"
        f"{page_num_max}__{time.strftime('%Y%m%d_%H%M%S')}.zip"
    )
    if not os.path.exists(fname):
        # Compiled once, since the pattern is the same for every file
        page_num_pattern = re.compile(fr"{prefix}(\d+)_")
        for f in search_results_file_list:
            page_num_match = page_num_pattern.search(f)
            if page_num_match is None:
                # Without a page number the file cannot be assigned to a
                # batch; skip it instead of failing on ``None.group``
                unparsable_filepaths.append(f)
                continue
            # The capture group holds the digits only, whatever the prefix is
            if page_num_min <= int(page_num_match.group(1)) <= page_num_max:
                matching_filepaths.append(f)
    if unparsable_filepaths:
        logger.warning(
            f"Skipped {len(unparsable_filepaths)} file(s) without a page number "
            f"after prefix '{prefix}'"
        )
    logger.info(
        f"Page start = {page_num_max - (step_size - 1)}, "
        f"Page stop = {page_num_max}, "
        f"Files Found = {len(matching_filepaths)}"
    )
    return {fname: matching_filepaths}


@task
def create_archive(files_dict):
    """Write one zip archive per entry of ``files_dict``.

    Parameters
    ----------
    files_dict : dict
        Maps the file name of an archive to the list of file paths to be
        written into it.

    Returns
    -------
    list
        File names of the archives that were written. Entries with an empty
        list of file paths are logged and do not produce an archive.
    """
    logger = prefect_context.get("logger")
    archives_created = []
    for archive_fname, matching_filepaths in files_dict.items():
        # Nothing to archive for this entry: report it and move on
        if len(matching_filepaths) == 0:
            logger.info(
                f"Did not find any filepaths for {archive_fname}. "
                "Will not create archive."
            )
            continue
        with ZipFile(archive_fname, "w") as zip_file:
            for member_path in matching_filepaths:
                zip_file.write(member_path)
        archives_created.append(archive_fname)
        logger.info(f"Creating archive {archive_fname}")
    return archives_created


@task
def upload_az_file_blobs(
    blob_names_dict, conn_str, flatten_filepaths=True, az_container_name="myconedesx7"
):
    """Upload local files to Azure blob storage, skipping blobs that exist.

    Parameters
    ----------
    blob_names_dict : dict
        Maps a blob name to the local file to upload. With
        ``flatten_filepaths=True`` every value is a list of local file paths
        and the last path of the list is the one uploaded; with
        ``flatten_filepaths=False`` every value is a single local file path.
    conn_str : str
        Connection string of the Azure storage account.
    flatten_filepaths : bool
        Whether the values of ``blob_names_dict`` are lists of paths (see
        above).
    az_container_name : str
        Name of the container that the blobs are uploaded to.

    Returns
    -------
    None
    """
    logger = prefect_context.get("logger")
    if flatten_filepaths:
        flattened = {}
        for blob_name, local_paths in blob_names_dict.items():
            local_paths = list(local_paths)
            if not local_paths:
                # e.g. the upstream task did not create any archive
                logger.warning(
                    f"No local file was given for blob {blob_name}. "
                    "Nothing will be uploaded."
                )
                continue
            if len(local_paths) > 1:
                # One blob holds one file: the last path wins
                logger.warning(
                    f"{len(local_paths)} local files were given for blob "
                    f"{blob_name}. Only {local_paths[-1]} will be uploaded."
                )
            flattened[blob_name] = local_paths[-1]
        blob_names_dict = flattened
    blob_service_client = BlobServiceClient.from_connection_string(conn_str=conn_str)
    container_client = blob_service_client.get_container_client(az_container_name)
    for az_blob_name, local_file_path in blob_names_dict.items():
        # `name_starts_with` is a prefix filter, so it also returns blobs whose
        # name merely begins with `az_blob_name`; only an exact match means
        # that this blob is already present
        blob_already_exists = any(
            existing_blob.name == az_blob_name
            for existing_blob in container_client.list_blobs(
                name_starts_with=az_blob_name
            )
        )
        if not blob_already_exists:
            blob_client = blob_service_client.get_blob_client(
                container=az_container_name, blob=az_blob_name
            )
            with open(local_file_path, "rb") as data:
                blob_client.upload_blob(data)
            logger.info(f"Blob {az_blob_name} not found. Uploaded {local_file_path}.")
        else:
            logger.info(f"Blob {az_blob_name} found. Did not upload {local_file_path}.")

<a id="create-archives-from-scraped-files-and-upload-to-blob-storage"></a>

## 2. [Create archives from scraped files and Upload to Blob Storage](#create-archives-from-scraped-files-and-upload-to-blob-storage)

<a id="selenium-listings"></a>

### 2.1. [Selenium Listings](#selenium-listings)

Change into the sub-directory containing the downloaded single-row CSV files with the scraped contents of each listing scraped with Selenium

In [9]:
# The relative glob patterns and archive file names used in the next cells
# are resolved against this directory
os.chdir(selenium_files_dir)
print(f"Current working directory is {os.getcwd()}")

Current working directory is /home/edesz/Downloads/web-scraping/data/raw/selenium


Create an archive of batches of listings files, where the size of each batch is the difference between successive elements in the list `d` defined earlier

In [10]:
with Flow("Archive Listings Scraped with Selenium") as flow:
    # One mapped task run per batch (one per last-page number in `d`); the
    # remaining arguments are the same for every run, hence `unmapped`
    files = get_files_single_page.map(
        d,
        unmapped(glob("*.csv")),
        unmapped("p"),
        unmapped("listings"),
        # The configured batch size equals `d[1] - d[0]`, but unlike that
        # expression it does not raise IndexError when `d` holds one batch
        unmapped(page_batch_size),
    )
    # One archive per batch; batches without any files are skipped by the task
    _ = create_archive.map(files)

state = flow.run()

[2021-10-26 23:28:13-0400] INFO - prefect.FlowRunner | Beginning Flow run for 'Archive Listings Scraped with Selenium'
[2021-10-26 23:28:13-0400] INFO - prefect.TaskRunner | Task 'get_files_single_page': Starting task run...
[2021-10-26 23:28:13-0400] INFO - prefect.TaskRunner | Task 'get_files_single_page': Finished task run for task with final state: 'Mapped'
[2021-10-26 23:28:13-0400] INFO - prefect.TaskRunner | Task 'get_files_single_page[0]': Starting task run...
[2021-10-26 23:28:13-0400] INFO - prefect.get_files_single_page[0] | Page start = 1, Page stop = 5, Files Found = 93
[2021-10-26 23:28:13-0400] INFO - prefect.TaskRunner | Task 'get_files_single_page[0]': Finished task run for task with final state: 'Success'
[2021-10-26 23:28:13-0400] INFO - prefect.TaskRunner | Task 'get_files_single_page[1]': Starting task run...
[2021-10-26 23:28:13-0400] INFO - prefect.get_files_single_page[1] | Page start = 6, Page stop = 10, Files Found = 107
[2021-10-26 23:28:13-0400] INFO - prefe

Create a single archive with the combination of all batched listings archives created above and upload to Azure blob storage

In [11]:
# Plain-Python inputs are prepared up front; only the task calls have to be
# made inside the Flow context
combined_archive_fname = (
    f"combo_batched_listings__{time.strftime('%Y%m%d_%H%M%S')}.zip"
)
batched_archive_paths = glob("batched_listings__*.zip")
combo_blob_name = f"{blob_name_prefix}{blob_name_suffixes['listings']}"

with Flow("Combine and Upload Selenium-Based Listings Archives") as flow:
    # Bundle every per-batch archive found in the current directory
    combo_archive = create_archive({combined_archive_fname: batched_archive_paths})
    # `combo_archive` is a list of archive names, so it has to be flattened
    upload_az_file_blobs(
        {combo_blob_name: combo_archive},
        conn_str,
        flatten_filepaths=True,
    )

state_combo = flow.run()
# The archive task's output can be inspected with
# state_combo.result[combo_archive].result

[2021-10-26 23:28:22-0400] INFO - prefect.FlowRunner | Beginning Flow run for 'Combine and Upload Selenium-Based Listings Archives'
[2021-10-26 23:28:22-0400] INFO - prefect.TaskRunner | Task 'create_archive': Starting task run...
[2021-10-26 23:28:22-0400] INFO - prefect.create_archive | Creating archive combo_batched_listings__20211026_232822.zip
[2021-10-26 23:28:22-0400] INFO - prefect.TaskRunner | Task 'create_archive': Finished task run for task with final state: 'Success'
[2021-10-26 23:28:22-0400] INFO - prefect.TaskRunner | Task 'List': Starting task run...
[2021-10-26 23:28:22-0400] INFO - prefect.TaskRunner | Task 'List': Finished task run for task with final state: 'Success'
[2021-10-26 23:28:22-0400] INFO - prefect.TaskRunner | Task 'Dict': Starting task run...
[2021-10-26 23:28:22-0400] INFO - prefect.TaskRunner | Task 'Dict': Finished task run for task with final state: 'Success'
[2021-10-26 23:28:22-0400] INFO - prefect.TaskRunner | Task 'upload_az_file_blobs': Starting

**Note**
1. It seems like this has to be called after the above `Flow`, since Prefect can't handle re-using the same task (here, this would be `create_archive()`) with different inputs ([link](https://github.com/PrefectHQ/prefect/issues/4603)) while also having the second run of the task dependent on the output of the first run.
2. An attempt at capturing the output of the first run of the task and feeding it into the second run (thereby forcing the second run to be dependent on the first, by replacing `glob("batched_listings__*.zip")`) did not work. Future work should further investigate this as a workaround for the above.

<a id="requests-listings"></a>

### 2.2. [Requests Listings](#requests-listings)

Change into the sub-directory containing the downloaded single-row CSV files with the scraped contents of each listing scraped with `requests`

In [12]:
# The relative glob patterns and archive file names used in the next cells
# are resolved against this directory
os.chdir(requests_files_dir)
print(f"Current working directory is {os.getcwd()}")

Current working directory is /home/edesz/Downloads/web-scraping/data/raw/requests


Create an archive of batches of listings files, where the size of each batch is the difference between successive elements in the list `d_requests` defined earlier

In [13]:
with Flow("Archive Listings Scraped with Requests") as flow:
    files = get_files_single_page.map(
        d_requests,
        unmapped(glob("p*.csv")),
        unmapped("p"),
        unmapped("listings"),
        unmapped(d_requests[1] - d_requests[0]),
    )
    _ = create_archive.map(files)

state = flow.run()

[2021-10-26 23:28:35-0400] INFO - prefect.FlowRunner | Beginning Flow run for 'Archive Listings Scraped with Requests'
[2021-10-26 23:28:35-0400] INFO - prefect.TaskRunner | Task 'get_files_single_page': Starting task run...
[2021-10-26 23:28:35-0400] INFO - prefect.TaskRunner | Task 'get_files_single_page': Finished task run for task with final state: 'Mapped'
[2021-10-26 23:28:35-0400] INFO - prefect.TaskRunner | Task 'get_files_single_page[0]': Starting task run...
[2021-10-26 23:28:35-0400] INFO - prefect.get_files_single_page[0] | Page start = 50, Page stop = 74, Files Found = 613
[2021-10-26 23:28:35-0400] INFO - prefect.TaskRunner | Task 'get_files_single_page[0]': Finished task run for task with final state: 'Success'
[2021-10-26 23:28:35-0400] INFO - prefect.TaskRunner | Task 'get_files_single_page[1]': Starting task run...
[2021-10-26 23:28:35-0400] INFO - prefect.get_files_single_page[1] | Page start = 75, Page stop = 99, Files Found = 615
[2021-10-26 23:28:35-0400] INFO - p

Create a single archive with the combination of all batched listings archives created above and upload to Azure blob storage

In [14]:
# Plain-Python inputs are prepared up front; only the task calls have to be
# made inside the Flow context
combined_archive_fname = (
    f"combo_batched_listings__{time.strftime('%Y%m%d_%H%M%S')}.zip"
)
batched_archive_paths = glob("batched_listings__*.zip")
combo_blob_name = f"{blob_name_prefix}{blob_name_suffixes['listings_requests']}"

with Flow("Combine and Upload Requests-Based Listings Archives") as flow:
    # Bundle every per-batch archive found in the current directory
    combo_archive = create_archive({combined_archive_fname: batched_archive_paths})
    # `combo_archive` is a list of archive names, so it has to be flattened
    upload_az_file_blobs(
        {combo_blob_name: combo_archive},
        conn_str,
        flatten_filepaths=True,
    )

state_combo = flow.run()
# The archive task's output can be inspected with
# state_combo.result[combo_archive].result

[2021-10-26 23:29:00-0400] INFO - prefect.FlowRunner | Beginning Flow run for 'Combine and Upload Requests-Based Listings Archives'
[2021-10-26 23:29:00-0400] INFO - prefect.TaskRunner | Task 'create_archive': Starting task run...
[2021-10-26 23:29:00-0400] INFO - prefect.create_archive | Creating archive combo_batched_listings__20211026_232900.zip
[2021-10-26 23:29:00-0400] INFO - prefect.TaskRunner | Task 'create_archive': Finished task run for task with final state: 'Success'
[2021-10-26 23:29:00-0400] INFO - prefect.TaskRunner | Task 'List': Starting task run...
[2021-10-26 23:29:00-0400] INFO - prefect.TaskRunner | Task 'List': Finished task run for task with final state: 'Success'
[2021-10-26 23:29:00-0400] INFO - prefect.TaskRunner | Task 'Dict': Starting task run...
[2021-10-26 23:29:00-0400] INFO - prefect.TaskRunner | Task 'Dict': Finished task run for task with final state: 'Success'
[2021-10-26 23:29:00-0400] INFO - prefect.TaskRunner | Task 'upload_az_file_blobs': Starting

<a id="selenium-search-results"></a>

### 2.3. [Selenium Search Results](#selenium-search-results)

Change into the sub-directory containing the downloaded search results pages (`*.parquet.gzip` files) scraped with Selenium

In [15]:
# The relative glob patterns and archive file names used in the next cells
# are resolved against this directory
os.chdir(selenium_files_dir)
print(f"Current working directory is {os.getcwd()}")

Current working directory is /home/edesz/Downloads/web-scraping/data/raw/selenium


Create an archive of batches of search results files, where the size of each batch is the difference between successive elements in the list `d_sr` defined earlier

In [16]:
%%time
with Flow("Archive Search Results Pages Scraped with Selenium") as flow:
    # One mapped task run per batch (one per last-page number in `d_sr`); the
    # remaining arguments are the same for every run, hence `unmapped`
    files = get_files_single_page.map(
        d_sr,
        unmapped(glob("*.parquet.gzip")),
        unmapped("search_results_page_"),
        unmapped("search_results"),
        # The configured batch size equals `d_sr[1] - d_sr[0]`, but does not
        # raise IndexError when `d_sr` holds a single batch
        unmapped(page_batch_size_sr),
    )
    # One archive per batch; batches without any files are skipped by the task
    _ = create_archive.map(files)

state_sr = flow.run()

[2021-10-26 23:29:27-0400] INFO - prefect.FlowRunner | Beginning Flow run for 'Archive Search Results Pages Scraped with Selenium'
[2021-10-26 23:29:27-0400] INFO - prefect.TaskRunner | Task 'get_files_single_page': Starting task run...
[2021-10-26 23:29:27-0400] INFO - prefect.TaskRunner | Task 'get_files_single_page': Finished task run for task with final state: 'Mapped'
[2021-10-26 23:29:27-0400] INFO - prefect.TaskRunner | Task 'get_files_single_page[0]': Starting task run...
[2021-10-26 23:29:27-0400] INFO - prefect.get_files_single_page[0] | Page start = 1, Page stop = 10, Files Found = 9
[2021-10-26 23:29:27-0400] INFO - prefect.TaskRunner | Task 'get_files_single_page[0]': Finished task run for task with final state: 'Success'
[2021-10-26 23:29:27-0400] INFO - prefect.TaskRunner | Task 'get_files_single_page[1]': Starting task run...
[2021-10-26 23:29:27-0400] INFO - prefect.get_files_single_page[1] | Page start = 11, Page stop = 20, Files Found = 10
[2021-10-26 23:29:27-0400] 

Create a single archive with the combination of all batched search results archives created above and upload to Azure blob storage

In [17]:
%%time
# Plain-Python inputs are prepared up front; only the task calls have to be
# made inside the Flow context
combined_archive_fname = (
    f"combo_batched_search_results__{time.strftime('%Y%m%d_%H%M%S')}.zip"
)
batched_archive_paths = glob("batched_search_results__*.zip")
combo_blob_name = f"{blob_name_prefix}{blob_name_suffixes['search_results']}"

with Flow("Combine and Upload Selenium-Based Search Results Archives") as flow:
    # Bundle every per-batch archive found in the current directory
    combo_archive = create_archive({combined_archive_fname: batched_archive_paths})
    # `combo_archive` is a list of archive names, so it has to be flattened
    upload_az_file_blobs(
        {combo_blob_name: combo_archive},
        conn_str,
        flatten_filepaths=True,
    )

state_sr_combo = flow.run()
# The archive task's output can be inspected with
# state_sr_combo.result[combo_archive].result

[2021-10-26 23:29:46-0400] INFO - prefect.FlowRunner | Beginning Flow run for 'Combine and Upload Selenium-Based Search Results Archives'
[2021-10-26 23:29:46-0400] INFO - prefect.TaskRunner | Task 'create_archive': Starting task run...
[2021-10-26 23:29:46-0400] INFO - prefect.create_archive | Creating archive combo_batched_search_results__20211026_232946.zip
[2021-10-26 23:29:46-0400] INFO - prefect.TaskRunner | Task 'create_archive': Finished task run for task with final state: 'Success'
[2021-10-26 23:29:46-0400] INFO - prefect.TaskRunner | Task 'List': Starting task run...
[2021-10-26 23:29:46-0400] INFO - prefect.TaskRunner | Task 'List': Finished task run for task with final state: 'Success'
[2021-10-26 23:29:46-0400] INFO - prefect.TaskRunner | Task 'Dict': Starting task run...
[2021-10-26 23:29:46-0400] INFO - prefect.TaskRunner | Task 'Dict': Finished task run for task with final state: 'Success'
[2021-10-26 23:29:46-0400] INFO - prefect.TaskRunner | Task 'upload_az_file_blob

<a id="requests-search-results"></a>

### 2.4. [Requests Search Results](#requests-search-results)

Change into the sub-directory containing the downloaded search results pages (`search_results_page_*.parquet.gzip` files) scraped with `requests`

In [18]:
# The relative glob patterns and archive file names used in the next cells
# are resolved against this directory
os.chdir(requests_files_dir)
print(f"Current working directory is {os.getcwd()}")

Current working directory is /home/edesz/Downloads/web-scraping/data/raw/requests


Create an archive of batches of search results files, where the size of each batch is the difference between successive elements in the list `d_sr_requests` defined earlier

In [19]:
%%time
with Flow("Archive Search Results Pages Scraped with Requests") as flow:
    # One mapped task run per batch (one per last-page number in
    # `d_sr_requests`); the remaining arguments are the same for every run
    files = get_files_single_page.map(
        d_sr_requests,
        unmapped(glob("search_results_page_*.parquet.gzip")),
        unmapped("search_results_page_"),
        unmapped("search_results"),
        # The configured batch size equals
        # `d_sr_requests[1] - d_sr_requests[0]`, but does not raise IndexError
        # when the list holds a single batch
        unmapped(page_batch_size_sr_requests),
    )
    # One archive per batch; batches without any files are skipped by the task
    _ = create_archive.map(files)

state_sr = flow.run()

[2021-10-26 23:30:03-0400] INFO - prefect.FlowRunner | Beginning Flow run for 'Archive Search Results Pages Scraped with Requests'
[2021-10-26 23:30:03-0400] INFO - prefect.TaskRunner | Task 'get_files_single_page': Starting task run...
[2021-10-26 23:30:03-0400] INFO - prefect.TaskRunner | Task 'get_files_single_page': Finished task run for task with final state: 'Mapped'
[2021-10-26 23:30:03-0400] INFO - prefect.TaskRunner | Task 'get_files_single_page[0]': Starting task run...
[2021-10-26 23:30:03-0400] INFO - prefect.get_files_single_page[0] | Page start = 50, Page stop = 74, Files Found = 25
[2021-10-26 23:30:03-0400] INFO - prefect.TaskRunner | Task 'get_files_single_page[0]': Finished task run for task with final state: 'Success'
[2021-10-26 23:30:03-0400] INFO - prefect.TaskRunner | Task 'get_files_single_page[1]': Starting task run...
[2021-10-26 23:30:03-0400] INFO - prefect.get_files_single_page[1] | Page start = 75, Page stop = 99, Files Found = 25
[2021-10-26 23:30:03-0400

Create a single archive with the combination of all batched search results archives created above and upload to Azure blob storage

In [20]:
%%time
# Plain-Python inputs are prepared up front; only the task calls have to be
# made inside the Flow context
combined_archive_fname = (
    f"combo_batched_search_results__{time.strftime('%Y%m%d_%H%M%S')}.zip"
)
batched_archive_paths = glob("batched_search_results__*.zip")
combo_blob_name = (
    f"{blob_name_prefix}{blob_name_suffixes['search_results_requests']}"
)

with Flow("Combine and Upload Requests-Based Search Results Archives") as flow:
    # Bundle every per-batch archive found in the current directory
    combo_archive = create_archive({combined_archive_fname: batched_archive_paths})
    # `combo_archive` is a list of archive names, so it has to be flattened
    upload_az_file_blobs(
        {combo_blob_name: combo_archive},
        conn_str,
        flatten_filepaths=True,
    )

state_sr_combo = flow.run()
# The archive task's output can be inspected with
# state_sr_combo.result[combo_archive].result

[2021-10-26 23:30:46-0400] INFO - prefect.FlowRunner | Beginning Flow run for 'Combine and Upload Requests-Based Search Results Archives'
[2021-10-26 23:30:46-0400] INFO - prefect.TaskRunner | Task 'create_archive': Starting task run...
[2021-10-26 23:30:46-0400] INFO - prefect.create_archive | Creating archive combo_batched_search_results__20211026_233046.zip
[2021-10-26 23:30:46-0400] INFO - prefect.TaskRunner | Task 'create_archive': Finished task run for task with final state: 'Success'
[2021-10-26 23:30:46-0400] INFO - prefect.TaskRunner | Task 'List': Starting task run...
[2021-10-26 23:30:46-0400] INFO - prefect.TaskRunner | Task 'List': Finished task run for task with final state: 'Success'
[2021-10-26 23:30:46-0400] INFO - prefect.TaskRunner | Task 'Dict': Starting task run...
[2021-10-26 23:30:46-0400] INFO - prefect.TaskRunner | Task 'Dict': Finished task run for task with final state: 'Success'
[2021-10-26 23:30:46-0400] INFO - prefect.TaskRunner | Task 'upload_az_file_blob

---

<span style="float:left">
    <a href="./7_eda.ipynb"><< 7 - Exploratory Data Analysis</a>
</span>

<span style="float:right">
    <a href="./9_download_cloud.ipynb">9 - Download all scraped data (with requests and selenium) from cloud storage >></a>
</span>