# Combine Individual Files and Delete From Cloud Storage

In [None]:
import os
import shutil
from glob import glob
from zipfile import ZipFile

import pandas as pd
from azure.storage.blob import BlobServiceClient

## About

Download all files uploaded to Azure Blob storage and combine them into one `.parquet.gzip` file per data type (listings or search results) and per retrieval method (Selenium or the Python `requests` library). Finally, delete the downloaded files from Azure Blob storage.

## User Inputs

In [None]:
# Local directory (relative to the notebook's working directory) that
# receives the downloaded archives and the combined output files
data_dir = "data/processed"

In [None]:
# Azure Storage connection string, assembled from environment variables.
# NOTE: a variable that is not set is rendered as the literal text "None".
conn_str = ";".join(
    [
        "DefaultEndpointsProtocol=https",
        f"AccountName={os.environ.get('AZURE_STORAGE_ACCOUNT')}",
        f"AccountKey={os.environ.get('AZURE_STORAGE_KEY')}",
        f"EndpointSuffix={os.environ.get('ENDPOINT_SUFFIX')}",
    ]
)
# Container to read from, and the common prefix of the blob names in it
az_container_name = os.environ.get("AZURE_CONTAINER_NAME")
blob_name_prefix = os.environ.get("BLOB_NAME_PREFIX")

In [None]:
# Blob-name suffix -> name of the dataset stored in that blob. The dataset
# name doubles as the local sub-directory and output file name.
folders_dict = {
    80: "listings",
    81: "search_results",
    82: "listings_requests",
    83: "search_results_requests",
}
# Columns cast to the pandas string dtype after the listings CSV files are
# read, keyed by dataset name
dtypes_dicts = {
    "listings": {
        column: pd.StringDtype() for column in ("Publisher", "Franchise")
    },
    "listings_requests": {
        column: pd.StringDtype()
        for column in (
            "Publisher",
            "Franchise",
            "Title",
            "Release Date",
            "Developer",
        )
    },
}

In [None]:
# Account-level client; the per-blob clients used below are derived from it
blob_service_client = BlobServiceClient.from_connection_string(conn_str)

## Retrieve Files from Azure Cloud Storage and Combine

Download the files from Azure Blob storage and combine them into a single `.parquet.gzip` file for each combination of data type (listings or search results) and retrieval method (Selenium or the Python `requests` library).

Four such files will be produced:
- `listings.parquet.gzip`
  - combination of all listings data retrieved using Selenium
- `listings_requests.parquet.gzip`
  - combination of all listings data retrieved using the Python `requests` library
- `search_results.parquet.gzip`
  - combination of all search results retrieved using Selenium
- `search_results_requests.parquet.gzip`
  - combination of all search results retrieved using the Python `requests` library

In [None]:
%%time
# For every dataset: download its combined archive, unpack it together with
# the batch archives nested inside it, merge the extracted flat files into
# one DataFrame, save that as parquet and remove the intermediate files.
for az_blob_name_suffix, file_substring in folders_dict.items():
    data_sub_dir = f"{data_dir}/{file_substring}"
    # exist_ok=False makes a re-run fail fast when the directory is already
    # there. NOTE(review): presumably this guards against merging stale
    # files left behind by an earlier run - confirm before relaxing it.
    os.makedirs(data_sub_dir, exist_ok=False)
    local_file_path = f"{data_sub_dir}/combo_batched_{file_substring}.zip"

    print(f"Downloading {file_substring} zipped file to {local_file_path}...")
    blob_client = blob_service_client.get_blob_client(
        container=az_container_name, blob=f"{blob_name_prefix}{az_blob_name_suffix}"
    )
    with open(local_file_path, "wb") as download_file:
        # Stream the blob straight to disk instead of buffering the whole
        # archive in memory first
        blob_client.download_blob().readinto(download_file)
    print("Done.")

    print(f"Extracting {file_substring} contents of {local_file_path}...")
    # The outer archive is closed before the nested batch archives are opened
    with ZipFile(local_file_path) as combo_zip:
        combo_zip.extractall(data_sub_dir)
    file_type = "search_results" if "search" in file_substring else "listings"
    batched_files_zip = glob(os.path.join(data_sub_dir, f"batched_{file_type}_*.zip"))
    for batch_zip_path in batched_files_zip:
        with ZipFile(batch_zip_path) as batch_zip:
            batch_zip.extractall(data_sub_dir)
    print("Done.")

    print(f"Reading all {file_substring} flat files into DataFrame...")
    # Search results are stored as parquet files, listings as CSV files
    is_search_results = "search_results" in file_substring
    if is_search_results:
        file_pattern, read_file = "search_results_*.parquet.gzip", pd.read_parquet
    else:
        file_pattern, read_file = "p*_l*_*.csv", pd.read_csv
    filepaths = glob(os.path.join(data_sub_dir, file_pattern))
    if not filepaths:
        # pd.concat() on an empty list fails with an unhelpful message, so
        # report exactly what was expected and where
        raise FileNotFoundError(
            f"No files matching {file_pattern!r} were extracted to {data_sub_dir}"
        )
    # Tag every row with the name of the file it was read from
    df = pd.concat(
        [read_file(f).assign(filename=os.path.basename(f)) for f in filepaths]
    ).reset_index(drop=True)
    if not is_search_results:
        # Only the listings datasets have explicit column dtypes defined
        df = df.astype(dtypes_dicts[file_substring])
    print("Done.")
    print(f"Saving combined {file_substring} contents to disk...")
    df.to_parquet(f"{data_dir}/{file_substring}.parquet.gzip", index=False)
    print("Done.")

    print(f"Deleting intermediate {file_substring} files...")
    for fpath in filepaths:
        os.remove(fpath)
    for fpath in batched_files_zip:
        os.remove(fpath)
    print("Done.")

    # Keep the downloaded archive, but move it out of the sub-directory
    dest_file_path = f"{data_dir}/combo_batched_{file_substring}.zip"
    print(f"Moving {local_file_path}...")
    shutil.move(local_file_path, dest_file_path)
    print("Done.")

In [None]:
print("Deleting empty sub-directories...")
# Build each path from data_dir instead of calling os.chdir(), so the
# notebook's working directory (and with it every relative path used by the
# other cells) is left unchanged and the cells remain safe to re-run.
for file_substring in folders_dict.values():
    # os.rmdir() only removes empty directories and raises OSError otherwise
    os.rmdir(os.path.join(data_dir, file_substring))
print("Done.")

## Delete All Files from Azure Blob Storage

In [None]:
%%time
# Delete each dataset's blob, including any snapshots of it, from the container
for az_blob_name_suffix, file_substring in folders_dict.items():
    target_blob = f"{blob_name_prefix}{az_blob_name_suffix}"
    print(f"Deleting {file_substring} zipped file from {target_blob} blob in Azure blob storage...")
    blob_service_client.get_blob_client(
        container=az_container_name,
        blob=target_blob,
    ).delete_blob(delete_snapshots="include")
    print("Done.")