
# OpenAI Caribbean Challenge Ingest

* Runtime: ~1 min
* Compute: 2 GB memory

In [0]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import requests
import os
import json
from loguru import logger
from typing import Literal
from smart_open import open

from src.utils.config import LOCAL_DATA_DIR, S3_DATA_DIR

In [0]:
# Where the downloaded files are written: the local data dir or the S3 data dir.
DESTINATION: Literal["LOCAL", "S3"] = "S3"

# Sub-path appended to the selected data root to form BASE_DIR.
DATASET_PREFIX: str = "/openai_caribbean/submission_github_data"

# Data root for each supported destination (values come from src.utils.config).
_DATA_ROOTS: dict = {"LOCAL": LOCAL_DATA_DIR, "S3": S3_DATA_DIR}

_data_root = _DATA_ROOTS.get(DESTINATION)

# Validate with an explicit exception instead of `assert` (asserts are skipped
# under `python -O`), and do it before concatenating so an unknown destination
# or an unset root fails with this message rather than a TypeError.
if _data_root is None:
    raise ValueError("Download Destination not Set")

BASE_DIR: str = _data_root + DATASET_PREFIX

print(f"{BASE_DIR=}")

BASE_DIR='s3://alivio/datasets/openai_caribbean/submission_github_data'


In [0]:
logger.info("Downloading OpenAI Caribbean Challenge Dataset")
base_url: str = "https://raw.githubusercontent.com/drivendataorg/open-ai-caribbean/main/1st%20Place/data"
train_url: str = base_url + "/train.geojson"
test_url: str = base_url + "/test.geojson"

# Seconds to wait for the server to connect / send data before giving up.
# Without a timeout, requests can block forever on a stalled connection.
REQUEST_TIMEOUT_S: float = 60


def _fetch_json(url: str) -> dict:
    """Download `url` and return its body parsed as JSON.

    Args:
        url: Address of the JSON document to download.

    Returns:
        The decoded JSON payload (presumably a GeoJSON object, given the
        `.geojson` file names -- not verified here).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within
            REQUEST_TIMEOUT_S seconds.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT_S)
    # Surface HTTP failures directly instead of as a JSON decode error on the
    # error page body.
    response.raise_for_status()
    return response.json()


train_response: dict = _fetch_json(train_url)
test_response: dict = _fetch_json(test_url)

2024-02-10 06:13:42.897 | INFO     | __main__:<module>:1 - Downloading OpenAI Caribbean Challenge Dataset


In [0]:
# Only the local destination gets its target directory created up front.
if DESTINATION == "LOCAL":
    os.makedirs(BASE_DIR, exist_ok=True)

# Write each downloaded payload under BASE_DIR as 4-space-indented JSON.
# `open` is the one imported from smart_open at the top of the notebook.
for file_suffix, payload in (
    ("/train.geojson", train_response),
    ("/test.geojson", test_response),
):
    with open(BASE_DIR + file_suffix, "w") as sink:
        json.dump(payload, sink, indent=4)

logger.info(f"OpenAI Caribbean Challenge Dataset saved to {BASE_DIR}")

2024-02-10 06:13:46.363 | INFO     | __main__:<module>:10 - OpenAI Caribbean Challenge Dataset saved to s3://alivio/datasets/openai_caribbean/submission_github_data
