## Imports

In [2]:
import requests
import json
import gzip
import shutil
import time
import os
from io import BytesIO
import boto3
from botocore import UNSIGNED
from botocore.config import Config

In [3]:
# Set up S3 client with unsigned configuration for public access
s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED), region_name='us-west-2')

# Define the bucket name
bucket_name = 'vcthackathon-data'

# Specify the league and year to explore
LEAGUE = "vct-international"
YEAR = 2024

def list_s3_objects(prefix=''):
    """
    List objects in an S3 bucket given a prefix.
    """
    try:
        # List objects with the specified prefix
        response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)

        if 'Contents' in response:
            for obj in response['Contents']:
                print(obj['Key'])
        else:
            print("No objects found with the given prefix.")

    except Exception as e:
        print(f"An error occurred: {str(e)}")

# Explore the vct-international league for the specified year

prefix = f'{LEAGUE}/'
#prefix = ''
print(f"Exploring objects in {prefix}:\n")
list_s3_objects(prefix)


Exploring objects in vct-international/:

vct-international/esports-data/leagues.json.gz
vct-international/esports-data/mapping_data.json.gz
vct-international/esports-data/mapping_data_v2.json.gz
vct-international/esports-data/players.json.gz
vct-international/esports-data/teams.json.gz
vct-international/esports-data/tournaments.json.gz
vct-international/games/2022/val:004b09b1-4dc9-4185-baff-9b1c66b3ef99.json.gz
vct-international/games/2022/val:027d5418-4afc-4bfc-9308-238f3742d3cf.json.gz
vct-international/games/2022/val:02fe60b7-71cf-4d3c-90ec-3e0b5ee1c545.json.gz
vct-international/games/2022/val:06444f33-6b5f-4dc9-ba29-9bc39e45d371.json.gz
vct-international/games/2022/val:07a8f609-6c81-4a42-b685-246ccdecc52e.json.gz
vct-international/games/2022/val:080c51d4-9bdf-4e63-be96-508c66b9a492.json.gz
vct-international/games/2022/val:0c382722-7789-461c-865b-50611b9e4e6f.json.gz
vct-international/games/2022/val:133d0c97-961e-4454-9e7b-b58cd76e644a.json.gz
vct-international/games/2022/val:1370

In [9]:
# Base URL of the S3 bucket
S3_BUCKET_URL = "https://vcthackathon-data.s3.us-west-2.amazonaws.com"

def download_and_peek(file_path, num_items=5):
    """
    Downloads a gzipped file from S3, decompresses it, and reads the first few values.
    Supports both JSON and XML formats.
    
    :param file_path: The path of the file in the S3 bucket.
    :param num_items: Number of items or lines to preview.
    """
    # Construct the full URL
    file_url = f"{S3_BUCKET_URL}/{file_path}"
    response = requests.get(file_url, stream=True)
    
    if response.status_code == 200:
        gzip_bytes = BytesIO(response.content)
        with gzip.GzipFile(fileobj=gzip_bytes, mode="rb") as gzipped_file:
            if file_path.endswith('.json.gz'):
                # Handle JSON files
                data = json.load(gzipped_file)
                # Print the first few items if it's a list or a few key-value pairs if it's a dictionary
                if isinstance(data, list):
                    print(json.dumps(data[:num_items], indent=4))
                elif isinstance(data, dict):
                    # Display a subset of key-value pairs
                    items = list(data.items())[:num_items]
                    print(json.dumps(dict(items), indent=4))
                else:
                    print("Unsupported JSON structure")
            elif file_path.endswith('.xml.gz'):
                # Handle XML files
                xml_content = gzipped_file.read()
                root = ET.fromstring(xml_content)
                # Print the first few elements in the XML
                for elem in list(root)[:num_items]:
                    print(ET.tostring(elem, encoding='utf8').decode('utf8'))
            else:
                print(f"Unsupported file type: {file_path}")
    else:
        print(f"Failed to download {file_path}. Status code: {response.status_code}")

# Examples of exploring files from the output list
files_to_explore = [
    'vct-international/games/2022/val:004b09b1-4dc9-4185-baff-9b1c66b3ef99.json.gz'
]

# Iterate and explore each file
for file_path in files_to_explore:
    print(f"Exploring {file_path}:")
    download_and_peek(file_path, num_items=5)  # Adjust num_items as needed
    print("\n" + "-"*80 + "\n")


Exploring vct-international/games/2022/val:004b09b1-4dc9-4185-baff-9b1c66b3ef99.json.gz:
[
    {
        "platformGameId": "val:004b09b1-4dc9-4185-baff-9b1c66b3ef99",
        "metadata": {
            "serverInfo": {
                "processId": "",
                "rfc190Scope": "aresriot.aws-rclusterprod-usw2-1.tournament-gp-offline-8.ares.gameserver"
            },
            "playback": 0,
            "sequenceNumber": 0,
            "stage": 1,
            "gameVersion": "05.00.00.729462",
            "gameId": {
                "value": "004b09b1-4dc9-4185-baff-9b1c66b3ef99"
            },
            "eventTime": {
                "includedPauses": "0.042s",
                "omittingPauses": "0.042s"
            },
            "wallTime": "2022-07-10T17:54:30.960Z"
        },
        "snapshot": {
            "observers": [
                {
                    "displayName": "CentralOBS04",
                    "observerId": {
                        "value": 11
               

In [5]:
import requests
import gzip
import os

S3_BUCKET_URL = "https://vcthackathon-data.s3.us-west-2.amazonaws.com"

def download_and_save_gz_file(file_path, save_directory='.'):
    """
    Downloads a gzipped file from S3 and saves it locally without decompressing in memory.
    
    :param file_path: The path of the file in the S3 bucket.
    :param save_directory: The local directory where the file will be saved.
    """
    file_url = f"{S3_BUCKET_URL}/{file_path}"
    response = requests.get(file_url, stream=True)
    
    if response.status_code == 200:
        # Get the base filename without .gz extension
        if file_path.endswith('.gz'):
            filename = os.path.basename(file_path)[:-3]  # Remove .gz
        else:
            filename = os.path.basename(file_path)
            
        os.makedirs(save_directory, exist_ok=True)
        save_path = os.path.join(save_directory, filename)

        # Stream download and decompress directly to disk
        with open(save_path, 'wb') as f_out:
            with gzip.open(response.raw, 'rb') as gzipped_file:
                while True:
                    chunk = gzipped_file.read(1024 * 1024)  # Read 1MB at a time
                    if not chunk:
                        break
                    f_out.write(chunk)
        print(f"Downloaded and saved {file_path} as {save_path}")
    else:
        print(f"Failed to download {file_path}. Status code: {response.status_code}")

# List of files to download
files_to_download = [
    'vct-international/games/2022/val:2280e732-844c-4124-a1da-6eafdc47eac4.json.gz'
]

# Download and save each file
for file_path in files_to_download:
    download_and_save_gz_file(file_path, save_directory='data')


Downloaded and saved vct-international/games/2022/val:2280e732-844c-4124-a1da-6eafdc47eac4.json.gz as data\val:2280e732-844c-4124-a1da-6eafdc47eac4.json


In [6]:
import requests
import gzip
import os

S3_BUCKET_URL = "https://vcthackathon-data.s3.us-west-2.amazonaws.com"

def download_and_save_gz_file(file_path, save_directory='.', new_filename=None):
    """
    Downloads a gzipped file from S3 and saves it locally with a custom filename.
    
    :param file_path: The path of the file in the S3 bucket.
    :param save_directory: The local directory where the file will be saved.
    :param new_filename: Custom name for the saved file (without the .gz extension).
    """
    file_url = f"{S3_BUCKET_URL}/{file_path}"
    response = requests.get(file_url, stream=True)
    
    if response.status_code == 200:
        # Get the base filename without .gz extension
        if new_filename:
            filename = new_filename  # Use the custom filename if provided
        else:
            filename = os.path.basename(file_path)[:-3]  # Remove .gz by default
            
        os.makedirs(save_directory, exist_ok=True)
        save_path = os.path.join(save_directory, filename)

        # Stream download and decompress directly to disk
        with open(save_path, 'wb') as f_out:
            with gzip.open(response.raw, 'rb') as gzipped_file:
                while True:
                    chunk = gzipped_file.read(1024 * 1024)  # Read 1MB at a time
                    if not chunk:
                        break
                    f_out.write(chunk)
        print(f"Downloaded and saved {file_path} as {save_path}")
    else:
        print(f"Failed to download {file_path}. Status code: {response.status_code}")

# List of files to download
files_to_download = [
    'vct-international/games/2022/val:2280e732-844c-4124-a1da-6eafdc47eac4.json.gz'
]

# Download and save each file with a custom name
for file_path in files_to_download:
    # Specify the desired custom filename here
    custom_filename = "game_data_2022.json"  # Change to whatever you want
    download_and_save_gz_file(file_path, save_directory='data', new_filename=custom_filename)


Downloaded and saved vct-international/games/2022/val:2280e732-844c-4124-a1da-6eafdc47eac4.json.gz as data\game_data_2022.json
