<a href="https://colab.research.google.com/github/elizabethavargas/Dataset-Description-Generation/blob/main/create_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Create Dataset

In [None]:
# packages
import random
import requests
import pandas as pd

### Get list of datasets on NYC Open Data

In [7]:
# Base URL for the NYC Open Data Socrata API
base_url = "https://data.cityofnewyork.us/api/views.json"

# Start from an empty list so that later cells still find `datasets_list`
# (and can handle "nothing listed") when the request below fails.
datasets_list = []

try:
    # A timeout keeps a stalled connection from hanging the kernel forever.
    response = requests.get(base_url, timeout=30)
    response.raise_for_status()  # Raise an exception for bad status codes
except requests.exceptions.RequestException as e:
    print(f"Error fetching data: {e}")
else:
    # Decode in a separate step: recent versions of requests raise a JSON
    # decode error that is *also* a RequestException, so handling it in the
    # same try block would report it as a fetch error instead.
    try:
        datasets_data = response.json()
    except ValueError:
        print("Error decoding JSON response. The response might not be in JSON format.")
    else:
        # Keep only the id and name of each dataset; skip entries that are
        # not JSON objects or that lack either field.
        datasets_list = [
            {'id': dataset['id'], 'name': dataset['name']}
            for dataset in datasets_data
            if isinstance(dataset, dict) and 'id' in dataset and 'name' in dataset
        ]

        # Print confirmation message
        print(f"Successfully listed {len(datasets_list)} datasets.")

Successfully listed 3018 datasets.


In [None]:
# Example of the per-dataset metadata endpoint: https://data.cityofnewyork.us/api/views/hukm-2fri.json


In [9]:
# Randomly select up to 10 datasets from the catalogue listing.
num_datasets_to_query = 10
sample_seed = 42  # fixed so Restart & Run All picks the same datasets; set to None for a fresh sample each run

# random.sample raises ValueError when asked for more items than the list
# holds, so cap the request at the number of datasets actually listed.
sample_size = min(num_datasets_to_query, len(datasets_list))

# A dedicated Random instance keeps the seed from affecting the global RNG.
selected_datasets = random.Random(sample_seed).sample(datasets_list, sample_size)
selected_datasets

[{'id': 'yv4m-nu6d', 'name': '3K Projects by Site Locations'},
 {'id': '8p6c-94pc',
  'name': '"Kids In Motion" Playground Programming: 2016 to 2021'},
 {'id': '3qgi-jrgw', 'name': 'Seating Locations - Map'},
 {'id': 'j9we-9a43', 'name': 'NYCgov Poverty Measure Data (2009)'},
 {'id': '4e2n-s75z',
  'name': 'Suitability of City-Owned and Leased Property for Urban Agriculture (LL 48 of 2011)'},
 {'id': 'unse-x4pq', 'name': '2022 SHSAT + DISCOVERY OVERALL SUMMARY'},
 {'id': 'kd8f-uxui',
  'name': '2017-2018 Physical Education - Supplemental Programs'},
 {'id': '9y58-8zvz',
  'name': 'Waterfront Public Access Areas (WPAAs) - Access Points'},
 {'id': 'w99a-xtai', 'name': 'Fall 2020 Admissions Report LL1486-A Part II'},
 {'id': 'vnz6-h2k4', 'name': 'Historical License Applications'}]

In [12]:
REQUEST_TIMEOUT_SECONDS = 30  # fail instead of hanging the kernel on a stalled connection
SAMPLE_ROW_LIMIT = 1          # number of rows requested from each dataset via `$limit`


def _get_json(url, params=None):
    """Request ``url`` and return its decoded JSON body.

    Args:
        url: Endpoint to request.
        params: Optional query-string parameters forwarded to ``requests.get``.

    Returns:
        The decoded JSON payload.

    Raises:
        requests.exceptions.RequestException: The request failed or the server
            answered with an error status.
        ValueError: The response body could not be decoded as JSON.
    """
    response = requests.get(url, params=params, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()  # Raise an exception for bad status codes
    try:
        return response.json()
    except ValueError as exc:
        # Recent versions of requests raise a JSON decode error that is also a
        # RequestException. Re-raise it as a plain ValueError so the callers'
        # `except ValueError` branch reports it as a decoding problem rather
        # than as a fetch failure.
        raise ValueError(str(exc)) from exc


for dataset in selected_datasets:
    dataset_id = dataset['id']
    dataset_name = dataset['name']
    print(f"\n--- Querying dataset: {dataset_name} (ID: {dataset_id}) ---")

    # Construct the API endpoint URL for the dataset (for data)
    dataset_url = f"https://data.cityofnewyork.us/resource/{dataset_id}.json"
    # Construct the API endpoint URL for the dataset's metadata
    metadata_url = f"https://data.cityofnewyork.us/api/views/{dataset_id}.json"

    try:
        # Fetch a small sample of rows (SAMPLE_ROW_LIMIT)
        print("Fetching data...")
        data = _get_json(dataset_url, params={'$limit': SAMPLE_ROW_LIMIT})

        if data:
            df = pd.DataFrame(data)
            print(f"Successfully retrieved data for {dataset_name}:")
            display(df.head())
        else:
            print(f"No data returned for dataset: {dataset_name}")

    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for {dataset_name}: {e}")
    except ValueError:
        print(f"Error decoding JSON response for data of {dataset_name}. The response might not be in JSON format.")
    except Exception as e:
        print(f"An unexpected error occurred while processing data for {dataset_name}: {e}")

    try:
        # Fetch Metadata
        print("Fetching metadata...")
        metadata = _get_json(metadata_url)

        if metadata:
            print(f"Successfully retrieved metadata for {dataset_name}:")
            # Display the whole metadata dictionary, then the fields of interest.
            display(metadata)
            # .get() so a dataset without a category or description prints
            # None instead of raising KeyError.
            print('SELECT', metadata.get('name'), 'CAT', metadata.get('category'), 'DESC', metadata.get('description'))
        else:
            print(f"No metadata returned for dataset: {dataset_name}")

        # NOTE(review): the loop stops after the first dataset whose metadata
        # request succeeds, so only one dataset is previewed. Remove this
        # `break` to query every entry in `selected_datasets`.
        break

    except requests.exceptions.RequestException as e:
        print(f"Error fetching metadata for {dataset_name}: {e}")
    except ValueError:
        print(f"Error decoding JSON response for metadata of {dataset_name}. The response might not be in JSON format.")
    except Exception as e:
        print(f"An unexpected error occurred while processing metadata for {dataset_name}: {e}")


--- Querying dataset: 3K Projects by Site Locations (ID: yv4m-nu6d) ---
Fetching data...
Successfully retrieved data for 3K Projects by Site Locations:


Unnamed: 0,district,school,borough,location,postcode,latitude,longitude,community_board,council_district,bin,bbl,census_tract,nta
0,2,3K CENTER @ 330 EAST 38TH STREET,MANHATTAN,330 EAST 38TH STREET,10016,40.746742,-73.972741,106,4,1076166,1009437501,78,MN0603


Fetching metadata...
Successfully retrieved metadata for 3K Projects by Site Locations:


{'id': 'yv4m-nu6d',
 'name': '3K Projects by Site Locations',
 'assetType': 'dataset',
 'averageRating': 0,
 'category': 'Education',
 'createdAt': 1542832469,
 'description': '3K Projects by Site Locations',
 'diciBackend': False,
 'displayType': 'table',
 'downloadCount': 1201,
 'hideFromCatalog': False,
 'hideFromDataJson': False,
 'indexUpdatedAt': 1542832881,
 'locked': False,
 'newBackend': True,
 'numberOfComments': 0,
 'oid': 30190517,
 'provenance': 'official',
 'publicationAppendEnabled': False,
 'publicationDate': 1542832786,
 'publicationGroup': 15704724,
 'publicationStage': 'published',
 'rowsUpdatedAt': 1683121099,
 'tableId': 15704724,
 'totalTimesRated': 0,
 'viewCount': 803,
 'viewLastModified': 1683121098,
 'viewType': 'tabular',
 'approvals': [{'reviewedAt': 1542832786,
   'reviewedAutomatically': True,
   'state': 'approved',
   'submissionId': 1077943,
   'submissionObject': 'public_audience_request',
   'submissionOutcome': 'change_audience',
   'submittedAt': 15

SELECT 3K Projects by Site Locations CAT Education DESC 3K Projects by Site Locations
