# Global StreetScapes

In [130]:
from utils import download_from_huggingface
import pandas as pd
import duckdb
import os


## Data Extraction

In [131]:
# Fetch the Global StreetScapes dataset from the Hugging Face Hub.
# The full 'data/' folder is roughly 37GB; to save space, download only the
# files you need instead of the whole folder.

# Which Hub repository to pull from
repo_id = "NUS-UAL/global-streetscapes"
repo_type = "dataset"

# What to pull: the remote 'data/' folder, plus two standalone description files
folder_path = "data/"
file_list = ["cities688.csv", "info.csv"]

# Where the downloads are stored locally
local_dir_folder = "../../data/0-raw/Global-StreetScapes/"
local_dir_files = "../../data/0-raw/Global-StreetScapes/info"

# 1) The complete 'data' folder goes into the raw-data directory
download_from_huggingface(
    repo_id,
    repo_type,
    folder_path=folder_path,
    local_dir=local_dir_folder,
)

# 2) The individual description files go into the 'info' sub-directory
download_from_huggingface(
    repo_id,
    repo_type,
    file_paths=file_list,
    local_dir=local_dir_files,
)


Downloading files:   0%|          | 0/21 [00:00<?, ?file/s]

File 'climate.csv' already exists in the directory. Skipping download.
File 'contextual.csv' already exists in the directory. Skipping download.
File 'ephem.csv' already exists in the directory. Skipping download.
File 'gadm.csv' already exists in the directory. Skipping download.
File 'ghsl.csv' already exists in the directory. Skipping download.
File 'h3.csv' already exists in the directory. Skipping download.
File 'instances.csv' already exists in the directory. Skipping download.
File 'metadata_common_attributes.csv' already exists in the directory. Skipping download.
File 'metadata_kv.csv' already exists in the directory. Skipping download.
File 'metadata_mly1.csv' already exists in the directory. Skipping download.
File 'metadata_mly2.csv' already exists in the directory. Skipping download.
File 'metadata_mly3.csv' already exists in the directory. Skipping download.
File 'metadata_mly4.csv' already exists in the directory. Skipping download.
File 'metadata_mly5.csv' already exist

Downloading files:   0%|          | 0/2 [00:00<?, ?file/s]

File 'cities688.csv' already exists in the directory. Skipping download.
File 'info.csv' already exists in the directory. Skipping download.


## Data Exploration

In [132]:
# Pick a handful of cities that have a good number of images and cover
# different climate zones.
cities_df = pd.read_csv('../../data/0-raw/Global-StreetScapes/info/cities688.csv')

# Cities to keep, and the columns to show for each of them
selected_cities = ['Amsterdam', 'Barcelona', 'Berlin', 'London', 'Athens', 'Lisbon']
selected_columns = ['city', 'img_count', 'continent', 'koppen_geiger_zone', 'zone_description']

# Row filter and column selection in one step, largest image count first
cities_table = (
    cities_df
    .loc[cities_df['city'].isin(selected_cities), selected_columns]
    .sort_values('img_count', ascending=False)
)

# Show the resulting table
cities_table


Unnamed: 0,city,img_count,continent,koppen_geiger_zone,zone_description
456,Berlin,281033,Europe,Cfb,"Marine west coast, warm summer"
396,Athens,106003,Europe,Csa,"Mediterranean, hot summer"
321,Lisbon,87461,Europe,Csa,"Mediterranean, hot summer"
368,London,41837,Europe,Cfb,"Marine west coast, warm summer"
221,Amsterdam,26421,Europe,Cfb,"Marine west coast, warm summer"
508,Barcelona,21218,Europe,Csa,"Mediterranean, hot summer"


In [133]:

# Read the info.csv file
info_df = pd.read_csv('../../data/0-raw/Global-StreetScapes/info/info.csv')

# Keep the 'Filename', 'Overview' and 'Notes' columns, dropping every row
# whose 'Filename' is null, and renumber the remaining rows from 0
display_columns = ['Filename', 'Overview', 'Notes']
display_df = info_df[display_columns].dropna(subset=['Filename']).reset_index(drop=True)

# Show the full text of every column for this one display only.
# option_context restores the previous 'display.max_colwidth' value on exit,
# even if display() raises, so no setting leaks into later cells.
with pd.option_context('display.max_colwidth', None):
    display(display_df)

Unnamed: 0,Filename,Overview,Notes
0,climate.csv,Contains the Koppen climate zone associated with each image's location.,"The calculation is as accurate as the location of the image given by the source, which also relies on the accuracy of the capturing devices. The accuracy could also be affected by the accuracy of the Koppen climate zone classification API from https://github.com/sco-tt/Climate-Zone-API."
1,contextual.csv,Contains the eight contextual attributes inferred for each image.,Please refer to Table 3 in the paper for information on accuracy.
2,ephem.csv,Contains the temporal information of each image calculated using the python package 'PyEphem' with regards to the time of day.,"The accuracy of the calculation is as accurate as the timestamp of the image given by the source, which also relies on the accuracy of the capturing devices. The accuracy could also be affected by the accuracy of PyEphem."
3,gadm.csv,"Contains the administrative area associated with each image, at all available levels.","The calculation is as accurate as the location of the image given by the source, which also relies on the accuracy of the capturing devices. The accuracy could also be affected by the accuracy of the GADM database."
4,ghsl.csv,"Contains the degree of urbanisation associated with the location of the image, calculated using the Global Human Settlement Layer (GHSL).","The calculation is as accurate as the location of the image given by the source, which also relies on the accuracy of the capturing devices. The accuracy could also be affected by the accuracy of the GHSL dataset."
5,h3.csv,"Contains the ID of the h3-indexed hexagon associated with each image, at all available resolution levels from level 0 to 15.","The calculation is as accurate as the location of the image given by the source, which also relies on the accuracy of the capturing devices."
6,instances.csv,Contains the count of instances (65 categories) detected in each image.,Based on panoptic segmentation results obtained with Mask2former model. Accuracy is dependent on model performance.
7,metadata_common_attributes.csv,"Contains the common basic metadata attributes that are provided by both Mapillary and KartaView, and those we computed for both sources.","Accuracy is subject to that of the original metadata provided by Mapillary / KartaView, which also largely depends on the accuracy of data provided by the capturing devices."
8,metadata_kv.csv,Contains the metadata of each SVI originally provided by KartaView.,"The explanation of the fields is largely based on our interpretation of the documentation of KartaView API, which can be incomplete or absent for many attributes."
9,metadata_mly1.csv,Contains the metadata of each SVI originally provided by Mapillary.,"Split into five parts (metadata_mly1, metadata_mly2, metadata_mly3, metadata_mly4, metadata_mly5) to reduce file size."


In [134]:
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, clear_output, HTML

# Load the CSV and propagate Filename values down to the rows that follow
# NOTE(review): .ffill() fills gaps in every column, not only 'Filename' —
# confirm that no row is meant to keep an empty Format/Explanation.
info_df = pd.read_csv('../../data/0-raw/Global-StreetScapes/info/info.csv').ffill()

# Set display options to show full content of all columns
pd.set_option('display.max_colwidth', None)

# Dropdown for Filename selection. Entries, in order:
#   'Select a filename' - placeholder shown initially (the callback only prints a hint for it)
#   'All'               - no filename restriction (the callback skips the filename filter for it)
#   ...                 - every distinct filename found in info.csv
filename_dropdown = widgets.Dropdown(
    options=['Select a filename', 'All'] + info_df['Filename'].unique().tolist(),
    value='Select a filename',  # Default to the placeholder entry
    description='Filename:',
    layout=widgets.Layout(width='30%')
)

# Free-text box used to filter rows by their 'Field' value
field_search = widgets.Text(
    placeholder='Type to search fields...',
    description='Field:',
    layout=widgets.Layout(width='30%')
)

# Output area that the filter callback draws into
output = widgets.Output()

# Callback that filters info_df by the current widget values and renders the result
def filter_data(change=None):
    """Render the rows of ``info_df`` that match the current widget values.

    Registered as the observer of both widgets and also callable directly.
    Each call clears ``output`` and draws exactly one item into it: a hint
    while the placeholder is selected, the matching rows as an HTML table,
    or a "no results" message.

    Args:
        change: Change notification supplied by ``observe``. It is not read;
            the widget values are queried directly. Defaults to ``None`` so
            the function can be called by hand.

    Reads the globals ``info_df``, ``filename_dropdown``, ``field_search``
    and ``output``.
    """
    with output:
        clear_output(wait=True)

        selected_file = filename_dropdown.value

        # If the placeholder option is selected, show a message
        if selected_file == 'Select a filename':
            display(HTML("<p><b>Please select a filename to view the data.</b></p>"))
            return

        # Boolean indexing below returns new frames, so info_df is never modified
        filtered_df = info_df

        # Filter by Filename if not "All"
        if selected_file != 'All':
            filtered_df = filtered_df[filtered_df['Filename'] == selected_file]

        # Filter by field if there is text in the field_search box
        field_query = field_search.value.strip()
        if field_query:
            # regex=False: the text is matched literally, so characters such as
            # '(' or '[' typed by the user cannot raise a regex compile error.
            filtered_df = filtered_df[
                filtered_df['Field'].str.contains(field_query, case=False, na=False, regex=False)
            ]

        # Display the filtered data with adjusted column widths
        if not filtered_df.empty:
            # This callback runs on widget events, i.e. after the cell has
            # finished, so the width limit is lifted here rather than relying
            # on a global option; otherwise to_html() truncates long cells.
            with pd.option_context('display.max_colwidth', None):
                html_table = filtered_df[['Filename', 'Field', 'Format', 'Explanation']].to_html(index=False)
            # NOTE(review): the selectors below are unscoped, so they may also
            # restyle other tables on the page — consider to_html(classes=...).
            styled_table = f"""
            <style>
            table {{width: 100%;}}
            th, td {{
                word-wrap: break-word;
                max-width: 700px;
                overflow: hidden;
                text-overflow: ellipsis;
            }}
            th:nth-child(1), td:nth-child(1) {{max-width: 120px;}}  /* Filename column */
            th:nth-child(4), td:nth-child(4) {{max-width: 600px;}}  /* Explanation column */
            </style>
            {html_table}
            """
            display(HTML(styled_table))
        else:
            display(HTML("<p>No results found.</p>"))

# Lay the two controls out side by side
widget_box = widgets.HBox([filename_dropdown, field_search])

# Re-run the filter whenever the value of either control changes
filename_dropdown.observe(filter_data, names='value')
field_search.observe(filter_data, names='value')

# Show the controls together with the area the filter draws into
display(widget_box, output)

# Draw once now so the initial hint is visible before any interaction
filter_data()

# Put 'display.max_colwidth' back to the pandas default; the widened
# setting therefore only applies to code executed while this cell runs
pd.reset_option('display.max_colwidth')

HBox(children=(Dropdown(description='Filename:', layout=Layout(width='30%'), options=('Select a filename', 'cl…

Output()

## Data Manipulation