In [1]:
import os
import time

import geopandas as gpd
import pandas as pd
import requests
from geopandas.tools import sjoin
from shapely.geometry import Point

In [2]:
# Import of the Polygons
# The GeoJSON location can be overridden with the QUARTIERI_GEOJSON environment
# variable so the notebook also runs on machines other than the author's; the
# original absolute path is kept as the default.
POLYGONS_PATH = os.environ.get("QUARTIERI_GEOJSON", "C:/Users/edoar/combined_quartieri.geojson")
gdf_combined = gpd.read_file(POLYGONS_PATH)

# YELP API

In [4]:
# Build the Yelp "location" query strings: "<neighborhood name>, Milan"
neighborhoods = list(map("{}, Milan".format, gdf_combined["Neighborhood"].tolist()))
neighborhoods

['Parco delle Abbazie, Milan',
 'Adriano, Milan',
 'Affori, Milan',
 'Baggio, Milan',
 'Bande Nere, Milan',
 'Barona, Milan',
 'Bicocca, Milan',
 'Bovisasca, Milan',
 'Bovisa, Milan',
 'Brera, Milan',
 'Bruzzano, Milan',
 'Buenos Aires - Venezia, Milan',
 'Cantalupa, Milan',
 'Cascina Triulza - Expo, Milan',
 'Centrale, Milan',
 'Chiaravalle, Milan',
 'Città Studi, Milan',
 'Comasina, Milan',
 'Corsica, Milan',
 'De Angeli - Monte Rosa, Milan',
 'Dergano, Milan',
 'Duomo, Milan',
 'Ex Om - Morivione, Milan',
 'Farini, Milan',
 'GARIBALDI REPUBBLICA, Milan',
 'Gallaratese, Milan',
 'Ghisolfa, Milan',
 'Giambellino, Milan',
 'Giardini Porta Venezia, Milan',
 'Gratosoglio - Ticinello, Milan',
 'Greco, Milan',
 'Guastalla, Milan',
 'Isola, Milan',
 'Lambrate, Milan',
 'Lodi - Corvetto, Milan',
 'Lorenteggio, Milan',
 'Loreto, Milan',
 'Maciachini - Maggiolina, Milan',
 'Magenta - San Vittore, Milan',
 'Maggiore - Musocco, Milan',
 'Mecenate, Milan',
 'Muggiano, Milan',
 'Navigli, Milan',
 

In [5]:
# The Yelp API key is read from the YELP_API_KEY environment variable so the
# secret is never stored in the notebook. Any key that was previously saved in
# this file should be treated as leaked and revoked.
API_KEY = os.environ.get("YELP_API_KEY", "")
if not API_KEY:
    print("WARNING: YELP_API_KEY is not set; Yelp requests will be rejected.")
HEADERS = {"Authorization": f"Bearer {API_KEY}"}

# Constants
# Page size per request. Yelp's documented per-request maximum is 50; 48 is used
# because it divides the 240-result cap evenly (5 pages).
businesses_per_request = 48

def make_request(url, params=None, timeout=10):
    """
    Makes a GET request to the Yelp API.

    Args:
        url: Endpoint URL to call.
        params: Optional dict of query-string parameters.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, `requests` can block indefinitely on a stalled connection.

    Returns:
        The `requests.Response` of a call that did not return a 4xx/5xx status.

    Raises:
        requests.HTTPError: If the response status is 4xx or 5xx.
        requests.Timeout: If the server does not answer within `timeout`.
    """
    response = requests.get(url, headers=HEADERS, params=params, timeout=timeout)
    response.raise_for_status()
    return response

def search_businesses(location, term="museum", limit=businesses_per_request, offset=0):
    """
    Query the Yelp business-search endpoint for one page of results.

    Args:
        location: Free-text location passed to Yelp.
        term: Search term (defaults to "museum").
        limit: Number of businesses requested for this page.
        offset: Number of results to skip (used for pagination).

    Returns:
        The list stored under "businesses" in the JSON reply, or an empty
        list when that key is absent.
    """
    endpoint = "https://api.yelp.com/v3/businesses/search"
    query = dict(location=location, term=term, limit=limit, offset=offset)
    payload = make_request(endpoint, params=query).json()
    return payload.get("businesses", [])

# Initialize data storage
data = []

# Column order of the final frame. Passing it explicitly lets an empty result
# still produce a frame with the expected columns.
MUSEUM_COLUMNS = [
    "Fetch Location",
    "Museum Name",
    "Museum Address",
    "Categories",
    "Average Star Rating",
    "Review Count",
    "Latitude",
    "Longitude",
]

# Stop paging a neighborhood once this many results have been fetched
# (maximum results per query).
MAX_RESULTS_PER_QUERY = 240

try:
    for neighborhood in neighborhoods:  # Loop through each neighborhood
        offset = 0
        while True:  # Keep fetching pages until no more results
            print(f"Fetching museums in {neighborhood} with offset: {offset}...")

            try:
                # Fetch one page of businesses for the current offset and location
                businesses = search_businesses(
                    location=neighborhood,
                    term="museum",
                    limit=businesses_per_request,
                    offset=offset,
                )
            except requests.HTTPError as he:
                # Log the error and skip the rest of this neighborhood
                print(f"HTTP error occurred for {neighborhood}: {he}")
                break
            except requests.RequestException as req_err:
                # Timeouts / connection errors: skip this neighborhood instead
                # of aborting the whole collection run
                print(f"Request failed for {neighborhood}: {req_err}")
                break

            if not businesses:
                # No more businesses to fetch
                print(f"No more museums returned for {neighborhood}.")
                break

            for biz in businesses:
                # `or {}` / `or []` guard against fields present but null
                location_info = biz.get("location") or {}
                coordinates = biz.get("coordinates") or {}
                category_list = [
                    cat["title"] for cat in (biz.get("categories") or []) if cat.get("title")
                ]

                data.append({
                    "Fetch Location": neighborhood,
                    "Museum Name": biz.get("name", "N/A"),
                    "Museum Address": location_info.get("address1", "N/A"),
                    "Categories": ", ".join(category_list) if category_list else "N/A",
                    "Average Star Rating": biz.get("rating", "N/A"),
                    "Review Count": biz.get("review_count", "N/A"),
                    "Latitude": coordinates.get("latitude", None),
                    "Longitude": coordinates.get("longitude", None),
                })

            # Increment offset for the next batch
            offset += len(businesses)

            # Sleep to respect API rate limits
            time.sleep(0.5)

            # Stop once the per-query result cap has been reached
            if offset >= MAX_RESULTS_PER_QUERY:
                print(f"Reached maximum results for {neighborhood}.")
                break

except Exception as e:
    # Best-effort collection: report the problem but keep whatever was gathered
    print(f"An unexpected error occurred: {e}")

# The frame is built outside the try block so that partially collected data is
# not lost and `Museums` is always defined for the following cells.
df = pd.DataFrame(data, columns=MUSEUM_COLUMNS)

# Convert DataFrame to GeoDataFrame.
# Use pd.notna rather than truthiness: a missing coordinate becomes NaN in the
# frame (NaN is truthy) and a legitimate 0.0 coordinate is falsy.
df["geometry"] = [
    Point(lon, lat) if pd.notna(lon) and pd.notna(lat) else None
    for lon, lat in zip(df["Longitude"], df["Latitude"])
]
Museums = gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326")

# Print summary
print(Museums.head())
print(f"Total museums collected: {len(data)}")


Fetching museums in Parco delle Abbazie, Milan with offset: 0...
Fetching museums in Parco delle Abbazie, Milan with offset: 48...
Fetching museums in Parco delle Abbazie, Milan with offset: 96...
Fetching museums in Parco delle Abbazie, Milan with offset: 144...
Fetching museums in Parco delle Abbazie, Milan with offset: 192...
Reached maximum results for Parco delle Abbazie, Milan.
Fetching museums in Adriano, Milan with offset: 0...
HTTP error occurred for Adriano, Milan: 400 Client Error: Bad Request for url: https://api.yelp.com/v3/businesses/search?location=Adriano%2C+Milan&term=museum&limit=48&offset=0
Fetching museums in Affori, Milan with offset: 0...
Fetching museums in Affori, Milan with offset: 48...
Fetching museums in Affori, Milan with offset: 96...
Fetching museums in Affori, Milan with offset: 103...
No more museums returned for Affori, Milan.
Fetching museums in Baggio, Milan with offset: 0...
Fetching museums in Baggio, Milan with offset: 48...
Fetching museums in Ba

In [6]:
# Display the museums GeoDataFrame assembled from the Yelp results
Museums

Unnamed: 0,Fetch Location,Museum Name,Museum Address,Categories,Average Star Rating,Review Count,Latitude,Longitude,geometry
0,"Parco delle Abbazie, Milan",Il Mondo di Leonardo,Piazza della Scala 1,Museums,4.5,8,45.466500,9.189920,POINT (9.18992 45.4665)
1,"Parco delle Abbazie, Milan",Pinacoteca di Brera,Via Brera 28,"Museums, Landmarks & Historical Buildings",4.4,39,45.471962,9.187842,POINT (9.18784 45.47196)
2,"Parco delle Abbazie, Milan",Armani / Silos,Via Bergognone 40,Art Galleries,4.4,5,45.452744,9.164432,POINT (9.16443 45.45274)
3,"Parco delle Abbazie, Milan",Museo del Novecento,Via Marconi 1,Museums,4.2,36,45.463500,9.191450,POINT (9.19145 45.4635)
4,"Parco delle Abbazie, Milan",Museo Nazionale della Scienza e della Tecnologia,Via San Vittore 21,Museums,4.2,20,45.461129,9.171357,POINT (9.17136 45.46113)
...,...,...,...,...,...,...,...,...,...
11497,"Forze Armate, Milan",Municipio di S. Pellegrino Terme - Biblioteca,"Via S. Carlo, SNC","Museums, Libraries",0.0,0,45.837890,9.664110,POINT (9.66411 45.83789)
11498,"Forze Armate, Milan",Antiche Mura,Viale Partigiani 55,Museums,2.0,1,44.902880,8.196650,POINT (8.19665 44.90288)
11499,"Forze Armate, Milan",Fondazione Castello di Padernello,"Via Cavour, 1",Museums,0.0,0,45.359052,9.994929,POINT (9.99493 45.35905)
11500,"Forze Armate, Milan",Seminario Vescovile,"Via Milano, 5","Museums, Elementary Schools, Specialty Schools...",0.0,0,45.142580,10.001750,POINT (10.00175 45.14258)


In [7]:
# Collapse repeated listings: rows that agree on all of the columns below are
# considered the same museum and only the first occurrence is kept.
Museums_nodup = Museums.drop_duplicates(
    subset=[
        "Museum Name",
        "Museum Address",
        "Categories",
        "Average Star Rating",
        "Review Count",
        "geometry",
    ],
    keep="first",
)
Museums_nodup

Unnamed: 0,Fetch Location,Museum Name,Museum Address,Categories,Average Star Rating,Review Count,Latitude,Longitude,geometry
0,"Parco delle Abbazie, Milan",Il Mondo di Leonardo,Piazza della Scala 1,Museums,4.5,8,45.466500,9.189920,POINT (9.18992 45.4665)
1,"Parco delle Abbazie, Milan",Pinacoteca di Brera,Via Brera 28,"Museums, Landmarks & Historical Buildings",4.4,39,45.471962,9.187842,POINT (9.18784 45.47196)
2,"Parco delle Abbazie, Milan",Armani / Silos,Via Bergognone 40,Art Galleries,4.4,5,45.452744,9.164432,POINT (9.16443 45.45274)
3,"Parco delle Abbazie, Milan",Museo del Novecento,Via Marconi 1,Museums,4.2,36,45.463500,9.191450,POINT (9.19145 45.4635)
4,"Parco delle Abbazie, Milan",Museo Nazionale della Scienza e della Tecnologia,Via San Vittore 21,Museums,4.2,20,45.461129,9.171357,POINT (9.17136 45.46113)
...,...,...,...,...,...,...,...,...,...
9930,"Stephenson, Milan",Parrocchia di Ganna - Badia,"Via Perego, 3",Museums,0.0,0,45.902450,8.822370,POINT (8.82237 45.90245)
9945,"Stephenson, Milan",Monastero di Torba,Via Stazione,Museums,3.0,1,45.729044,8.863733,POINT (8.86373 45.72904)
9946,"Stephenson, Milan",Fondazione Francesco Pellin,"Via S. Albino, 24",Museums,0.0,0,45.800990,8.830980,POINT (8.83098 45.80099)
9947,"Stephenson, Milan",Muel Museo Elettronico International,"Via Francesco del Cairo, 4",Museums,0.0,0,45.819400,8.825840,POINT (8.82584 45.8194)


In [30]:
# Both layers have to use the same CRS before they can be spatially joined
print(Museums_nodup.crs)  # Museums CRS
print(gdf_combined.crs)  # Neighborhood polygons CRS

EPSG:4326
EPSG:4326


# Spatial Join

In [33]:
# Spatial join: tag each museum point with the neighborhood polygon it lies within.
# how="left" keeps museums that match no polygon (their "Neighborhood" is NaN).
joined = sjoin(
    left_df=Museums_nodup,
    right_df=gdf_combined,
    how="left",
    predicate="within",
)

# Inspect the joined frame
joined

Unnamed: 0,Fetch Location,Museum Name,Museum Address,Categories,Average Star Rating,Review Count,Latitude,Longitude,geometry,index_right,Neighborhood
0,"Parco delle Abbazie, Milan",Il Mondo di Leonardo,Piazza della Scala 1,Museums,4.5,8,45.466500,9.189920,POINT (9.18992 45.4665),21.0,Duomo
1,"Parco delle Abbazie, Milan",Pinacoteca di Brera,Via Brera 28,"Museums, Landmarks & Historical Buildings",4.4,39,45.471962,9.187842,POINT (9.18784 45.47196),9.0,Brera
2,"Parco delle Abbazie, Milan",Armani / Silos,Via Bergognone 40,Art Galleries,4.4,5,45.452744,9.164432,POINT (9.16443 45.45274),74.0,Tortona
3,"Parco delle Abbazie, Milan",Museo del Novecento,Via Marconi 1,Museums,4.2,36,45.463500,9.191450,POINT (9.19145 45.4635),21.0,Duomo
4,"Parco delle Abbazie, Milan",Museo Nazionale della Scienza e della Tecnologia,Via San Vittore 21,Museums,4.2,20,45.461129,9.171357,POINT (9.17136 45.46113),38.0,Magenta - San Vittore
...,...,...,...,...,...,...,...,...,...,...,...
448,"Stephenson, Milan",Parrocchia di Ganna - Badia,"Via Perego, 3",Museums,0.0,0,45.902450,8.822370,POINT (8.82237 45.90245),,
449,"Stephenson, Milan",Monastero di Torba,Via Stazione,Museums,3.0,1,45.729044,8.863733,POINT (8.86373 45.72904),,
450,"Stephenson, Milan",Fondazione Francesco Pellin,"Via S. Albino, 24",Museums,0.0,0,45.800990,8.830980,POINT (8.83098 45.80099),,
451,"Stephenson, Milan",Muel Museo Elettronico International,"Via Francesco del Cairo, 4",Museums,0.0,0,45.819400,8.825840,POINT (8.82584 45.8194),,


In [35]:
# Spot check: Pinacoteca di Brera should be assigned to the Brera polygon
joined.loc[joined["Museum Name"].eq("Pinacoteca di Brera")]  # correct

Unnamed: 0,Fetch Location,Museum Name,Museum Address,Categories,Average Star Rating,Review Count,Latitude,Longitude,geometry,index_right,Neighborhood
1,"Parco delle Abbazie, Milan",Pinacoteca di Brera,Via Brera 28,"Museums, Landmarks & Historical Buildings",4.4,39,45.471962,9.187842,POINT (9.18784 45.47196),9.0,Brera


In [37]:
# Count the distinct values in each column of the joined frame
joined.nunique()

Fetch Location          12
Museum Name            440
Museum Address         433
Categories              67
Average Star Rating     26
Review Count            43
Latitude               446
Longitude              446
geometry               446
index_right             26
Neighborhood            26
dtype: int64

In [39]:
# Keep only the museums that fell inside a neighborhood polygon (rows left
# unmatched by the spatial join have NaN in "Neighborhood"), renumber the rows
# from 0, and retain only the relevant columns.
PolyMuseums = (
    joined
    .dropna(subset=["Neighborhood"])
    .reset_index(drop=True)
    [[
        "Museum Name",
        "Museum Address",
        "Categories",
        "Average Star Rating",
        "Review Count",
        "geometry",
        "Neighborhood",
    ]]
)
# Display the cleaned GeoDataFrame
PolyMuseums

Unnamed: 0,Museum Name,Museum Address,Categories,Average Star Rating,Review Count,geometry,Neighborhood
0,Il Mondo di Leonardo,Piazza della Scala 1,Museums,4.5,8,POINT (9.18992 45.4665),Duomo
1,Pinacoteca di Brera,Via Brera 28,"Museums, Landmarks & Historical Buildings",4.4,39,POINT (9.18784 45.47196),Brera
2,Armani / Silos,Via Bergognone 40,Art Galleries,4.4,5,POINT (9.16443 45.45274),Tortona
3,Museo del Novecento,Via Marconi 1,Museums,4.2,36,POINT (9.19145 45.4635),Duomo
4,Museo Nazionale della Scienza e della Tecnologia,Via San Vittore 21,Museums,4.2,20,POINT (9.17136 45.46113),Magenta - San Vittore
...,...,...,...,...,...,...,...
93,Resti del Mausoleo Imperiale,Via degli Olivetani 1,"Landmarks & Historical Buildings, Museums",0.0,0,POINT (9.16843 45.46179),Magenta - San Vittore
94,Studio Museo Achille Castiglioni,Piazza Castello 27,Museums,0.0,0,POINT (9.1772 45.46958),Duomo
95,Pietà Rondanini,Piazza Castello 4,Museums,0.0,0,POINT (9.17881 45.46981),Duomo
96,Palazzo Morando,Via Sant'Andrea 6,"Landmarks & Historical Buildings, Museums",4.5,4,POINT (9.19639 45.46877),Duomo


### In contrast with the restaurants dataframe, the data retrieved this time contains no NaNs, so no rows have to be discarded for missing values and the dataframe can be considered of high quality.

In [7]:
# Collect every row that has a missing value in at least one column
nan_rows = PolyMuseums.loc[PolyMuseums.isnull().any(axis="columns")]

nan_rows

Unnamed: 0,Museum Name,Museum Address,Categories,Average Star Rating,Review Count,Neighborhood,geometry
