# Scraping Wikipedia pages using geosearch

This notebook uses the Wikipedia API to run a geosearch (by latitude, longitude and radius) for matching Wikipedia articles, and then retrieves the intro text of each matched article. The results are exported to a CSV file and visualized on a map. A final section retrieves locations (for example museums) from OpenStreetMap and exports and maps them in the same way.

See MediaWiki API: https://www.mediawiki.org/wiki/Extension:GeoData

Written by Dennis van den Berg.

## Libraries

In [1]:
import requests
import pandas as pd
from osmxtract import overpass, location
import geopandas as gpd
import folium
from folium import plugins
import base64
from IPython.display import HTML
%matplotlib inline

## Functions

In [41]:
def chunks(list, chunksize):
    """Lazily split ``list`` into consecutive slices of ``chunksize`` items.

    Slices are produced in order; the last one is shorter when the length of
    ``list`` is not a multiple of ``chunksize``.
    """
    # The parameter is called ``list`` (shadowing the builtin inside this
    # function) because callers pass it by keyword.
    yield from (
        list[offset:offset + chunksize]
        for offset in range(0, len(list), chunksize)
    )


def wikipedia_get_intro_texts(page_names, query_size=10):
    """Get the intro text ("extract") for a list of English-Wikipedia pages.

    The titles are sent to the MediaWiki API (``prop=extracts`` with
    ``exintro``) in batches of ``query_size`` titles per request.

    Parameters
    ----------
    page_names : list of str
        Page titles to look up.
    query_size : int, default 10
        Number of titles per API request.

    Returns
    -------
    pandas.DataFrame
        One row per page object returned by the API, with the fields the API
        reports as columns. In the ``extract`` column newlines are replaced by
        a space and carriage returns are removed. When ``page_names`` is empty
        an empty frame with the columns ``pageid``, ``ns``, ``title`` and
        ``extract`` is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    api_url = "https://en.wikipedia.org/w/api.php"

    # Collect one dataframe per request and concatenate once at the end
    # (DataFrame.append was removed in pandas 2.0 and was quadratic in a loop).
    frames = []

    # Split list of page_names into chunks of length query_size and do multiple queries
    for page_names_chunk in chunks(list=page_names, chunksize=query_size):

        # Let requests URL-encode the parameters, so titles containing
        # characters such as '&', '+' or '#' do not corrupt the query string.
        params = {
            "action": "query",
            "prop": "extracts",
            "format": "json",
            "exintro": "",  # presence of the flag is what matters to the API
            "titles": "|".join(page_names_chunk),
        }

        # API call; fail loudly on HTTP errors instead of on the JSON lookup below
        response = requests.get(api_url, params=params, timeout=30)
        response.raise_for_status()

        # Unpack json: 'pages' maps page id -> page fields, so transpose to
        # get one row per page
        pages = response.json()['query']['pages']
        frames.append(pd.DataFrame(pages).transpose())

    # Nothing was requested: return an empty result instead of failing below
    if not frames:
        return pd.DataFrame(columns=['pageid', 'ns', 'title', 'extract'])

    df_intro_texts_all = pd.concat(frames)

    # Cleaning: strip line breaks inside the 'extract' text. Both steps must be
    # string (substring) replacements; Series.replace would only match cells
    # whose entire value equals '\r'.
    if 'extract' in df_intro_texts_all.columns:
        df_intro_texts_all['extract'] = (
            df_intro_texts_all['extract']
            .str.replace('\n', ' ', regex=False)
            .str.replace('\r', '', regex=False)
        )

    return df_intro_texts_all


def wikipedia_geosearch(lat, lon, radius_meters = 10000, max_results = 500, output_file=None):
    """Find English-Wikipedia articles near a coordinate, with their intro texts.

    Runs a MediaWiki ``list=geosearch`` query around ``lat``/``lon`` and then
    looks up the intro text of every matched page with
    ``wikipedia_get_intro_texts``.

    Parameters
    ----------
    lat, lon : float
        Centre of the search, sent to the API as ``gscoord``.
    radius_meters : int, default 10000
        Search radius, sent to the API as ``gsradius``.
    max_results : int, default 500
        Maximum number of matches, sent to the API as ``gslimit``.
    output_file : str or None, default None
        If given, the result is also written to this path as a tab-separated
        file without the index.

    Returns
    -------
    pandas.DataFrame
        The geosearch matches, left-joined with the intro texts on the columns
        both frames share. An empty frame is returned when nothing matches.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    # Wikipedia geosearch query; requests takes care of URL-encoding
    params = {
        "action": "query",
        "list": "geosearch",
        "format": "json",
        "gslimit": max_results,
        "gsradius": radius_meters,
        "gscoord": "{}|{}".format(lat, lon),
    }

    # API call for pages matching location
    geo_results = requests.get("https://en.wikipedia.org/w/api.php", params=params, timeout=30)
    geo_results.raise_for_status()

    # Unpack json and convert to dataframe
    df_geo_results = pd.DataFrame(geo_results.json()['query']['geosearch'])

    if df_geo_results.empty:
        # No matches: there is no 'title' column to look up. Return an empty
        # frame carrying the columns seen on a non-empty result so that
        # downstream column selections keep working.
        df_geo_results = pd.DataFrame(
            columns=['pageid', 'ns', 'title', 'lat', 'lon', 'dist', 'primary', 'extract']
        )
    else:
        # API call for intro texts of pages
        page_names = df_geo_results['title'].tolist()
        df_intro_texts = wikipedia_get_intro_texts(page_names)

        # Merge intro_texts into geo_results (joins on all shared columns)
        df_geo_results = df_geo_results.merge(df_intro_texts, how='left')

    # Write to file if needed; report only after the write succeeded
    if output_file is not None:
        df_geo_results.to_csv(output_file, sep="\t", index=False)
        print("Results saved to file:", output_file)

    return df_geo_results


def visualize_map(df, label_var = 'title'):
    """Visualize lat/lon locations on a Folium map with clustered markers.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``lat``, ``lon`` and ``label_var``. Rows with
        a missing ``lat`` or ``lon`` are skipped.
    label_var : str, default 'title'
        Column whose value is shown in each marker's popup. Rows where it is
        missing get a marker without a popup.

    Returns
    -------
    folium.Map
        Map centred on the first row that has coordinates, or a default map
        without markers when there is nothing to plot.
    """
    # Nothing to plot: return a default map rather than failing on iloc[0]
    if df.empty:
        return folium.Map()

    # Markers need both coordinates
    df_located = df.dropna(subset=['lat', 'lon'])
    if df_located.empty:
        return folium.Map()

    # Start lat/lon as plain floats (not one-element Series)
    start_latitude = float(df_located['lat'].iloc[0])
    start_longitude = float(df_located['lon'].iloc[0])

    # Initialize map
    marker_map = folium.Map(location=[start_latitude, start_longitude], zoom_start=12)

    # Instantiate a mark cluster object for the locations in the dataframe
    marker_cluster = plugins.MarkerCluster().add_to(marker_map)

    # Loop through the dataframe and add each data point to the mark cluster
    for lat, lon, label in zip(df_located['lat'], df_located['lon'], df_located[label_var]):
        # Stringify so numeric labels are displayed too; skip the popup when
        # the label is missing instead of showing "nan".
        popup = None if pd.isna(label) else folium.Popup(str(label))

        folium.Marker(
            location=[lat, lon],
            icon=None,
            popup=popup,
        ).add_to(marker_cluster)

    # Returned so the notebook displays it as the cell result
    return marker_map


def create_download_link(df, title = "Download CSV file", filename = "data.csv"):
    """Build an HTML download link that embeds ``df`` as a CSV file.

    The frame is serialised with ``df.to_csv()`` (index included), base64
    encoded and placed in a ``data:`` URI, so no file is written.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to offer for download.
    title : str
        Link text.
    filename : str
        File name suggested through the link's ``download`` attribute.

    Returns
    -------
    IPython.display.HTML
        The rendered ``<a>`` element.
    """
    encoded_csv = base64.b64encode(df.to_csv().encode()).decode()
    link_template = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    return HTML(link_template.format(filename=filename, payload=encoded_csv, title=title))


def get_osm_results(search_location, tag = 'tourism', tag_values = ['museum'], max_distance = 10000):
    """
    Query OpenStreetMap (through Overpass) for features tagged <tag>=<tag_values>
    inside a buffer of 'max_distance' around the geocoded 'search_location'.

    Returns a GeoDataFrame built from the point features of the response,
    with extra 'lon' and 'lat' columns taken from the geometry.
    """
    # NOTE(review): 'max_distance' is handed to location.from_buffer as
    # buffer_size; its unit is presumably meters - confirm against osmxtract.

    # Geocode the place name, then turn a buffer around it into a bounding box
    center_lat, center_lon = location.geocode(search_location)
    bounding_box = location.from_buffer(center_lat, center_lon, buffer_size=max_distance)

    # Build the Overpass QL query, run it, and read the response as GeoJSON points
    osm_response = overpass.request(
        overpass.ql_query(bounding_box, tag=tag, values=tag_values)
    )
    point_features = overpass.as_geojson(osm_response, 'point')

    # GeoJSON features -> GeoDataFrame, plus plain coordinate columns
    osm_points = gpd.GeoDataFrame.from_features(point_features)
    osm_points['lon'] = osm_points.geometry.x
    osm_points['lat'] = osm_points.geometry.y

    return osm_points

## Retrieve data and convert to downloadable format

In [57]:
# Settings: where to search, how many matches to request, where to save them
search_location = 'Utrecht'
max_results = 200
output_file = "{}.csv".format(search_location)

# Geocode the place name. To skip geocoding, set the coordinates by hand
# instead, e.g. latitude = 52.0894444 and longitude = 5.1077981.
latitude, longitude = location.geocode(search_location)

# Search wikipedia by location. The results are written to output_file by this
# call (tab-separated), i.e. before the export columns below are added.
df_wiki_locations = wikipedia_geosearch(
    lat=latitude,
    lon=longitude,
    max_results=max_results,
    output_file=output_file,
)

# Add the constant columns required by the data export format
df_wiki_locations = df_wiki_locations.assign(
    type=9,
    picture="blank.jpg",
    col7="XYZ",
    col9="XYZ",
    col10="XYZ",
    col11="BLANK",
)

# Select the export columns in the order the export format expects
df_download = df_wiki_locations[[
    'type', 'pageid', 'title', 'picture', 'lon', 'lat',
    'col7', 'extract', 'col9', 'col10', 'col11',
]]

# Make downloadable (the link is displayed as the cell output)
create_download_link(df_download, filename=output_file)

Results saved to file: Utrecht.csv


## Explore data

In [5]:
# Preview the first rows of the Wikipedia matches
df_wiki_locations.head()

Unnamed: 0,pageid,ns,title,lat,lon,dist,primary,extract,type,picture,col7,col9,col10,col11
0,8332984,0,Utrecht Centraal railway station,52.089167,5.109722,135.0,,"<p class=""mw-empty-elt""> </p> <p><b>Utrecht Ce...",9,blank.jpg,XYZ,XYZ,XYZ,BLANK
1,8539939,0,Catharijne,52.089722,5.111667,266.1,,<p><b>Catharijne</b> is a former municipality ...,9,blank.jpg,XYZ,XYZ,XYZ,BLANK
2,35615082,0,Rabobank Bestuurscentrum,52.086944,5.108611,283.5,,<p>The <b>Rabobank Bestuurscentrum</b> or <b>R...,9,blank.jpg,XYZ,XYZ,XYZ,BLANK
3,32706904,0,Jaarbeurs,52.0868,5.10548,334.0,,<p>The <b>Jaarbeurs</b> (Yearly Fair) is an ex...,9,blank.jpg,XYZ,XYZ,XYZ,BLANK
4,28276794,0,TivoliVredenburg,52.092222,5.113333,488.3,,<p>The <b>TivoliVredenburg</b> is a contempora...,9,blank.jpg,XYZ,XYZ,XYZ,BLANK


## Visualize on a map

In [6]:
# Plot the Wikipedia matches as clustered markers; popups show the article title
visualize_map(df_wiki_locations)

## Retrieve locations from OpenStreetMap

In [50]:
# Settings
# NOTE(review): search_location and output_file reuse (and overwrite) the names
# set in the Wikipedia retrieval cell; re-run that cell before re-using them there.
search_location = 'Utrecht Lunetten'
# OSM tag key and the accepted values for it (here: tourism=museum)
tag = 'tourism'
tag_values = ['museum']
# Size of the search buffer around the location - presumably meters, TODO confirm
max_distance = 50000
output_file = 'museums.csv'

# Get OSM results
df_results = get_osm_results(search_location = search_location, tag = tag, tag_values = tag_values, max_distance = max_distance)

# Display first 3 results
df_results.head(3)

Unnamed: 0,geometry,addr:city,addr:housenumber,addr:postcode,addr:street,name,source,source:date,tourism,website,...,toilets,official_name,contact:email,fax,website_alt,telephone,addr:housename,heritage,lon,lat
0,POINT (5.67194 51.60359),Boekel,22.0,5427BG,Sint Janplein,Heemkundekring Sint Achten op Boeckel,BAG,2014-05-07,museum,http://www.heemkundeboekel.nl/,...,,,,,,,,,5.67194,51.603593
1,POINT (4.43619 51.69284),,,,,Mauritshuis,,,museum,,...,,,,,,,,,4.436189,51.692835
2,POINT (4.60891 51.64640),,,,,Oudheidkamer Willem van Strijen,,,museum,,...,,,,,,,,,4.608911,51.646399


In [51]:
# Save results to file (comma-separated, index included) and offer the same
# data as a download link, displayed as the cell output
df_results.to_csv(output_file)
create_download_link(df_results, title = "Download CSV file", filename = output_file)

In [52]:
# Plot results on map; popups show the OSM 'name' column instead of the default 'title'
visualize_map(df_results, label_var = 'name')