# Scraping Wikipedia pages using geosearch

This notebook uses the Wikipedia API to run a geosearch (by latitude, longitude and radius) for matching Wikipedia articles, and then retrieves the intro text of each matched article. Results are exported to a tab-separated CSV file and visualized on a map.

See MediaWiki API: https://www.mediawiki.org/wiki/Extension:GeoData

Written by Dennis van den Berg.

## Libraries

In [None]:
import requests
import pandas as pd
import folium
from folium import plugins
%matplotlib inline

## Functions

In [None]:
def chunks(list, chunksize):
    """Generate consecutive slices of ``list``, each at most ``chunksize`` long.

    The last slice is shorter when ``len(list)`` is not a multiple of
    ``chunksize``. The parameter name ``list`` hides the builtin inside this
    function, so the builtin must not be used in the body.
    """
    total = len(list)
    for start in range(0, total, chunksize):
        stop = start + chunksize
        yield list[start:stop]


def wikipedia_get_intro_texts(page_names, query_size=10):
    """Get intro texts for a list of Wikipedia pages.

    The titles are queried in batches of ``query_size`` and the per-batch
    results are stacked into one DataFrame.

    Args:
        page_names: Sequence of page titles to look up.
        query_size: Maximum number of titles sent in a single API request.

    Returns:
        A DataFrame with one row per page entry in the API responses; its
        columns are whatever fields the API returns for a page. Newlines and
        carriage returns are removed from the 'extract' column when that
        column is present. When ``page_names`` is empty, an empty DataFrame
        with 'title' and 'extract' columns is returned without calling the
        API.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
        KeyError: If a response has no ['query']['pages'] entry.
    """
    # Nothing to look up: skip the API entirely instead of failing further
    # down on a missing 'extract' column.
    if len(page_names) == 0:
        return pd.DataFrame(columns=['title', 'extract'])

    api_url = "https://en.wikipedia.org/w/api.php"
    frames = []

    # Split list of page_names into chunks of length query_size and do multiple queries
    for page_names_chunk in chunks(list=page_names, chunksize=query_size):

        # Let requests encode the query string so that titles containing
        # characters such as '&', '+' or '#' cannot corrupt the request.
        params = {
            'action': 'query',
            'prop': 'extracts',
            'format': 'json',
            'exintro': '',
            'titles': '|'.join(page_names_chunk),
        }

        # API call
        response = requests.get(api_url, params=params, timeout=30)
        response.raise_for_status()

        # Unpack json: the pages dict is keyed by page id, so transpose to
        # get one row per page
        frames.append(pd.DataFrame(response.json()['query']['pages']).transpose())

    # DataFrame.append was removed in pandas 2.0; concat is the replacement
    df_intro_texts_all = pd.concat(frames)

    # Cleaning: remove line breaks inside the 'extract' texts. Both calls use
    # the .str accessor so they act on substrings; the column can be absent
    # when none of the returned pages carries an extract.
    if 'extract' in df_intro_texts_all.columns:
        df_intro_texts_all['extract'] = (
            df_intro_texts_all['extract']
            .str.replace('\n', ' ', regex=False)
            .str.replace('\r', '', regex=False)
        )

    return df_intro_texts_all


def wikipedia_geosearch(lat, lon, radius_meters = 10000, max_results = 500, output_file=None):
    """Get Wikipedia articles near a lat/lon coordinate, with their intro texts.

    Args:
        lat: Latitude of the search centre.
        lon: Longitude of the search centre.
        radius_meters: Search radius, sent to the API as ``gsradius``.
        max_results: Maximum number of matches, sent to the API as ``gslimit``.
        output_file: Optional path; when given, the result is written there
            as a tab-separated file without the index.

    Returns:
        A DataFrame with one row per geosearch match, left-merged with the
        intro-text data of the matched pages. When the search has no matches
        the (empty) geosearch DataFrame is returned as is.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
        KeyError: If the response has no ['query']['geosearch'] entry.
    """
    # Wikipedia geosearch query; requests takes care of URL-encoding
    # (notably the '|' that separates latitude and longitude).
    params = {
        'action': 'query',
        'list': 'geosearch',
        'format': 'json',
        'gslimit': max_results,
        'gsradius': radius_meters,
        'gscoord': '{}|{}'.format(lat, lon),
    }

    # API call for pages matching location
    geo_results = requests.get("https://en.wikipedia.org/w/api.php", params=params, timeout=30)
    geo_results.raise_for_status()

    # Unpack json and convert to dataframe
    df_geo_results = pd.DataFrame(geo_results.json()['query']['geosearch'])

    # With zero matches the frame has no 'title' column, so only look up and
    # merge the intro texts when there is something to look up.
    if not df_geo_results.empty:
        page_names = df_geo_results['title'].tolist()
        df_intro_texts = wikipedia_get_intro_texts(page_names)

        # Merge intro_texts into geo_results (on the columns both frames share)
        df_geo_results = df_geo_results.merge(df_intro_texts, how='left')

    # Write to file if needed; report success only after the write finished
    if output_file is not None:
        df_geo_results.to_csv(output_file, sep="\t", index=False)
        print("Results saved to file:", output_file)

    return df_geo_results


def visualize_map(df):
    """Visualize lat/lon locations on a map using Folium.

    Args:
        df: DataFrame with 'lat', 'lon' and 'title' columns. The first row
            sets the initial centre of the map.

    Returns:
        A folium Map with one clustered marker per row. Each marker's popup
        links to ``https://en.wikipedia.org/wiki/<title>``.

    Raises:
        IndexError: If ``df`` has no rows (there is no first row to centre on).
    """
    # Function-scope imports (standard library only) keep the notebook's
    # import cell untouched.
    from html import escape
    from urllib.parse import quote

    if len(df) == 0:
        raise IndexError("cannot visualize an empty DataFrame: no row to centre the map on")

    # Start lat/lon as plain floats (not one-element Series)
    first_row = df.iloc[0]
    start_location = [float(first_row['lat']), float(first_row['lon'])]

    # Initialize map
    wiki_map = folium.Map(location=start_location, zoom_start=12)

    # Instantiate a mark cluster object for the locations in the dataframe
    cluster = plugins.MarkerCluster().add_to(wiki_map)

    # Loop through the dataframe and add each data point to the mark cluster
    for lat, lon, label in zip(df['lat'], df['lon'], df['title']):
        title = str(label)

        # Percent-encode the title for the URL and HTML-escape everything put
        # into the popup markup, since titles come from an external source.
        page_url = 'https://en.wikipedia.org/wiki/' + quote(title.replace(' ', '_'))
        popup_string = '<a href="{}" target="_blank">{}</a>'.format(
            escape(page_url, quote=True), escape(title)
        )

        folium.Marker(
            location=[lat, lon],
            icon=None,
            popup=folium.Popup(popup_string),
        ).add_to(cluster)

    # Return the map so a notebook cell can display it
    return wiki_map


## Retrieve data

In [None]:
# Run the geosearch around the coordinates below (default radius) and save
# the matches to file
df_wiki_locations = wikipedia_geosearch(
    lat=52.0894444,
    lon=5.1077981,
    max_results=200,
    output_file="wikipedia_utrecht.csv",
)

## Explore data

In [None]:
# Show the first five rows of the search results
df_wiki_locations.head(n=5)

## Visualize on a map

In [None]:
# Render the matched articles as clustered markers on a map
visualize_map(df=df_wiki_locations)

## TODO: retrieve images

In [None]:
# Query the images listed for two example pages
url = "https://en.wikipedia.org/w/api.php"

params = {
    'action':'query',
    'format':'json',
    'prop':'images',
    'titles':'Lunetten|Utrecht Centraal railway station'
}

# The context manager closes the session's connections once the request is
# done; the response body has already been read at that point.
with requests.Session() as session:
    request = session.get(url=url, params=params, timeout=30)

# Fail loudly on an HTTP error instead of trying to parse an error page
request.raise_for_status()

data = request.json()

# The pages dict is keyed by page id; transpose to get one row per page
results = pd.DataFrame(data['query']['pages']).transpose()

In [None]:
# Title of the third image listed for the page with id 3044682
page_images = data['query']['pages']['3044682']['images']
page_images[2]['title']