# Scraping Wikipedia pages using geosearch

This notebook uses the Wikipedia API to run a geosearch (based on latitude, longitude, and radius) for matching Wikipedia articles, and then retrieves the intro text of each matched article. Results are exported to a CSV file and visualized on a map.

See MediaWiki API: https://www.mediawiki.org/wiki/Extension:GeoData

Written by Dennis van den Berg.

## Libraries

In [1]:
import base64
import html
from urllib.parse import quote

import folium
import pandas as pd
import requests
from folium import plugins
from IPython.display import HTML
%matplotlib inline

## Functions

In [2]:
def chunks(list, chunksize):
    """Generate consecutive slices of ``list``, each at most ``chunksize`` long.

    Slices are produced in order starting at index 0; the final slice may be
    shorter than ``chunksize`` when the length is not an exact multiple.
    """
    # The parameter name ``list`` shadows the builtin, but it is part of the
    # interface: callers pass it by keyword.
    total = len(list)
    yield from (list[start:start + chunksize] for start in range(0, total, chunksize))


def wikipedia_get_intro_texts(page_names, query_size=10):
    """Fetch the intro extract of each Wikipedia page in ``page_names``.

    Titles are sent to the English Wikipedia API (``action=query``,
    ``prop=extracts``, ``exintro``) in batches of ``query_size`` titles.

    Parameters
    ----------
    page_names : list of str
        Page titles to look up.
    query_size : int, default 10
        Number of titles per API request.
        NOTE(review): the extracts API limits how many extracts one request
        returns (``exlimit``) -- confirm the limit before raising this value.

    Returns
    -------
    pandas.DataFrame
        One row per page in the API responses, indexed by the page keys of
        the JSON payload. The ``extract`` column has newlines replaced by
        spaces and carriage returns removed. For empty ``page_names`` an
        empty frame with columns ``pageid``, ``ns``, ``title``, ``extract``
        is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    # Nothing to look up: return an empty frame that still has the columns
    # callers merge on, instead of failing on the missing 'extract' column.
    if len(page_names) == 0:
        return pd.DataFrame(columns=['pageid', 'ns', 'title', 'extract'])

    api_url = "https://en.wikipedia.org/w/api.php"

    # Collect one dataframe per batch and concatenate once at the end
    # (DataFrame.append was removed in pandas 2.0 and is quadratic in a loop).
    frames = []
    for page_names_chunk in chunks(list=page_names, chunksize=query_size):

        # Passing the query as params lets requests URL-encode the titles, so
        # characters such as '&', '+' or '#' in a title cannot corrupt the URL.
        params = {
            'action': 'query',
            'prop': 'extracts',
            'format': 'json',
            'exintro': 1,  # boolean flag: only the section before the first heading
            'titles': '|'.join(page_names_chunk),
        }

        # API call
        response = requests.get(api_url, params=params, timeout=30)
        response.raise_for_status()

        # Unpack json (dict keyed by page id) and convert to one row per page
        pages = response.json()['query']['pages']
        frames.append(pd.DataFrame(pages).transpose())

    df_intro_texts_all = pd.concat(frames)

    # Pages without an extract (e.g. unknown titles) carry no 'extract' key; if
    # that holds for every page the column is missing altogether.
    if 'extract' not in df_intro_texts_all.columns:
        df_intro_texts_all['extract'] = None
        return df_intro_texts_all

    # Cleaning: flatten newlines and strip carriage returns inside the text.
    # Both steps must go through .str -- Series.replace only matches whole values.
    df_intro_texts_all['extract'] = (
        df_intro_texts_all['extract']
        .str.replace('\n', ' ', regex=False)
        .str.replace('\r', '', regex=False)
    )

    return df_intro_texts_all


def wikipedia_geosearch(lat, lon, radius_meters = 10000, max_results = 500, output_file=None):
    """Find Wikipedia articles near a coordinate and attach their intro texts.

    Runs a ``list=geosearch`` query against the English Wikipedia API and
    left-merges the intro texts returned by ``wikipedia_get_intro_texts`` into
    the matches (merge keys are the columns both frames share).

    Parameters
    ----------
    lat, lon : float
        Coordinates of the search centre, sent as ``gscoord``.
    radius_meters : int, default 10000
        Search radius, sent as ``gsradius``.
    max_results : int, default 500
        Maximum number of matches, sent as ``gslimit``.
    output_file : str or None, default None
        If given, the result is written to this path as a tab-separated file
        without the index.

    Returns
    -------
    pandas.DataFrame
        One row per matching article. Empty when the search has no matches.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    # Wikipedia geosearch query; requests takes care of URL-encoding the values
    params = {
        'action': 'query',
        'list': 'geosearch',
        'format': 'json',
        'gslimit': max_results,
        'gsradius': radius_meters,
        'gscoord': '{}|{}'.format(lat, lon),
    }

    # API call for pages matching location
    geo_results = requests.get("https://en.wikipedia.org/w/api.php", params=params, timeout=30)
    geo_results.raise_for_status()

    # Unpack json and convert to dataframe
    df_geo_results = pd.DataFrame(geo_results.json()['query']['geosearch'])

    # With zero matches the frame has no 'title' column, so skip the lookup
    if not df_geo_results.empty:
        # API call for intro texts of pages
        page_names = list(df_geo_results['title'])
        df_intro_texts = wikipedia_get_intro_texts(page_names)

        # Merge intro_texts into geo_results
        df_geo_results = df_geo_results.merge(df_intro_texts, how='left')

    # Write to file if needed; report success only after the write succeeded
    if output_file is not None:
        df_geo_results.to_csv(output_file, sep="\t", index=False)
        print("Results saved to file:", output_file)

    return df_geo_results


def visualize_map(df):
    """Plot the rows of ``df`` as clustered markers on a Folium map.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``lat``, ``lon`` and ``title`` columns.

    Returns
    -------
    folium.Map
        Map centred on the first row of ``df`` (zoom level 12) with one
        marker per row; each popup links to the English Wikipedia page named
        by ``title``.

    Raises
    ------
    ValueError
        If ``df`` has no rows, since there is no location to centre on.
    """
    if len(df) == 0:
        raise ValueError("Cannot visualize an empty dataframe: no start location available")

    # Start location: plain floats taken from the first row
    first_row = df.iloc[0]
    start_location = [float(first_row['lat']), float(first_row['lon'])]

    # Initialize map
    wiki_map = folium.Map(location=start_location, zoom_start=12)

    # Instantiate a marker cluster object for the locations in the dataframe
    locations = plugins.MarkerCluster().add_to(wiki_map)

    # Loop through the dataframe and add each data point to the marker cluster
    for lat, lon, label in zip(df['lat'], df['lon'], df['title']):
        label = str(label)
        # Percent-encode the title for the URL and escape it for the link text,
        # so quotes, ampersands etc. in a title cannot break the popup markup.
        page_url = 'https://en.wikipedia.org/wiki/' + quote(label.replace(' ', '_'))
        popup_string = '<a href="{}" target="_blank">{}</a>'.format(page_url, html.escape(label))

        folium.Marker(
            location=[lat, lon],
            icon=None,
            popup=folium.Popup(popup_string),
        ).add_to(locations)

    # Return the map so it is displayed as the cell output
    return wiki_map


def create_download_link(df, title = "Download CSV file", filename = "data.csv"):
    """Build an HTML download link that embeds ``df`` as a base64 CSV data URI.

    ``df.to_csv()`` is base64-encoded into the ``href`` of an anchor tag;
    ``title`` is the link text and ``filename`` the suggested download name.
    Returns an ``IPython.display.HTML`` object.
    """
    encoded_csv = base64.b64encode(df.to_csv().encode()).decode()
    link_template = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    return HTML(link_template.format(filename=filename, payload=encoded_csv, title=title))

## Retrieve data

In [3]:
# Geosearch around a fixed coordinate (default radius), capped at 200 matches,
# and save the result to file
df_wiki_locations = wikipedia_geosearch(
    lat=52.0894444,
    lon=5.1077981,
    max_results=200,
    output_file="wikipedia_utrecht.csv",
)

Results saved to file: wikipedia_utrecht.csv


## Convert to download format

In [4]:
# Constant-valued columns required by the data export format, in insertion order
export_constants = {
    'type': 9,
    'picture': "blank.jpg",
    'col7': "XYZ",
    'col9': "XYZ",
    'col10': "XYZ",
    'col11': "BLANK",
}
for column_name, constant in export_constants.items():
    df_wiki_locations[column_name] = constant

# Select the columns in the order of the export format
export_columns = ['type', 'pageid', 'title', 'picture', 'lon', 'lat', 'col7', 'extract', 'col9', 'col10', 'col11']
df_download = df_wiki_locations[export_columns]

# Render the download link as the cell output
create_download_link(df_download)

## Explore data

In [5]:
# Preview the first five matches
df_wiki_locations.head(n=5)

Unnamed: 0,dist,lat,lon,ns,pageid,primary,title,extract,type,picture,col7,col9,col10,col11
0,135.0,52.089167,5.109722,0,8332984,,Utrecht Centraal railway station,"<p class=""mw-empty-elt""> </p><p><br></p> <p><b...",9,blank.jpg,XYZ,XYZ,XYZ,BLANK
1,266.1,52.089722,5.111667,0,8539939,,Catharijne,<p><b>Catharijne</b> is a former municipality ...,9,blank.jpg,XYZ,XYZ,XYZ,BLANK
2,283.5,52.086944,5.108611,0,35615082,,Rabobank Bestuurscentrum,<p>The <b>Rabobank Bestuurscentrum</b> or <b>R...,9,blank.jpg,XYZ,XYZ,XYZ,BLANK
3,334.0,52.0868,5.10548,0,32706904,,Jaarbeurs,<p>The <b>Jaarbeurs</b> (Yearly Fair) is an ex...,9,blank.jpg,XYZ,XYZ,XYZ,BLANK
4,488.3,52.092222,5.113333,0,28276794,,TivoliVredenburg,<p>The <b>TivoliVredenburg</b> is a contempora...,9,blank.jpg,XYZ,XYZ,XYZ,BLANK


## Visualize on a map

In [6]:
# Render all matched locations as clustered markers
visualize_map(df=df_wiki_locations)

## TODO: retrieve images

In [7]:
# Experiment: list the images used on two example pages via prop=images
url = "https://en.wikipedia.org/w/api.php"

params = {
    'action':'query',
    'format':'json',
    'prop':'images',
    'titles':'Lunetten|Utrecht Centraal railway station'
}

# The context manager closes the session's pooled connections once the call
# is done; the original left the session open.
with requests.Session() as session:
    # NOTE: despite its name, `request` holds the response object
    request = session.get(url=url, params=params, timeout=30)
    request.raise_for_status()
    data = request.json()

# One row per page, indexed by the page keys of the JSON payload
results = pd.DataFrame(data['query']['pages']).transpose()

In [8]:
# Title of the third image listed for the page with id 3044682
# (presumably the 'Lunetten' page queried above -- confirm)
page_images = data['query']['pages']['3044682']['images']
page_images[2]['title']

'File:Map - NL - Utrecht - Wijk 07 Zuid - Subwijk Lunetten.svg'