In [1]:
import re
import requests
import pandas as pd
import numpy as np

#### Below is the wikipedia api call for a category search:

`http://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3A+machine+learning&cmlimit=max`

`action=query`: query the wikipedia api

`format=json`: return a json format

`list=categorymembers`: List of pages that belong to a given category, ordered by page sort title

`cmtitle=Category%3A+machine+learning`: title of category

`climit=max`: return up to the maximum amount of responses (500)

You may use this to get page titles from the wikipedia API. Things to watch out for:
* The responses contain categories
* You will want to fetch articles in those subcategories

The API's detailed documentation can be found [here](https://www.mediawiki.org/wiki/API:Main_page)

In [2]:
r = requests.get('http://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3A+machine+learning&cmlimit=max')

In [3]:
r.url

'https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3A+machine+learning&cmlimit=max'

In [4]:
r.status_code # 200 means it worked!

200

In [5]:
r.json().keys()

dict_keys(['batchcomplete', 'limits', 'query'])

In [6]:
r.json()['query']['categorymembers']

[{'ns': 0, 'pageid': 43385931, 'title': 'Data exploration'},
 {'ns': 0,
  'pageid': 49082762,
  'title': 'List of datasets for machine learning research'},
 {'ns': 0, 'pageid': 233488, 'title': 'Machine learning'},
 {'ns': 0, 'pageid': 53587467, 'title': 'Outline of machine learning'},
 {'ns': 0, 'pageid': 3771060, 'title': 'Accuracy paradox'},
 {'ns': 0, 'pageid': 43808044, 'title': 'Action model learning'},
 {'ns': 0, 'pageid': 28801798, 'title': 'Active learning (machine learning)'},
 {'ns': 0, 'pageid': 45049676, 'title': 'Adversarial machine learning'},
 {'ns': 0, 'pageid': 52642349, 'title': 'AIVA'},
 {'ns': 0, 'pageid': 30511763, 'title': 'AIXI'},
 {'ns': 0, 'pageid': 50773876, 'title': 'Algorithm selection'},
 {'ns': 0, 'pageid': 20890511, 'title': 'Algorithmic inference'},
 {'ns': 0, 'pageid': 49242352, 'title': 'AlphaGo'},
 {'ns': 0, 'pageid': 55572262, 'title': 'AlphaGo Zero'},
 {'ns': 0, 'pageid': 19463198, 'title': 'Apprenticeship learning'},
 {'ns': 0, 'pageid': 14003441,

In [7]:
query_df = pd.DataFrame(r.json()['query']['categorymembers'])
query_df

Unnamed: 0,ns,pageid,title
0,0,43385931,Data exploration
1,0,49082762,List of datasets for machine learning research
2,0,233488,Machine learning
3,0,53587467,Outline of machine learning
4,0,3771060,Accuracy paradox
5,0,43808044,Action model learning
6,0,28801798,Active learning (machine learning)
7,0,45049676,Adversarial machine learning
8,0,52642349,AIVA
9,0,30511763,AIXI


#### Make a function that formats a request for pages of a category

In [8]:
categories = ['Machine Learning', 'Business Software']

In [9]:
# use regex to replace name of category in the search string
#'http://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3A+machine+learning&cmlimit=max'
format_categories = [re.sub('\s', '_', cat).lower() for cat in categories] # replace spaces in category with +s so can insert into search string
#&cmtitle=Category%3A+machine+learning&

In [10]:
format_categories

['machine_learning', 'business_software']

#### Make a function that uses requests to execute the query and returns the json

In [11]:
def category_page_query(category):
    query_params = {
        "action": "query",
        "format": "json",
        "list": "categorymembers",
        "cmtype": "page",
        "cmlimit": "max",
        "cmtitle": "Category:{}".format(category)

    }

    query = requests.get('http://en.wikipedia.org/w/api.php', query_params)

    return query.json()

In [12]:
cat_queries = []

for cat in ['machine_learning']:
    tmp = category_page_query(cat)
    cat_queries.append(tmp)

In [13]:
cat_queries[0]['query']['categorymembers']

[{'ns': 0, 'pageid': 43385931, 'title': 'Data exploration'},
 {'ns': 0,
  'pageid': 49082762,
  'title': 'List of datasets for machine learning research'},
 {'ns': 0, 'pageid': 233488, 'title': 'Machine learning'},
 {'ns': 0, 'pageid': 53587467, 'title': 'Outline of machine learning'},
 {'ns': 0, 'pageid': 3771060, 'title': 'Accuracy paradox'},
 {'ns': 0, 'pageid': 43808044, 'title': 'Action model learning'},
 {'ns': 0, 'pageid': 28801798, 'title': 'Active learning (machine learning)'},
 {'ns': 0, 'pageid': 45049676, 'title': 'Adversarial machine learning'},
 {'ns': 0, 'pageid': 52642349, 'title': 'AIVA'},
 {'ns': 0, 'pageid': 30511763, 'title': 'AIXI'},
 {'ns': 0, 'pageid': 50773876, 'title': 'Algorithm selection'},
 {'ns': 0, 'pageid': 20890511, 'title': 'Algorithmic inference'},
 {'ns': 0, 'pageid': 49242352, 'title': 'AlphaGo'},
 {'ns': 0, 'pageid': 55572262, 'title': 'AlphaGo Zero'},
 {'ns': 0, 'pageid': 19463198, 'title': 'Apprenticeship learning'},
 {'ns': 0, 'pageid': 14003441,

#### Make a function that turns the json into a DataFrame

hint: you can't just make a DataFrame of the json

In [14]:
cat_queries[0]['query']['categorymembers']

[{'ns': 0, 'pageid': 43385931, 'title': 'Data exploration'},
 {'ns': 0,
  'pageid': 49082762,
  'title': 'List of datasets for machine learning research'},
 {'ns': 0, 'pageid': 233488, 'title': 'Machine learning'},
 {'ns': 0, 'pageid': 53587467, 'title': 'Outline of machine learning'},
 {'ns': 0, 'pageid': 3771060, 'title': 'Accuracy paradox'},
 {'ns': 0, 'pageid': 43808044, 'title': 'Action model learning'},
 {'ns': 0, 'pageid': 28801798, 'title': 'Active learning (machine learning)'},
 {'ns': 0, 'pageid': 45049676, 'title': 'Adversarial machine learning'},
 {'ns': 0, 'pageid': 52642349, 'title': 'AIVA'},
 {'ns': 0, 'pageid': 30511763, 'title': 'AIXI'},
 {'ns': 0, 'pageid': 50773876, 'title': 'Algorithm selection'},
 {'ns': 0, 'pageid': 20890511, 'title': 'Algorithmic inference'},
 {'ns': 0, 'pageid': 49242352, 'title': 'AlphaGo'},
 {'ns': 0, 'pageid': 55572262, 'title': 'AlphaGo Zero'},
 {'ns': 0, 'pageid': 19463198, 'title': 'Apprenticeship learning'},
 {'ns': 0, 'pageid': 14003441,

In [16]:
machine_learning_df.tail()

Unnamed: 0,ns,pageid,title
193,0,39945557,Validation set
194,0,43502368,Vanishing gradient problem
195,0,7578809,Version space learning
196,0,52992310,VGG Image Annotator
197,0,47527969,Word2vec


In [44]:
from time import sleep

#### Extra: Build out methods that take any subcategories and get the articles in those.

Hint: recursion

In [45]:
def category_page_query(category):
    query_params = {
        "action": "query",
        "format": "json",
        "list": "categorymembers",
        "cmtype": "page",
        "cmlimit": "max",
        "cmtitle": "Category:{}".format(category)

    }

    query = requests.get('http://en.wikipedia.org/w/api.php', query_params)

    return query.json()

def category_subcat_query(category):
    query_params = {
        "action": "query",
        "format": "json",
        "list": "categorymembers",
        "cmtype": "subcat",
        "cmlimit": "max",
        "cmtitle": "Category:{}".format(category)

    }

    query = requests.get('http://en.wikipedia.org/w/api.php', query_params)

#     print(query.url)
    return query.json()

def clean_cat_text(category):
    cat_text = category.replace(' ', '_')
    cat_text = cat_text.replace("Category:", "")
    cat_text = cat_text.lower()
    
    return cat_text

In [46]:
def get_category_pages(category):
    
#     print("Working on", category)
    pagelist = []
    clean_category = clean_cat_text(category)
    
    #get_subcategories from current category
    subcat_response = category_subcat_query(clean_category)
    
    n_sub = len(subcat_response['query']['categorymembers'])
    sub_df = pd.DataFrame(subcat_response['query']['categorymembers'])
    
    # if there are subcategories, clean the text and recursively call get_category_pages
    if n_sub >= 1:
        for sub in sub_df['title']:
            sleep(0.1)
            sub_clean = clean_cat_text(sub)
            pagelist.extend(get_category_pages(sub_clean))
    
    #the the pages info from the current category 
    page_response = category_page_query(clean_category)
    
    pagelist.extend(page_response['query']['categorymembers'])
        
        
    return pagelist

In [47]:
category_list = ['Machine Learning', 'Business Software', 'Computer Science', 'Games', 'Sports']
    
pages_df = pd.DataFrame(columns=['ns', 'pageid', 'title'])

for category in category_list:
    tmp_list = get_category_pages(category)
    tmp_df = pd.DataFrame(tmp_list)
    pages_df = pages_df.append(tmp_df)
    sleep(2)
    
pages_df = pages_df.drop_duplicates()

KeyboardInterrupt: 

In [49]:
pages_df.shape
pages_df.to_csv('data/list_of_pages.csv', index=False)

(1623, 3)