In [18]:
import re
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import requests
import wikipedia

In [10]:
def generate_category( category):
    '''
    Format a category name for insertion into a Wikipedia API call.

    Every whitespace character becomes '+', and any character that has a
    special meaning inside a URL query string (for example '&', '+', '#')
    is percent-encoded so it cannot break or truncate the query.

    Parameters
    ----------
    category : str
        Human-readable category name, e.g. 'machine learning'.

    Returns
    -------
    str
        The encoded name, e.g. 'machine+learning'.
    '''
    # Map tabs/newlines to plain spaces first so quote_plus turns every
    # whitespace character into '+', one-for-one.
    normalized = re.sub(r'\s', ' ', category)
    return quote_plus(normalized)

In [11]:
# Spot-check the helper on a two-word category name.
generate_category('machine learning')

'machine+learning'

In [53]:
# Prototype of the category-members API URL: the template is written one
# parameter per line, then all whitespace is stripped to form a single URL.
query = """
        http://en.wikipedia.org/w/api.php?
        action=query&
        format=json&
        list=categorymembers&
        cmtitle=Category:{}&
        cmlimit=max
    """.format( generate_category( 'machine learning'))
# Raw string: '\s' in a normal string literal is an invalid escape sequence.
re.sub(r'\s', '', query)

'http://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category:machine+learning&cmlimit=max'

In [12]:
def generate_query( category):
    '''
    Format the API call (as a URL string) that lists a category's members.

    Parameters
    ----------
    category : str
        Category name without the 'Category:' prefix; it is passed through
        generate_category before being inserted into the URL.

    Returns
    -------
    str
        A single-line URL with no whitespace.
    '''
    # The template is laid out one parameter per line for readability only.
    query = """
        http://en.wikipedia.org/w/api.php?
        action=query&
        format=json&
        list=categorymembers&
        cmtitle=Category:{}&
        cmlimit=max
    """.format( generate_category( category))
    # Strip the layout whitespace. Raw string: '\s' in a normal string
    # literal is an invalid escape sequence.
    query = re.sub(r'\s', '', query)
    return query

In [13]:
# Spot-check: build the full API URL for a sample category.
generate_query('machine learning' )

'http://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category:machine+learning&cmlimit=max'

In [20]:
def execute_category_query(category):
    '''
    Executes a category query and returns a
    DataFrame of the category members.

    The API returns members in batches. Whenever a response carries a
    'continue' object, its values are sent back as extra parameters to fetch
    the next batch, so the result is not silently capped at one batch.

    Parameters
    ----------
    category : str
        Category name without the 'Category:' prefix.

    Returns
    -------
    pandas.DataFrame
        One row per member with columns 'ns', 'pageid' and 'title'. The
        frame is empty, but still has those columns, when the category has
        no members.

    Raises
    ------
    requests.HTTPError
        If the server answers with an HTTP error status.
    RuntimeError
        If the API response contains an 'error' object.
    '''
    url = generate_query(category)
    members = []
    continuation = {}

    while True:
        # A timeout keeps a stalled connection from hanging the notebook.
        r = requests.get(url, params=continuation, timeout=30)
        r.raise_for_status()
        response = r.json()

        if 'error' in response:
            raise RuntimeError(
                'Wikipedia API error for category {!r}: {}'.format(
                    category, response['error']))

        members.extend(response.get('query', {}).get('categorymembers', []))

        # No 'continue' object means this was the last batch.
        continuation = response.get('continue')
        if not continuation:
            break

    # Fixing the columns keeps `.title` usable on an empty result.
    return pd.DataFrame(members, columns=['ns', 'pageid', 'title'])

In [21]:
# Fetch the members of the 'machine learning' category (performs an HTTP request).
test =execute_category_query('machine learning')

In [22]:
# Show the fetched category members.
test

Unnamed: 0,ns,pageid,title
0,0,43385931,Data exploration
1,0,49082762,List of datasets for machine learning research
2,0,233488,Machine learning
3,0,53587467,Outline of machine learning
4,0,53198248,Singular statistical model
5,0,3771060,Accuracy paradox
6,0,43808044,Action model learning
7,0,28801798,Active learning (machine learning)
8,0,45049676,Adversarial machine learning
9,0,52642349,AIVA


In [None]:
## Issue: sub-categories are returned alongside articles, with titles prefixed by 'Category:'

In [23]:
# Sub-category rows are the ones whose title starts with the 'Category:'
# prefix; a prefix test avoids matching the text elsewhere in a title.
sub_category_mask = test.title.str.startswith('Category:')

In [26]:
# Preview the rows flagged as sub-categories.
test.loc[sub_category_mask].head()

Unnamed: 0,ns,pageid,title
198,14,33547387,Category:Applied machine learning
199,14,42936114,Category:Artificial neural networks
200,14,1718975,Category:Bayesian networks
201,14,1991254,Category:Classification algorithms
202,14,22532673,Category:Cluster analysis


In [25]:
# Preview the remaining rows, i.e. the articles.
test.loc[~sub_category_mask].head()

Unnamed: 0,ns,pageid,title
0,0,43385931,Data exploration
1,0,49082762,List of datasets for machine learning research
2,0,233488,Machine learning
3,0,53587467,Outline of machine learning
4,0,53198248,Singular statistical model


In [27]:
def remove_category( category):
    '''
    Strip a leading 'Category:' prefix from a title.

    Only a prefix at the very start is removed; the same text appearing
    later in the title is left untouched. Titles without the prefix are
    returned unchanged.
    '''
    prefix = 'Category:'
    if category.startswith(prefix):
        return category[len(prefix):]
    return category

In [29]:
## List of sub_categories

# Select the title column and map over it. A row-wise apply(axis=1) can
# return a DataFrame (which has no .tolist()) when no rows are selected.
categories_to_query = test.loc[sub_category_mask, 'title'].map(remove_category).tolist()

categories_to_query

['Applied machine learning',
 'Artificial neural networks',
 'Bayesian networks',
 'Classification algorithms',
 'Cluster analysis',
 'Computational learning theory',
 'Artificial intelligence conferences',
 'Data mining and machine learning software',
 'Datasets in machine learning',
 'Dimension reduction',
 'Ensemble learning',
 'Evolutionary algorithms',
 'Genetic programming',
 'Inductive logic programming',
 'Kernel methods for machine learning',
 'Latent variable models',
 'Learning in computer vision',
 'Log-linear models',
 'Loss functions',
 'Machine learning algorithms',
 'Machine learning portal',
 'Machine learning task',
 'Markov models',
 'Machine learning researchers',
 'Semisupervised learning',
 'Statistical natural language processing',
 'Structured prediction',
 'Supervised learning',
 'Support vector machines',
 'Unsupervised learning']

In [31]:
## One DataFrame of member pages per sub-category, in the same order as
## categories_to_query.
subcat_df_list = [
    execute_category_query(category) for category in categories_to_query
]

In [33]:
# Inspect the members fetched for one sub-category.
# NOTE(review): the saved output lists artificial-neural-network pages, which
# matches index 1 of categories_to_query rather than 2 -- the output may be
# stale; re-run to confirm.
subcat_df_list[2]

Unnamed: 0,ns,pageid,title
0,0,21523,Artificial neural network
1,0,28016652,Types of artificial neural networks
2,0,14179835,Activation function
3,0,8220913,ADALINE
4,0,31663887,Adaptive neuro fuzzy inference system
5,0,3056879,Adaptive resonance theory
6,0,4231161,ALOPEX
7,0,16167377,Artificial Intelligence System
8,0,349771,Artificial neuron
9,0,51404222,Artisto


In [34]:
def get_all_pages_recursively( category):
    '''
    Collect the pages of a category and of all of its nested sub-categories.

    The category tree is walked depth-first, in the order the members are
    listed. Each category is fetched at most once, so a sub-category that is
    reachable through several parents -- or that links back to one of its
    own ancestors -- cannot cause repeated work or an endless crawl.

    Parameters
    ----------
    category : str
        Name of the root category, without the 'Category:' prefix.

    Returns
    -------
    pandas.DataFrame
        All non-category members found, with a fresh 0..n-1 index. The same
        page can still appear more than once if it belongs to several of the
        visited categories.
    '''
    prefix = "Category:"
    visited = set()          ## normalised names of categories already fetched
    pages_list = []          ## one DataFrame of pages per visited category
    pending = [category]     ## stack of categories still to fetch

    while pending:
        current = pending.pop()

        ## Treat names that differ only by underscores or first-letter case
        ## as the same category (the root is usually typed in lower case).
        key = current.replace('_', ' ').strip()
        key = key[:1].upper() + key[1:]
        if key in visited:
            continue
        visited.add(key)

        category_df = execute_category_query( current)
        if category_df.empty:
            ## nothing to split; also avoids `.title` on a column-less frame
            continue

        category_mask = category_df.title.str.startswith(prefix)
        pages_list.append( category_df[ ~category_mask])   ## articles

        sub_categories = category_df.loc[ category_mask, 'title'].str[len(prefix):].tolist()
        ## Push in reverse so the first listed sub-category is popped first.
        pending.extend( reversed( sub_categories))

    if not pages_list:
        return pd.DataFrame( columns=['ns', 'pageid', 'title'])

    ## reset_index returns a new frame; its result must be the return value.
    return pd.concat( pages_list).reset_index( drop = True)

In [35]:
# Crawl 'machine learning' and its nested sub-categories (issues HTTP requests for every category visited).
rec_test = get_all_pages_recursively('machine learning' )

In [41]:
# (rows, columns) of the crawl result; the index plays no part in the shape.
rec_test.shape

(1606, 3)

In [49]:
# Drop rows that are exact duplicates (presumably pages reached through more
# than one sub-category) and renumber the index from 0; rec_test is overwritten.
rec_test = rec_test.drop_duplicates().reset_index(drop = True)

In [50]:
# Show the pages tagged with the crawled category. assign() returns a new
# frame, so the displayed 'category' column is reproducible on a fresh run
# and re-running this cell never mutates rec_test.
rec_test.assign(category='machine learning').head(10)

Unnamed: 0,ns,pageid,title,category
0,0,43385931,Data exploration,machine learning
1,0,49082762,List of datasets for machine learning research,machine learning
2,0,233488,Machine learning,machine learning
3,0,53587467,Outline of machine learning,machine learning
4,0,53198248,Singular statistical model,machine learning
5,0,3771060,Accuracy paradox,machine learning
6,0,43808044,Action model learning,machine learning
7,0,28801798,Active learning (machine learning),machine learning
8,0,45049676,Adversarial machine learning,machine learning
9,0,52642349,AIVA,machine learning


In [None]:
def get_whole_category( category):
    '''
    Return every page found under a category as one tidy DataFrame.

    The pages are gathered with get_all_pages_recursively, exact duplicate
    rows are dropped, the index is renumbered from 0, and a 'category'
    column holding the requested category name is added.
    '''
    pages = get_all_pages_recursively( category)
    return (
        pages
        .drop_duplicates()
        .reset_index( drop = True)
        .assign( category = category)
    )

In [None]:
# Build the de-duplicated, category-tagged page table for 'machine learning'.
df = get_whole_category( 'machine learning')

In [None]:
##get_all_pages_recursively('cooking' )