In [3]:
import re
import pandas as pd
import requests

In [1]:
def generate_category(category):
    '''
    format a category for insertion in to a wikipedia api call
    '''
    category = re.sub('\s','+',category)
    return category

In [4]:
def generate_query(category):
    '''
    Format an api call for requests
    '''
    query = """
            http://en.wikipedia.org/w/api.php?
            action=query&
            format=json&
            list=categorymembers&
            cmtitle=Category:{}& 
            cmlimit=max
            """.format(generate_category(category))
    query = re.sub('\s','',query)
    return query

In [7]:
def execute_category_query(category):
    '''
    Executes a category qeury and returns a 
    DataFrame of the category members
    '''
    
    r = requests.get(generate_query(category))
    response = r.json()
    return pd.DataFrame(response['query']['categorymembers'])

In [8]:
test = execute_category_query('machine learning')

In [10]:
test

Unnamed: 0,ns,pageid,title
0,0,43385931,Data exploration
1,0,49082762,List of datasets for machine learning research
2,0,233488,Machine learning
3,0,53587467,Outline of machine learning
4,0,53198248,Singular statistical model
5,0,3771060,Accuracy paradox
6,0,43808044,Action model learning
7,0,28801798,Active learning (machine learning)
8,0,45049676,Adversarial machine learning
9,0,52642349,AIVA


In [11]:
category_mask = test['title'].str.contains('Category:')

In [14]:
test[~category_mask].head()

Unnamed: 0,ns,pageid,title
0,0,43385931,Data exploration
1,0,49082762,List of datasets for machine learning research
2,0,233488,Machine learning
3,0,53587467,Outline of machine learning
4,0,53198248,Singular statistical model


In [15]:
def remove_category(category):
    category = re.sub('Category:', '', category)
    return category

In [17]:
test[category_mask]['title'].apply(remove_category)[:10]

198                     Applied machine learning
199                   Artificial neural networks
200                            Bayesian networks
201                    Classification algorithms
202                             Cluster analysis
203                Computational learning theory
204          Artificial intelligence conferences
205    Data mining and machine learning software
206                 Datasets in machine learning
207                          Dimension reduction
Name: title, dtype: object

In [18]:
subcat_df_list = []


In [19]:
categories_to_query = test[category_mask]['title'].apply(remove_category).tolist()

In [20]:
categories_to_query

['Applied machine learning',
 'Artificial neural networks',
 'Bayesian networks',
 'Classification algorithms',
 'Cluster analysis',
 'Computational learning theory',
 'Artificial intelligence conferences',
 'Data mining and machine learning software',
 'Datasets in machine learning',
 'Dimension reduction',
 'Ensemble learning',
 'Evolutionary algorithms',
 'Genetic programming',
 'Inductive logic programming',
 'Kernel methods for machine learning',
 'Latent variable models',
 'Learning in computer vision',
 'Log-linear models',
 'Loss functions',
 'Machine learning algorithms',
 'Machine learning portal',
 'Machine learning task',
 'Markov models',
 'Machine learning researchers',
 'Semisupervised learning',
 'Statistical natural language processing',
 'Structured prediction',
 'Supervised learning',
 'Support vector machines',
 'Unsupervised learning']

In [21]:
for category in categories_to_query:
    subcat_df_list.append(execute_category_query(category))

In [22]:
subcat_df_list

[    ns    pageid                                   title
 0    0  15795950                    Activity recognition
 1    0  41916168                              AlchemyAPI
 2    0  53631046                        Caffe (software)
 3    0  49119569    Comparison of deep learning software
 4    0  41916447                                 Cortica
 5    0  34529351                      DARPA LAGR Program
 6    0  43169442                          Deeplearning4j
 7    0  38818825                                 Diffbot
 8    0  41184517                            Google Brain
 9    0  46222904                         Intel RealSense
 10   0  35456221                                 IRCF360
 11   0    705605                             Jabberwacky
 12   0  31663650                                  Kaggle
 13   0  51650259                                   Keras
 14   0  23048428                                  Kinect
 15   0  23762260  Language Acquisition Device (computer)
 16   0  48976

In [45]:
def get_all_pages_rec(category):
    category_df = execute_category_query(category)
    pages_list = []
    category_mask = category_df['title'].str.contains('Category:')
    
    pages_df = category_df[~category_mask]
    pages_list.append(pages_df)
    
    categories = category_df[category_mask]['title']\
                            .str.replace('Category:','').tolist()
    if len(categories) > 0:
        for cat in categories:
            pages_list.append(get_all_pages_rec(cat))
    
    pages_df = pd.concat(pages_list)
    pages_df.reset_index()
    return pages_df

In [46]:
rec_test = get_all_pages_rec('machine learning')

In [41]:
rec_test.sample(5)

Unnamed: 0,ns,pageid,title
93,0,41370976,Kernel embedding of distributions
154,0,37697003,Random indexing
74,0,10711453,Long short-term memory
2,0,14179835,Activation function
27,0,43932548,Random projection


In [40]:
rec_test.shape

(1606, 3)

In [39]:
rec_test.drop_duplicates().shape

(1104, 3)

In [42]:
get_all_pages_rec('business software')

KeyError: 'title'

In [43]:
def get_whole_category (category):
    df = get_all_pages_rec(category)
    df = df.drop_duplicates().reset_index(drop=True)
    df['category'] = category
    return df

In [44]:
gwc_test = get_whole_category('machine learning')