In [1]:
import requests
import wikipedia
import pandas as pd
import numpy as np

import re

https://www.mediawiki.org/wiki/API:Main_page

## CRUD

| | SQL | RESTful API |
|:-:|:-:|:-:|
| create | `INSERT` | `POST` |
| read | `SELECT` | `GET` |
| update | `UPDATE` | `PUT` |
| delete | `DELETE` | `DELETE` |

![](http://interactive.blockdiag.com/image?compression=deflate&encoding=base64&src=eJxLyslPzk7JTExXqOZSUFAPcnV0UUgrys9VCM_MzixIBcqoK-jaKRSlFpfmlBSDmOrlRZklqQol-QoF-cUl6UAZdWuuWgBXuhe4)

## Query Wikipedia

## API Tutorial (mediawiki API)



## Raw Get on Wikipedia

In [2]:
def category_format( category):
    category_query = re.sub( '\s', '+', category)
    return category_query
    

In [3]:
def request_category( category, ):  ##  exclude
    
    category_query = category_format( category)
    
    base_url = 'https://en.wikipedia.org/w/api.php'
    action_tag = "?action=query&list=categorymembers&cmlimit=max" ## fetch all category members (pages, subcategories)
    category_tag = '&cmtitle=Category:{}'.format( category) ## append category to cat_tag
    parameters_tag = "&format=json&prop=info|categories|links" ## return in json format
    request_call = base_url + action_tag + category_tag + parameters_tag ## concatenate base_url with request tags

    r = requests.get( request_call)  ## request HTTP results
    response = r.json()
    category_pages_df = pd.DataFrame( response['query']['categorymembers'] )
    
    return category_pages_df
    
    
    

In [5]:
category_pages_df = request_category( 'machine learning')
sub_category_mask = category_pages_df.title.str.contains('Category:')  ## row mask for only sub-categories
category_pages_df.head()

Unnamed: 0,ns,pageid,title
0,0,43385931,Data exploration
1,0,49082762,List of datasets for machine learning research
2,0,233488,Machine learning
3,0,53587467,Outline of machine learning
4,0,53198248,Singular statistical model


In [7]:
category_pages_df[ ~sub_category_mask].head()

Unnamed: 0,ns,pageid,title
0,0,43385931,Data exploration
1,0,49082762,List of datasets for machine learning research
2,0,233488,Machine learning
3,0,53587467,Outline of machine learning
4,0,53198248,Singular statistical model


In [6]:
## sub-categories

category_pages_df[ sub_category_mask].head()

Unnamed: 0,ns,pageid,title
198,14,33547387,Category:Applied machine learning
199,14,42936114,Category:Artificial neural networks
200,14,1718975,Category:Bayesian networks
201,14,1991254,Category:Classification algorithms
202,14,22532673,Category:Cluster analysis


In [18]:
def gather_articles( category ):
    '''Collect all articles that belong to a category, including articles found in sub-categories of category
        that are not already included under the category explicitly'''
    category_pages_df = request_category( category)  ## load dataframe of pages contained in category
    sub_category_mask = category_pages_df.title.str.contains('Category:')  ## row mask for only sub-categories
    
    pages_df_list = []
    pages_df = category_pages_df[ ~sub_category_mask]
    pages_df_list.append( pages_df)
    
    sub_categories = category_pages_df[ sub_category_mask].title.str.replace( 'Category:', '').tolist()    ## Create list of all sub-category names ( Category: <name>), remove the preface                
    n_sub_categories = sum( sub_category_mask)  ## Number of sub-categories belong to category
    
    if n_sub_categories > 0:
        for sub_category in sub_categories:  ## recursively, one at a time     
            
            pages_df_list.append( gather_articles( sub_category ) )
            

    pages_df = pd.concat( pages_df_list)
    pages_df.reset_index()
    return pages_df
    
    
    #pages_df_list.append(  category_pages_df[~sub_category_mask].pageid.apply( grab_content ))
    
    

In [None]:
#pages_df.loc[:, 'content'] = pages_df.pageid.apply( grab_content)

In [19]:
ml_articles = gather_articles( 'machine learning')

In [10]:
ml_articles.shape

(1606, 3)

In [21]:
ml_articles

Unnamed: 0,ns,pageid,title
0,0,43385931,Data exploration
1,0,49082762,List of datasets for machine learning research
2,0,233488,Machine learning
3,0,53587467,Outline of machine learning
4,0,53198248,Singular statistical model
5,0,3771060,Accuracy paradox
6,0,43808044,Action model learning
7,0,28801798,Active learning (machine learning)
8,0,45049676,Adversarial machine learning
9,0,52642349,AIVA


In [None]:
sub_category_mask = category_pages_df.title.str.contains( 'Category:')
category_pages_df[ sub_category_mask].title.replace( 'Category:', '').tolist()

len( category_pages_df[ ~sub_category_mask].pageid.tolist()), len(category_pages_df[ ~sub_category_mask].pageid.unique() )





In [None]:
def grab_content( page_id):
    try:
        page_content = wikipedia.WikipediaPage(pageid = page_id).content
    except: 
        page_content = ''
    return page_content
    

In [None]:
def request_category( category, sub_category = False):  ## return
    '''Request from wikipedia API for category pages (articles, subcategories) 
            return:  JSON object with category pages  '''
    if sub_category:
        cat_tag = '&cmtitle=Category:' + sub_category ## append category to cat_tag
    else: 
        cat_tag = '&cmtitle=Category:' + category ## append category to cat_tag
    
    base_url = 'https://en.wikipedia.org/w/api.php'
    action_tag = "?action=query&list=categorymembers&cmlimit=max" ## fetch all category members (pages, subcategories)
    #category_tag =  cat_tag#'&cmtitle=Category:' + category ## append category to cat_tag
    parameters_tag = "&format=json&prop=info|categories|links" #&prop=categories|links|info" ## return in json format
    request_call = base_url + action_tag + cat_tag + parameters_tag ## concatenate base_url with request tags
    request = requests.get( request_call)  ## request HTTP results
    
    category_pages = request.json()['query']['categorymembers']  ## list object containing category pages ( articles, sub-categories)
    
    pages = {}
    for i, page_info in enumerate( category_pages):
        page_id = page_info['pageid']
        page_title = page_info['title']
        if sub_category:
            pages[i] ={'category': category, 'sub-category': sub_category, 'pageid': page_id, 'title': page_title, 'content':np.nan } 
        else:
            pages[i] ={'category': category, 'sub-category': None, 'pageid': page_id, 'title': page_title, 'content':np.nan } 
    
    pages_df = pd.DataFrame.from_dict( pages, orient = 'index')
    
    pages_df['title']
    
    return pages_df
    
    

In [None]:
def gather_content( page_df, condition = True):
    
    category = page_df.category.unique()[0]

    sub_category_mask = page_df.title.str.contains( 'Category:')  ## row mask 
    
    new_articles_mask = ~sub_category_mask & page_df.content.isnull()
     
    sub_category_indices = page_df[sub_category_mask].index.tolist()
    ## gather all the article content for the each page in the category, excluding the sub_categories
    page_df.loc[new_articles_mask, 'content'] = page_df[ new_articles_mask ].apply( lambda x: grab_content( x.pageid), axis = 1 )
    
    n_sub_categories = sum( sub_category_mask)
    if n_sub_categories == 0:
        condition = False
        return page_df
    
    
    while condition:
        
        subCat_indice = sub_category_indices[0] ## grab the first one
        
        
        subCat = page_df.iloc[subCat_indice,:]['title'].split('Category:')[1] # subCat = 
        
        page_df = page_df.append( request_category( category, subCat ), ignore_index= True)
        
        
        page_df.drop( page_df.index[ subCat_indice], inplace = True )  ## Remove the original page
        
        gather_content( page_df)
        
        

In [None]:
page_df = request_category('Machine_learning')


In [None]:
page_df.head()

In [None]:
page_df.tail(31)

In [None]:
category_df = gather_content( page_df)

In [None]:
page_df.head()

In [None]:
page_df.category.unique()[0]

In [None]:
arr[0]

In [None]:
sub_category_mask = page_df.title.str.contains( 'Category:')

In [None]:
page_df.shape, sum( sub_category_mask), sum( ~sub_category_mask)

In [None]:
sub_category_mask = page_df_test.title.str.contains( 'Category:')

sum(~sub_category_mask), page_df_test.content.isnull().sum()

In [None]:
new_articles_mask = ~sub_category_mask & page_df_test.content.isnull()
sum(new_articles_mask)

In [None]:
sub_category_mask = page_df.title.str.contains( 'Category:')

new_articles_mask = ~sub_category_mask & page_df.content.isnull()
    
n_articles = sum( ~sub_category_mask)
n_sub_categories = sum( sub_category_mask)

#print( '{} requests to be made'.format( n_articles * n_sub_categories) )
## First gather all the article content for the each page in the category
page_df.loc[~sub_category_mask, 'content'] = page_df[ ~sub_category_mask ].apply( lambda x: grab_content( x.pageid), axis = 1 )
    

In [None]:
page_df.tail(31)

In [None]:
category = page_df.category.unique()[0]
category

In [None]:
sub_category_indices = page_df[sub_category_mask].index.tolist()



subCat_indice = sub_category_indices[0] ## grab the first one
print(subCat_indice)

In [None]:
subCat = page_df.iloc[subCat_indice,:]['title'].split('Category:')[1] # subCat = 
print( subCat)

In [None]:
sub_category_pages_df = request_category( category, subCat )

page_df = page_df.append( sub_category_pages_df, ignore_index= True)



In [None]:
page_df.drop( page_df.index[262:], inplace = True)

In [None]:
page_df.tail(65)

In [None]:
print( page_df.loc[ subCat_indice, :])


page_df.drop( page_df.index[ subCat_indice], inplace = True )  ## Remove the original page
page_df.tail(65)

In [None]:
sub_category_mask = page_df.title.str.contains( 'Category:')  ## row mask 

In [None]:
sum(sub_category_mask)

In [None]:

new_articles_mask = ~sub_category_mask & page_df.content.isnull()

In [None]:
page_df[new_articles_mask]

In [None]:
page_df.content.isnull()

In [None]:
category = page_df.category.unique()[0]

sub_category_indices = df[sub_category_mask].index.tolist()

subCat_indice = sub_category_indices[0] ## grab the first one
print(subCat_indice)


subCat = page_df.iloc[subCat_indice,:]['title'].split('Category:')[1] # subCat = 
print( subCat)
request_category( category, subCat )

#page_df = page_df.append( request_category( category, subCat ), ignore_index= True) # 

#print( page_df.loc[ subCat_indice, :])

#page_df.tail()
#page_df.drop( page_df.index[ sub_Cat_indice] )  ## Remove the original page


In [None]:
page_df

In [None]:
page_df.iloc[455,:]

In [None]:
page_df[ page_df['sub-category'] == 'Applied machine learning']

In [None]:
page_df['sub-category'].isnull().sum()

In [None]:
page_df.shape

In [None]:
page_df.iloc[198,:]

In [None]:
n_sub_categories


In [None]:
for i in range( ):
    page_df.append( request_category( page_df.category.unique()[0], df[sub_category_mask].iloc[i,:]['title'].split('Category:')[1]), ignore_index= True)            
    # page_df.drop()  ## remove original page

In [None]:
sub_category_indices = df[sub_category_mask].index.tolist()

subCat_indice = sub_category_indices[0] ## grab the first one

subCat = page_df.iloc[subCat_indice,:]['title'].split('Category:')[1] # subCat = 
        
page_df.append( request_category( category, subCat ), ignore_index= True)


page_df.drop( page_df.index[ sub_Cat_indice] )  ## Remove the original page


In [None]:
grab_content( 39945557)

In [None]:
~sub_category_mask

In [None]:



df.loc[~sub_category_mask, 'content'] = df[ ~sub_category_mask ].apply( lambda x: grab_content( x.pageid), axis = 1 )
#sub_df.apply( lambda x: grab_content(x.pageid), axis = 1) # grab_content(

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df[sub_category_mask]

In [None]:
df[sub_category_mask]

In [None]:
df[sub_category_mask].apply( lambda x: request_category('Machine_learning', x.title.split('Category:')[1]), axis = 1 )#, axis = 1)
#

In [None]:
df.append(request_category( 'Machine_learning', df[sub_category_mask].apply( lambda x: x.title.split('Category:')[1]), ignore_index= True )

In [None]:
df.append(request_category( 'Machine_learning', df[sub_category_mask].iloc[0,:]['title'].split('Category:')[1]), ignore_index= True)            

In [None]:
df[sub_category_mask].apply(  lambda x: request_category( x['title'].split('Category:')[1]), axis = 1)   # 
## 
                        
                        
                        
                        

In [None]:
df[sub_category_mask]

In [None]:
sub_category_mask = df.title.str.contains( 'Category:')


article_df = df[ ~sub_category_mask ]

In [None]:
sub_df = article_df.loc[:10,:]

In [None]:
sub_df

In [None]:
sub_df.loc[:, 'content'] = sub_df.apply( lambda x: grab_content(x.pageid), axis = 1) # grab_content(

In [None]:
sub_df.head()

In [None]:
ml_json = request_category('Machine_learning').json()

ml_pages  = ml_json['query']['categorymembers']  ## List object

In [None]:


ml_pages  = ml_json['query']['categorymembers']  ## List object
N_pages = len(ml_pages)

In [None]:
ml_pages[-1]['title'].split('Category:')[1]

In [None]:
pages = {}

In [None]:
ml_json['query'].keys()

In [None]:
pages = {}
for i, page_info in enumerate( category_pages):
    page_id = page_info['pageid']
    page_title = page_info['title']
    
    pages[i] ={'pageid': page_id, 'title': page_title, 'content': page_content, 'category': category} 
    
pages_df = pd.DataFrame.from_dict( pages, orient = 'index')

    

In [None]:
def gather_content( category, category_pages):
    
    request_category( category).json()
    

    pages = {}

    for i, page_info in enumerate( category_pages):

        page_type = page_info['ns']
        page_id = page_info['pageid']
        page_title = page_info['title']

        if page_type == 0:  ## category
            
            page_content = wikipedia.WikipediaPage(pageid = page_id).content

            pages[i] ={'pageid': page_id, 'title': page_title, 'conent': page_content, 'category':} 
            
        else:  ## sub-category
            sub_category = page_title.split('Category:')[1]
            
            sub_category_pages = request_category( sub_category).json()['query']['categorymembers']
            
            gather_content( category, sub_category_pages)
            
        
        
    

In [None]:
def gather_content( category, category_pages):
    
    

    pages = {}

    for i, page_info in enumerate( ml_pages[0:10]):

        page_type = page_info['ns']
        page_id = page_info['pageid']
        page_title = page_info['title']

        if page_type == 14:  ## sub-category
            
            sub_category = page_title.split('Category:')[1]
            
            sub_category_json = request_category( sub_category).json()
            
            
            

        #else:  ## article
            #page_content = wikipedia.WikipediaPage(pageid = page_id).content

            #pages[i] ={'pageid': page_id, 'title': page_title, 'conent': page_content, 'category':} 
        
#pages        
        
        
        
    

In [None]:
pages

In [None]:
pd.DataFrame.from_dict( pages, orient = 'index')

In [None]:
##Get all the pages in a Category

catJson = ml.json() ## dict
catJson.keys()



In [None]:
catJson

In [None]:
catPages = catJson['query']['categorymembers']
print( len( catPages))
print( catPages[0:5], sep = '\n' )

In [None]:
catPages

In [None]:
articles = {}

subcats = {}

for i, page in enumerate( catPages):
    
    pageid = page['pageid']
    title = page['title']
    
    if 'Category' not in title:

    
    

In [None]:
pd.DataFrame()

In [None]:
#wikipedia.WikipediaPage(pageid = 50222574).links
wikipedia.WikipediaPage(pageid = 39945557)

In [None]:
base_url = 'https://en.wikipedia.org/w/api.php'

In [None]:
action = "?action=query&list=categorymembers&cmlimit=max" ## &generator=allpages |allpages
#action = "?action=mobileview"
parameters = "&format=json&prop=categories|links|info" #  " # &sections=all &rvprop=content
category = '&cmtitle=Category:'
#titles = "&titles="
#page = "&page="

In [None]:
#category = "Machine_learning"

url = base_url + action + parameters + category + 'Category:Machine_learning'


In [None]:
ml = requests.get( url)

In [None]:
ml.json()