In [None]:
!pip install wikipedia

In [4]:
import requests
import wikipedia
import pandas as pd
import numpy as np

import re

Reference: [MediaWiki Action API documentation](https://www.mediawiki.org/wiki/API:Main_page)

## CRUD

| | SQL | RESTful API |
|:-:|:-:|:-:|
| create | `INSERT` | `POST` |
| read | `SELECT` | `GET` |
| update | `UPDATE` | `PUT` |
| delete | `DELETE` | `DELETE` |

![](http://interactive.blockdiag.com/image?compression=deflate&encoding=base64&src=eJxLyslPzk7JTExXqOZSUFAPcnV0UUgrys9VCM_MzixIBcqoK-jaKRSlFpfmlBSDmOrlRZklqQol-QoF-cUl6UAZdWuuWgBXuhe4)

## Query Wikipedia

## API Tutorial (MediaWiki API)



## Raw GET on Wikipedia

In [None]:
request_category( 'business software')

In [5]:
def category_format( category):
    '''Return `category` with every whitespace character replaced by '+'.

        Each whitespace character is replaced on its own, so a run of two
        spaces becomes '++'.
    '''
    ## raw string: '\s' inside a normal literal is an invalid escape sequence
    category_query = re.sub( r'\s', '+', category)
    return category_query
    

In [6]:
def request_category( category ):  ##  exclude  ## Cmtype
    '''Fetch the members (pages and sub-categories) of an English-Wikipedia category.

        category : category name without the 'Category:' prefix, e.g. 'Machine learning'

        return: DataFrame with one row per category member, built from the
                'categorymembers' entries of the API response. An empty category
                gives an empty frame that still has the 'pageid', 'ns' and 'title'
                columns (the fields the API returns by default), so `.title` /
                `.pageid` lookups on the result keep working.

        raises: requests.HTTPError on a non-2xx response; KeyError if the response
                carries no 'query' section.
    '''
    base_url = 'https://en.wikipedia.org/w/api.php'
    ## Passed as a dict so that requests URL-encodes the values; names containing
    ## spaces, '&' or '+' would otherwise corrupt a hand-built query string.
    params = {
        'action': 'query',
        'list': 'categorymembers',  ## fetch all category members (pages, subcategories)
        'cmlimit': 'max',
        'cmtitle': 'Category:{}'.format( category),
        'format': 'json',  ## return in json format
        'prop': 'info|categories|links',  ## kept from the original query string
    }

    category_pages = []
    while True:
        r = requests.get( base_url, params = params)  ## request HTTP results
        r.raise_for_status()
        response = r.json()
        category_pages.extend( response['query']['categorymembers'])

        ## One response holds a single batch; the API signals further batches with
        ## a 'continue' object whose fields must be sent back on the next request.
        if 'continue' not in response:
            break
        params.update( response['continue'])

    if not category_pages:
        return pd.DataFrame( columns = ['pageid', 'ns', 'title'])

    category_pages_df = pd.DataFrame( category_pages)

    return category_pages_df
    
    
    

In [None]:
category_pages_df = request_category( 'machine learning')
sub_category_mask = category_pages_df.title.str.contains('Category:')  ## row mask for only sub-categories
category_pages_df.head()

In [None]:
category_pages_df[ ~sub_category_mask].head()

In [None]:
## sub-categories

orig_categories = category_pages_df[ sub_category_mask].title.str.replace('Category:','')

In [7]:
## Original 

def gather_articles( category, _ancestors = () ):
    '''Collect all articles that belong to a category, including articles found in sub-categories.
        Sub-categories are walked recursively (depth first) through `request_category`.

        category   : category name without the 'Category:' prefix
        _ancestors : internal; names of the categories on the current recursion path.
                     Callers should leave it at its default.

        return: ( pages_df, categories_pageid_dict)
            pages_df               : the category's own article rows followed by the rows gathered
                                     from each sub-category. The per-request index is kept, and an
                                     article reachable through several paths appears once per path.
            categories_pageid_dict : maps `category` to the pageids of its own articles and each
                                     direct sub-category to the pageids gathered beneath it.
    '''
    print(category)
    category_pages_df = request_category( category)  ## load dataframe of pages contained in category

    if category_pages_df.empty:
        ## An empty category may come back without any columns; give it the ones the
        ## caller reads so `.pageid` works instead of raising AttributeError.
        no_pages = category_pages_df.reindex( columns = ['pageid', 'ns', 'title'])
        return no_pages, { category: no_pages.pageid}

    sub_category_mask = category_pages_df.title.str.contains('Category:')  ## row mask for only sub-categories

    pages_df = category_pages_df[ ~sub_category_mask]
    pages_df_list = [ pages_df]
    categories_pageid_dict = { category: pages_df.pageid}

    ## Create list of all sub-category names ( Category: <name>), remove the preface
    sub_categories = category_pages_df[ sub_category_mask].title.str.replace( 'Category:', '').tolist()

    ## Wikipedia categories can contain each other; never descend into a category that is
    ## already on the current path, otherwise the recursion would not terminate.
    lineage = _ancestors + ( category,)

    for sub_category in sub_categories:  ## recursively, one at a time
        if sub_category in lineage:
            continue

        sub_category_pages = gather_articles( sub_category, lineage )[0]
        categories_pageid_dict[ sub_category] = sub_category_pages.pageid

        if not sub_category_pages.empty:
            pages_df_list.append( sub_category_pages )

    pages_df = pd.concat( pages_df_list)
    return pages_df, categories_pageid_dict
    
    
    #pages_df_list.append(  category_pages_df[~sub_category_mask].pageid.apply( grab_content ))
    
    

In [None]:
## NEW,  Added error handling for empty category

def gather_articles( category, _ancestors = () ):  #  used_subcategories = [] 
    '''Collect all articles that belong to a category, including articles found in sub-categories.
        This process is performed recursively to find all relevant articles in the domain space.

        category   : category name without the 'Category:' prefix
        _ancestors : internal; names of the categories on the current recursion path.

        return: ( pages_df, categories_pageid_dict) -- the concatenated article rows, and a dict
                mapping `category` and each successfully gathered direct sub-category to pageids.

        A sub-category whose gathering raises (e.g. an empty category) is reported and skipped.
    '''
    category_pages_df = request_category( category)  ## load dataframe of pages contained in category

    sub_category_mask = category_pages_df.title.str.contains('Category:')  ## row mask for only sub-categories

    pages_df = category_pages_df[ ~sub_category_mask]
    pages_df_list = [ pages_df]
    categories_pageid_dict = { category: pages_df.pageid}

    ## Create list of all sub-category names ( Category: <name>), remove the preface
    sub_categories = category_pages_df[ sub_category_mask].title.str.replace( 'Category:', '').tolist()

    lineage = _ancestors + ( category,)

    for sub_category in sub_categories:  ## recursively, one at a time
        ## skip categories already visited here, and ancestors (category cycles)
        if sub_category in categories_pageid_dict or sub_category in lineage:
            print( '***Category: {} already looked at'.format( sub_category))
            continue

        print('New sub-category: {}'.format( sub_category) )
        try:
            ## the call returns ( pages_df, dict); only the frame is needed here
            sub_category_pages = gather_articles( sub_category, lineage )[0]
        except Exception:
            ## best effort: Exception (not a bare except) so a kernel interrupt still stops the crawl
            print( '+++Category: {} is empty'.format(sub_category) )
            continue

        categories_pageid_dict[ sub_category] = sub_category_pages.pageid
        pages_df_list.append( sub_category_pages )

    pages_df = pd.concat( pages_df_list)
    return pages_df, categories_pageid_dict

    #return 'Empty Category'


#pages_df_list.append(  category_pages_df[~sub_category_mask].pageid.apply( grab_content ))



In [None]:
## NEW,  Added error handling for empty category

def gather_articles( category, count = 1, categories_pageid_dict = None):  #  used_subcategories = [] 
    '''Collect all articles that belong to a category, including articles found in sub-categories.
        This process is performed recursively to find all relevant articles in the domain space.

        category               : category name without the 'Category:' prefix
        count                  : recursion depth, 1 for the top-level call
        categories_pageid_dict : dict shared by the whole recursion (category name -> pageids).
                                 It doubles as the visited set; a fresh one is created when omitted.

        return: ( pages_df, categories_pageid_dict)

        A sub-category whose gathering raises (e.g. an empty category) is reported and skipped.
    '''
    ## The dict used to be created only when count == 1, so every recursive call
    ## failed on its first write to it; create it whenever none was handed in.
    if categories_pageid_dict is None:
        categories_pageid_dict = {}

    category_pages_df = request_category( category)  ## load dataframe of pages contained in category

    sub_category_mask = category_pages_df.title.str.contains('Category:')  ## row mask for only sub-categories

    pages_df = category_pages_df[ ~sub_category_mask]
    pages_df_list = [ pages_df]

    ## Create list of all sub-category names ( Category: <name>), remove the preface
    sub_categories = category_pages_df[ sub_category_mask].title.str.replace( 'Category:', '').tolist()
    ## recorded before recursing, so a category that contains one of its ancestors is skipped below
    categories_pageid_dict[ category] = pages_df.pageid

    for sub_category in sub_categories:  ## recursively, one at a time
        if sub_category in categories_pageid_dict:
            print( '***Category: {} already looked at'.format( sub_category))
            continue

        try:
            sub_category_pages = gather_articles( sub_category, count + 1, categories_pageid_dict )[0]
        except Exception:
            ## best effort: Exception (not a bare except) so a kernel interrupt still stops the crawl
            print( '+++Category: {} is empty'.format(sub_category) )
            continue

        ## replaces the sub-category's own entry with everything gathered beneath it
        categories_pageid_dict[ sub_category] = sub_category_pages.pageid
        pages_df_list.append( sub_category_pages )
        print( 'New sub-category added to dict: {}'.format( sub_category))

    pages_df = pd.concat( pages_df_list)
    return pages_df, categories_pageid_dict

    #return 'Empty Category'


#pages_df_list.append(  category_pages_df[~sub_category_mask].pageid.apply( grab_content ))



In [None]:
## NEW,  Added error handling for empty category

def gather_articles( category, categories_pageid_dict = None):  #  used_subcategories = [] 
    '''Collect all articles that belong to a category, including articles found in sub-categories.
        This process is performed recursively to find all relevant articles in the domain space.

        category               : category name without the 'Category:' prefix
        categories_pageid_dict : dict shared by the whole recursion (category name -> pageids).
                                 It doubles as the visited set; a fresh one is created when omitted.

        return: ( pages_df, categories_pageid_dict)

        A sub-category whose gathering raises is reported together with the error and skipped.
    '''
    ## A `{}` default would be created once and shared by every call, so a second
    ## top-level call would see the categories of the first one as already visited.
    if categories_pageid_dict is None:
        categories_pageid_dict = {}

    category_pages_df = request_category( category)  ## load dataframe of pages contained in category

    sub_category_mask = category_pages_df.title.str.contains('Category:')  ## row mask for only sub-categories

    pages_df = category_pages_df[ ~sub_category_mask]
    pages_df_list = [ pages_df]

    ## Create list of all sub-category names ( Category: <name>), remove the preface
    sub_categories = category_pages_df[ sub_category_mask].title.str.replace( 'Category:', '').tolist()
    categories_pageid_dict[ category] = pages_df.pageid

    for sub_category in sub_categories:  ## recursively, one at a time
        if sub_category in categories_pageid_dict:  ## skip categories already visited
            print( '***Category: {} already looked at'.format( sub_category))
            continue

        try:
            print('new sub_category: {}'.format(sub_category) )
            print( 'already collected subcategories: {}'.format( categories_pageid_dict.keys() ) )
            sub_category_pages, categories_pageid_dict = gather_articles( sub_category, categories_pageid_dict)
            ## replaces the sub-category's own entry with everything gathered beneath it
            categories_pageid_dict[ sub_category] = sub_category_pages.pageid
            pages_df_list.append( sub_category_pages )
            print( 'New sub-category added to dict: {}'.format( sub_category))

        except Exception as N:
            print( 'or came here due to {}'.format( N))
            print( '+++Category: {} is empty'.format(sub_category) )
            continue

    pages_df = pd.concat( pages_df_list)
    return pages_df, categories_pageid_dict

    #return 'Empty Category'


#pages_df_list.append(  category_pages_df[~sub_category_mask].pageid.apply( grab_content ))



In [None]:
## NEWEST

## NEW,  Added error handling for empty category

def gather_articles( category, categories_pageid_dict = None):  #  used_subcategories = [] 
    '''Collect all articles that belong to a category, including articles found in sub-categories.
        This process is performed recursively to find all relevant articles in the domain space.

        category               : category name without the 'Category:' prefix
        categories_pageid_dict : dict shared by the whole recursion (category name -> pageids).
                                 It doubles as the visited set; a fresh one is created when omitted.

        return: ( pages_df, categories_pageid_dict). When `category` is already a key of the
                dict that was passed in, no request is made and `pages_df` is an empty frame.

        A sub-category whose gathering raises is reported together with the error and skipped.
    '''
    ## None instead of `{}`: a mutable default would be shared by every call.
    if categories_pageid_dict is None:
        categories_pageid_dict = {}

    if category in categories_pageid_dict:
        print( '***Category: {} already looked at'.format( category))
        ## columns assumed to be the API's default member fields -- TODO confirm against request_category
        return pd.DataFrame( columns = ['pageid', 'ns', 'title']), categories_pageid_dict

    category_pages_df = request_category( category)  ## load dataframe of pages contained in category

    sub_category_mask = category_pages_df.title.str.contains('Category:')  ## row mask for only sub-categories

    pages_df = category_pages_df[ ~sub_category_mask]
    pages_df_list = [ pages_df]

    ## Create list of all sub-category names ( Category: <name>), remove the preface
    sub_categories = category_pages_df[ sub_category_mask].title.str.replace( 'Category:', '').tolist()
    ## recorded before recursing, so a category that contains one of its ancestors is skipped below
    categories_pageid_dict[ category] = pages_df.pageid

    for sub_category in sub_categories:  ## recursively, one at a time
        if sub_category in categories_pageid_dict:  ## skip categories already visited
            print( '***Category: {} already looked at'.format( sub_category))
            continue

        try:
            print('new sub_category: {}'.format(sub_category) )
            print( 'already collected subcategories: {}'.format( categories_pageid_dict.keys() ) )
            sub_category_pages, categories_pageid_dict = gather_articles( sub_category, categories_pageid_dict)
            ## replaces the sub-category's own entry with everything gathered beneath it
            categories_pageid_dict[ sub_category] = sub_category_pages.pageid
            pages_df_list.append( sub_category_pages )
            print( 'New sub-category added to dict: {}'.format( sub_category))

        except Exception as N:
            print( 'or came here due to {}'.format( N))
            print( '+++Category: {} is empty'.format(sub_category) )
            continue

    pages_df = pd.concat( pages_df_list)
    return pages_df, categories_pageid_dict

    #return 'Empty Category'


#pages_df_list.append(  category_pages_df[~sub_category_mask].pageid.apply( grab_content ))



In [None]:
def grab_content( page_id):
    '''Return the `.content` of the Wikipedia page whose id is `page_id`.

        Best effort: if the lookup raises, an empty string is returned instead.
    '''
    try:
        page_content = wikipedia.WikipediaPage(pageid = page_id).content
    except Exception:
        ## Exception rather than a bare except, so that interrupting the kernel
        ## still stops a long `.apply( grab_content)` run.
        page_content = ''
    return page_content
    

In [8]:
## Original

ml_articles, ml_categories_pageid_dict = gather_articles( 'machine learning')

machine learning
Applied machine learning
Artificial neural networks
Deep learning
Neural network software
Bayesian networks
Classification algorithms
Artificial neural networks
Deep learning
Neural network software
Decision trees
Ensemble learning
Cluster analysis
Cluster analysis algorithms
Clustering criteria
Computational learning theory
Artificial intelligence conferences
Data mining and machine learning software
Social network analysis software
Datasets in machine learning
Datasets in computer vision
Dimension reduction
Factor analysis
Ensemble learning
Evolutionary algorithms
Gene expression programming
Genetic algorithms
Artificial immune systems
Gene expression programming
Genetic programming
Nature-inspired metaheuristics
Genetic programming
Inductive logic programming
Kernel methods for machine learning
Support vector machines
Latent variable models
Factor analysis
Structural equation models
Learning in computer vision
Log-linear models
Loss functions
Machine learning algori

In [11]:
len( ml_articles)

1606

In [10]:
len(ml_categories_pageid_dict.keys())

31

In [None]:
ml_categories_pageid_dict

In [None]:
## New

ml_articles1, ml_categories_pageid_dict1 = gather_articles( 'machine learning')

In [None]:
ml_articles.head()

In [None]:
bs_articles, bs_categories_pageid_dict = gather_articles( 'Business_software')

In [None]:
## `categories_pageid_dict` was the earlier name of the dict now returned as `ml_categories_pageid_dict`
len( ml_categories_pageid_dict)

In [None]:
## 'Latent variable models' is a machine-learning sub-category, so look it up in the ML dict
ml_categories_pageid_dict['Latent variable models']

In [None]:
ml_articles.columns

In [None]:
ml_articles.shape

In [None]:
ml_articles

In [None]:
ml_articles.shape

In [None]:
## An article reachable through several sub-categories appears once per path; keep one row per page.
## De-duplicating on `pageid` (instead of on every column plus a materialised index column) does not
## depend on a row's position inside its category listing, and the cell can be re-run safely.
ml_articles = ml_articles.drop_duplicates( subset = 'pageid').reset_index( drop = True)
ml_articles['content'] = ml_articles.pageid.apply( grab_content)

In [None]:
ml_articles.content

In [None]:
pageids = ml_articles.pageid

## Invert category -> pageids into pageid -> [categories].
## Every collected article gets an entry, even if no category in the dict lists it.
pageid_categories = { pid: [] for pid in pageids}

for category_name, category_pageids in ml_categories_pageid_dict.items():
    for pid in category_pageids:
        pageid_categories.setdefault( pid, []).append( category_name)
    

In [None]:
ml_articles.head()

In [None]:
ml_articles.loc[:, 'category'] = 'machine learning'

In [None]:
## ml_articles, categories_pageid_dict = gather_articles( 'machine learning')

In [None]:
bs_pages = request_category( 'business software')

sub_category_mask = bs_pages.title.str.contains('Category:')

bs_pages[ sub_category_mask]

In [None]:
bs_pages_sub1 = request_category( 'Administrative software')

bs_pages_sub1.head()
sub_category_mask = bs_pages_sub1.title.str.contains('Category:')

print( bs_pages_sub1[ sub_category_mask])
print( bs_pages_sub1[~sub_category_mask])

In [None]:
category = 'Club software'

bs_pages_sub1 = request_category( category)

In [None]:
bs_pages_sub1.empty

In [None]:
bs_pages_sub2 = request_category( 'Business simulation games')

bs_pages_sub2.head()



In [None]:
sub_category_mask = category_pages_df.title.str.contains('Category:')  ## row mask for only sub-categories
print(category)
pages_df_list = []
categories_pageid_dict = {}

pages_df = category_pages_df[ ~sub_category_mask]
pages_df_list.append( pages_df)

sub_categories = category_pages_df[ sub_category_mask].title.str.replace( 'Category:', '').tolist()    ## Create list of all sub-category names ( Category: <name>), remove the preface                
categories_pageid_dict[ category] = pages_df.pageid
n_sub_categories = sum( sub_category_mask)  ## Number of sub-categories belong to category


In [None]:
food_articles, food_categories_pageid_dict = gather_articles( 'Foods')

In [None]:
sub_category_mask = category_pages_df.title.str.contains( 'Category:')
## `.str.replace` strips the prefix inside each title; plain `Series.replace` only swaps
## values that are exactly 'Category:' and therefore left every title unchanged.
sub_category_names = category_pages_df[ sub_category_mask].title.str.replace( 'Category:', '').tolist()

## number of article rows vs. number of distinct article pageids
len( category_pages_df[ ~sub_category_mask].pageid.tolist()), len(category_pages_df[ ~sub_category_mask].pageid.unique() )





In [None]:
def request_category( category, sub_category = False):  ## return
    '''Request from the Wikipedia API the member pages (articles, sub-categories) of a category.

        category     : name of the top-level category; stored in the 'category' column
        sub_category : when given, this category is queried instead of `category`
                       and its name is stored in the 'sub-category' column

        return: DataFrame with columns 'category', 'sub-category', 'pageid', 'title'
                and 'content' (NaN placeholder), one row per category member.

        raises: requests.HTTPError on a non-2xx response; KeyError if the response
                carries no 'query' section.
    '''
    queried_category = sub_category if sub_category else category

    base_url = 'https://en.wikipedia.org/w/api.php'
    ## Passed as a dict so that requests URL-encodes the values; names containing
    ## spaces, '&' or '+' would otherwise corrupt a hand-built query string.
    params = {
        'action': 'query',
        'list': 'categorymembers',  ## fetch all category members (pages, subcategories)
        'cmlimit': 'max',
        'cmtitle': 'Category:' + queried_category,
        'format': 'json',  ## return in json format
        'prop': 'info|categories|links',  ## kept from the original query string
    }

    category_pages = []  ## list object containing category pages ( articles, sub-categories)
    while True:
        request = requests.get( base_url, params = params)  ## request HTTP results
        request.raise_for_status()
        response = request.json()
        category_pages.extend( response['query']['categorymembers'])

        ## One response holds a single batch; the API signals further batches with
        ## a 'continue' object whose fields must be sent back on the next request.
        if 'continue' not in response:
            break
        params.update( response['continue'])

    records = [
        { 'category': category,
          'sub-category': sub_category if sub_category else None,
          'pageid': page_info['pageid'],
          'title': page_info['title'],
          'content': np.nan }
        for page_info in category_pages
    ]

    ## explicit columns keep the schema stable even when the category has no members
    pages_df = pd.DataFrame( records, columns = ['category', 'sub-category', 'pageid', 'title', 'content'])

    return pages_df
    
    

In [None]:
def gather_content( page_df, condition = True):
    '''Fill in article content and expand sub-category rows into their member pages.

        page_df   : frame with 'category', 'title', 'pageid' and 'content' columns, as
                    produced by `request_category( category, sub_category)`
        condition : when False, sub-category rows are left in place and not expanded

        return: the frame with content fetched (through `grab_content`) for every article row
                whose content was null. With `condition` True, every 'Category:' row is replaced
                by the members of that sub-category, repeatedly, until none is left.

        The content of the rows of the frame passed in is filled in place.
    '''
    def _fill_missing_content( frame):
        '''Fetch content for the article rows of `frame` that have none yet (in place).'''
        is_sub_category = frame.title.str.contains( 'Category:')
        new_articles_mask = ~is_sub_category & frame.content.isnull()
        if new_articles_mask.any():
            ## the placeholder column is float NaN; make it able to hold text
            frame['content'] = frame['content'].astype( object)
            frame.loc[new_articles_mask, 'content'] = frame.loc[new_articles_mask, 'pageid'].apply( grab_content)

    if page_df.empty:
        return page_df

    category = page_df.category.unique()[0]

    ## gather all the article content for each page in the category, excluding the sub_categories
    _fill_missing_content( page_df)

    expanded_sub_categories = set()  ## each sub-category is requested at most once (categories can be cyclic)

    while condition:
        sub_category_mask = page_df.title.str.contains( 'Category:')  ## row mask
        if not sub_category_mask.any():
            break

        ## keep the article rows; every sub-category row is replaced by its members
        frames = [ page_df[ ~sub_category_mask]]
        for title in page_df.loc[ sub_category_mask, 'title']:
            subCat = title.split( 'Category:')[1]
            if subCat in expanded_sub_categories:
                continue
            expanded_sub_categories.add( subCat)
            frames.append( request_category( category, subCat ))

        page_df = pd.concat( frames, ignore_index = True)
        _fill_missing_content( page_df)

    return page_df
        
        

In [None]:
page_df = request_category('Machine_learning')


In [None]:
page_df.head()

In [None]:
page_df.tail(31)

In [None]:
category_df = gather_content( page_df)

In [None]:
page_df.head()

In [None]:
page_df.category.unique()[0]

In [None]:
arr[0]

In [None]:
sub_category_mask = page_df.title.str.contains( 'Category:')

In [None]:
page_df.shape, sum( sub_category_mask), sum( ~sub_category_mask)

In [None]:
sub_category_mask = page_df_test.title.str.contains( 'Category:')

sum(~sub_category_mask), page_df_test.content.isnull().sum()

In [None]:
new_articles_mask = ~sub_category_mask & page_df_test.content.isnull()
sum(new_articles_mask)

In [None]:
sub_category_mask = page_df.title.str.contains( 'Category:')

new_articles_mask = ~sub_category_mask & page_df.content.isnull()
    
n_articles = sum( ~sub_category_mask)
n_sub_categories = sum( sub_category_mask)

#print( '{} requests to be made'.format( n_articles * n_sub_categories) )
## First gather all the article content for the each page in the category
page_df.loc[~sub_category_mask, 'content'] = page_df[ ~sub_category_mask ].apply( lambda x: grab_content( x.pageid), axis = 1 )
    

In [None]:
page_df.tail(31)

In [None]:
category = page_df.category.unique()[0]
category

In [None]:
sub_category_indices = page_df[sub_category_mask].index.tolist()



subCat_indice = sub_category_indices[0] ## grab the first one
print(subCat_indice)

In [None]:
subCat = page_df.iloc[subCat_indice,:]['title'].split('Category:')[1] # subCat = 
print( subCat)

In [None]:
sub_category_pages_df = request_category( category, subCat )

page_df = page_df.append( sub_category_pages_df, ignore_index= True)



In [None]:
page_df.drop( page_df.index[262:], inplace = True)

In [None]:
page_df.tail(65)

In [None]:
print( page_df.loc[ subCat_indice, :])


page_df.drop( page_df.index[ subCat_indice], inplace = True )  ## Remove the original page
page_df.tail(65)

In [None]:
sub_category_mask = page_df.title.str.contains( 'Category:')  ## row mask 

In [None]:
sum(sub_category_mask)

In [None]:

new_articles_mask = ~sub_category_mask & page_df.content.isnull()

In [None]:
page_df[new_articles_mask]

In [None]:
page_df.content.isnull()

In [None]:
category = page_df.category.unique()[0]

sub_category_indices = page_df[sub_category_mask].index.tolist()

subCat_indice = sub_category_indices[0] ## grab the first one
print(subCat_indice)


## `subCat_indice` is an index label, so look the row up with .loc (not positional .iloc)
subCat = page_df.loc[ subCat_indice, 'title'].split('Category:')[1] # subCat = 
print( subCat)
request_category( category, subCat )

#page_df = page_df.append( request_category( category, subCat ), ignore_index= True) # 

#print( page_df.loc[ subCat_indice, :])

#page_df.tail()
#page_df.drop( page_df.index[ sub_Cat_indice] )  ## Remove the original page


In [None]:
page_df

In [None]:
page_df.iloc[455,:]

In [None]:
page_df[ page_df['sub-category'] == 'Applied machine learning']

In [None]:
page_df['sub-category'].isnull().sum()

In [None]:
page_df.shape

In [None]:
page_df.iloc[198,:]

In [None]:
n_sub_categories


In [None]:
## Replace every sub-category row of `page_df` by the member pages of that sub-category.
category = page_df.category.unique()[0]
sub_category_mask = page_df.title.str.contains( 'Category:')

sub_category_frames = [
    request_category( category, title.split('Category:')[1])
    for title in page_df.loc[ sub_category_mask, 'title']
]

## concat returns a new frame, so the result has to be bound to a name
page_df = pd.concat( [ page_df[ ~sub_category_mask]] + sub_category_frames, ignore_index= True)
    # page_df.drop()  ## remove original page

In [None]:
sub_category_indices = page_df[sub_category_mask].index.tolist()

subCat_indice = sub_category_indices[0] ## grab the first one

## `subCat_indice` is an index label, so look the row up with .loc (not positional .iloc)
subCat = page_df.loc[ subCat_indice, 'title'].split('Category:')[1] # subCat = 

## Remove the original sub-category row and add its member pages. Both steps return
## new frames, so the result is bound back to `page_df`.
page_df = pd.concat( [ page_df.drop( subCat_indice), request_category( category, subCat )], ignore_index= True)


In [None]:
grab_content( 39945557)

In [None]:
~sub_category_mask

In [None]:



df.loc[~sub_category_mask, 'content'] = df[ ~sub_category_mask ].apply( lambda x: grab_content( x.pageid), axis = 1 )
#sub_df.apply( lambda x: grab_content(x.pageid), axis = 1) # grab_content(

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df[sub_category_mask]

In [None]:
df[sub_category_mask]

In [None]:
df[sub_category_mask].apply( lambda x: request_category('Machine_learning', x.title.split('Category:')[1]), axis = 1 )#, axis = 1)
#

In [None]:
df.append(request_category( 'Machine_learning', df[sub_category_mask].apply( lambda x: x.title.split('Category:')[1]), ignore_index= True )

In [None]:
df.append(request_category( 'Machine_learning', df[sub_category_mask].iloc[0,:]['title'].split('Category:')[1]), ignore_index= True)            

In [None]:
df[sub_category_mask].apply(  lambda x: request_category( x['title'].split('Category:')[1]), axis = 1)   # 
## 
                        
                        
                        
                        

In [None]:
df[sub_category_mask]

In [None]:
sub_category_mask = df.title.str.contains( 'Category:')


article_df = df[ ~sub_category_mask ]

In [None]:
sub_df = article_df.loc[:10,:]

In [None]:
sub_df

In [None]:
sub_df.loc[:, 'content'] = sub_df.apply( lambda x: grab_content(x.pageid), axis = 1) # grab_content(

In [None]:
sub_df.head()

In [None]:
ml_json = request_category('Machine_learning').json()

ml_pages  = ml_json['query']['categorymembers']  ## List object

In [None]:


ml_pages  = ml_json['query']['categorymembers']  ## List object
N_pages = len(ml_pages)

In [None]:
ml_pages[-1]['title'].split('Category:')[1]

In [None]:
pages = {}

In [None]:
ml_json['query'].keys()

In [None]:
pages = {}
for i, page_info in enumerate( category_pages):
    page_id = page_info['pageid']
    page_title = page_info['title']
    
    pages[i] ={'pageid': page_id, 'title': page_title, 'content': page_content, 'category': category} 
    
pages_df = pd.DataFrame.from_dict( pages, orient = 'index')

    

In [None]:
def gather_content( category, category_pages, pages = None, visited = None):
    '''Collect the content of every article in `category_pages`, descending into sub-categories.

        category       : label stored with every collected article
        category_pages : list of category-member dicts with 'pageid' and 'title' keys
                         (and optionally 'ns')
        pages          : accumulator shared by the recursion; created when omitted
        visited        : names of sub-categories already expanded; created when omitted

        return: dict keyed by a running integer, each value being
                {'pageid', 'title', 'content', 'category'} for one article.
    '''
    if pages is None:
        pages = {}
    if visited is None:
        visited = set()

    for page_info in category_pages:

        page_id = page_info['pageid']
        page_title = page_info['title']

        ## namespace 14 is the Category namespace; fall back to the title prefix when 'ns' is absent
        is_sub_category = page_info.get( 'ns') == 14 or page_title.startswith( 'Category:')

        if is_sub_category:
            sub_category = page_title.split('Category:')[1]

            if sub_category in visited:  ## categories can contain each other
                continue
            visited.add( sub_category)

            ## request_category returns a DataFrame; turn its rows back into member dicts
            sub_category_pages = request_category( sub_category).to_dict( 'records')

            gather_content( category, sub_category_pages, pages, visited)

        else:  ## article
            page_content = grab_content( page_id)

            pages[ len( pages)] ={'pageid': page_id, 'title': page_title, 'content': page_content, 'category': category} 

    return pages
            
        
        
    

In [None]:
def gather_content( category, category_pages, pages = None, visited = None):
    '''Walk `category_pages` and return the content of every article found, sub-categories included.

        category       : label stored with every collected article
        category_pages : list of category-member dicts with 'pageid' and 'title' keys
                         (and optionally 'ns')
        pages          : accumulator shared by the recursion; created when omitted
        visited        : names of sub-categories already expanded; created when omitted

        return: dict keyed by a running integer, each value being
                {'pageid', 'title', 'content', 'category'} for one article.
    '''
    pages = {} if pages is None else pages
    visited = set() if visited is None else visited

    ## iterate the argument, not the notebook-global `ml_pages`
    for page_info in category_pages:

        page_type = page_info.get( 'ns')
        page_id = page_info['pageid']
        page_title = page_info['title']

        if page_type == 14 or page_title.startswith( 'Category:'):  ## sub-category

            sub_category = page_title.split('Category:')[1]
            if sub_category in visited:  ## categories can contain each other
                continue
            visited.add( sub_category)

            ## request_category returns a DataFrame; turn its rows back into member dicts
            sub_category_members = request_category( sub_category).to_dict( 'records')
            gather_content( category, sub_category_members, pages, visited)

        else:  ## article
            pages[ len( pages)] ={'pageid': page_id, 'title': page_title, 'content': grab_content( page_id), 'category': category} 

    return pages
        
#pages        
        
        
        
    

In [None]:
pages

In [None]:
pd.DataFrame.from_dict( pages, orient = 'index')

In [None]:
##Get all the pages in a Category

catJson = ml.json() ## dict
catJson.keys()



In [None]:
catJson

In [None]:
catPages = catJson['query']['categorymembers']
print( len( catPages))
print( catPages[0:5], sep = '\n' )

In [None]:
catPages

In [None]:
articles = {}  ## pageid -> title of the article members

subcats = {}  ## pageid -> title of the sub-category members

for page in catPages:

    pageid = page['pageid']
    title = page['title']

    ## Sub-category titles start with 'Category:'; a plain substring test would
    ## also catch articles that merely have "Category" in their title.
    if title.startswith( 'Category:'):
        subcats[ pageid] = title
    else:
        articles[ pageid] = title

    
    

In [None]:
pd.DataFrame()

In [None]:
#wikipedia.WikipediaPage(pageid = 50222574).links
wikipedia.WikipediaPage(pageid = 39945557)

In [None]:
base_url = 'https://en.wikipedia.org/w/api.php'

In [None]:
action = "?action=query&list=categorymembers&cmlimit=max" ## &generator=allpages |allpages
#action = "?action=mobileview"
parameters = "&format=json&prop=categories|links|info" #  " # &sections=all &rvprop=content
category = '&cmtitle=Category:'
#titles = "&titles="
#page = "&page="

In [None]:
#category = "Machine_learning"

## `category` already ends in 'Category:', so only the bare category name is appended
url = base_url + action + parameters + category + 'Machine_learning'


In [None]:
ml = requests.get( url)

In [None]:
ml.json()