In [2]:
'''
This Python script creates a pandas DataFrame in the following format:
id | text (english) | title | url (en-wiki) | topic
It parses the English Wikipedia set; only the topics / id / revid / article name / url / English text
are parsed.

JSON format: {"id": "", "url":"", "title": "", "text": "..."}
JSON format extracted using: https://github.com/attardi/wikiextractor

https://www.mediawiki.org/wiki/API:Categories for topic
'''

import json
import pandas as pd
import requests

# Load one wikiextractor output file; each run of this notebook turns the
# articles of that single JSON file into one CSV.
# The file is JSON Lines (one article object per line), hence lines=True.
# `lines` must be a real boolean: the string "True" only worked because any
# non-empty string is truthy (the string "False" would behave the same way).
pandas_df = pd.read_json("wiki_00.json", lines=True)
print(pandas_df.head())



    id                                               text      title  \
0   12  Anarchism\n\nAnarchism is an anti-authoritaria...  Anarchism   
1   25  Autism\n\nAutism is a developmental disorder c...     Autism   
2   39  Albedo\n\nAlbedo () (, meaning 'whiteness') is...     Albedo   
3  290  A\n\nA (named , plural "As", "A's", "a"s, "a's...          A   
4  303  Alabama\n\nAlabama () is a state in the southe...    Alabama   

                                       url  
0   https://en.wikipedia.org/wiki?curid=12  
1   https://en.wikipedia.org/wiki?curid=25  
2   https://en.wikipedia.org/wiki?curid=39  
3  https://en.wikipedia.org/wiki?curid=290  
4  https://en.wikipedia.org/wiki?curid=303  


In [19]:
''' 
INDEXING:
id=0,text=1, title=2, url=3, topic (will be)=4
---

Next, look up each article's categories (used as its topic) via the MediaWiki Categories API: https://www.mediawiki.org/wiki/API:Categories


'''
# Every article starts with the placeholder topic 'NONE'; it is only replaced
# when the API returns at least one category for the article.
pandas_df['topic'] = 'NONE'
# Object dtype lets a single 'topic' cell hold a Python list of category names.
pandas_df = pandas_df.astype('object')
sess = requests.Session()
URL = "https://en.wikipedia.org/w/api.php"


def _category_names(api_response):
    """Return the category names contained in one `prop=categories` response.

    Expected response layout:
        {"query": {"pages": {"<pageid>": {"categories": [{"ns": .., "title": ..}, ...]}}}}
    Each category title looks like "Category:1985 births"; the text after the
    first colon ("1985 births") is returned.

    Pages without a "categories" entry (for example a title the API cannot
    resolve) contribute nothing, so the result may be an empty list.
    """
    pages = api_response.get('query', {}).get('pages', {})
    names = []
    for page in pages.values():
        for cat in page.get('categories', []):
            # Split on the first colon only, so category names that themselves
            # contain a colon are kept whole.
            names.append(cat['title'].split(':', 1)[-1])
    return names


# Go through the article titles in the dataset and query their categories.
for index, title in pandas_df['title'].items():
    params = {
        "action": "query",
        "format": "json",
        "titles": title,
        "prop": "categories",
    }

    # A timeout keeps one stalled request from hanging the whole run;
    # raise_for_status surfaces HTTP errors instead of parsing an error page.
    R = sess.get(url=URL, params=params, timeout=30)
    R.raise_for_status()
    cat_dat = R.json()

    # Read the categories straight from query.pages. The presence of a
    # 'continue' key only signals that more categories exist beyond this first
    # batch; requiring it (as before) left every article whose categories fit
    # in one batch with the 'NONE' placeholder.
    topic_cat = _category_names(cat_dat)
    if not topic_cat:
        continue

    print(topic_cat)

    # .at is the scalar setter, so the list is stored as one cell value
    # instead of being treated as an iterable to broadcast.
    pandas_df.at[index, 'topic'] = topic_cat

print(pandas_df.head())



['All articles lacking reliable references', 'All articles with unsourced statements', 'Anarchism', 'Anti-capitalism', 'Anti-fascism', 'Articles containing Spanish-language text', 'Articles lacking reliable references from March 2019', 'Articles prone to spam from November 2014', 'Articles with short description', 'Articles with unsourced statements from March 2019']
['All Wikipedia articles written in American English', 'All articles containing potentially dated statements', 'Articles containing potentially dated statements from 2015', 'Articles containing potentially dated statements from 2017', 'Articles containing video clips', 'Articles with Curlie links', 'Articles with short description', 'Autism', 'CS1 German-language sources (de)', 'CS1 Spanish-language sources (es)']
['All articles with unsourced statements', 'Articles containing Latin-language text', 'Articles with unsourced statements from December 2017', 'Articles with unsourced statements from January 2018', 'Articles wit

In [16]:
# Last step: write the annotated articles to a CSV file so the dataset can be
# reused by other people's code / other analyses. The DataFrame index is not
# written to the file.
pandas_df.to_csv('wikipandas.csv', index=False)


# To check how the exported file loads back in, un-comment the two lines below:
#new_pandas = pd.read_csv('wikipandas.csv')
#print(new_pandas.head())