In [1]:
import pandas as pd # for data manipulation
import requests     # for executing the HTTP request
import json         # for data manipulation in JSON format

from pprint import pprint          # display data nicely
from IPython.display import JSON   # display JSON data nicely in some environments

## 1. Implement your own wrapper function to parse a page

Your function should
1. receive the page title as a parameter
2. allow to specify a language but default to 'en'

In [2]:
def parseWikipedia(page, lang='en'):
    # construct the API call
    url = f'https://{lang}.wikipedia.org/w/api.php?action=parse&page={page}&format=json'
    r = requests.get(url)
    data = json.loads(r.text)

    return data

## 2. Test it with any Wikipedia entry you like

In [3]:
page = 'Python_(programming_language)'
data_en = parseWikipedia(page)

In [4]:
# print the result
pprint(data_en)

{'parse': {'categories': [{'*': 'Module:Wd_reference_errors',
                           'hidden': '',
                           'sortkey': ''},
                          {'*': 'Webarchive_template_wayback_links',
                           'hidden': '',
                           'sortkey': ''},
                          {'*': 'Wikipedia_semi-protected_pages',
                           'hidden': '',
                           'sortkey': 'Python (programming language)'},
                          {'*': 'Articles_with_short_description',
                           'hidden': '',
                           'sortkey': ''},
                          {'*': 'Short_description_matches_Wikidata',
                           'hidden': '',
                           'sortkey': ''},
                          {'*': 'Use_dmy_dates_from_November_2021',
                           'hidden': '',
                           'sortkey': ''},
                          {'*': 'Articles_containing_potentially_

           'text': {'*': '<div class="mw-parser-output"><p '
                         'class="mw-empty-elt">\n'
                         '</p>\n'
                         '<div class="shortdescription nomobile noexcerpt '
                         'noprint searchaux" '
                         'style="display:none">General-purpose programming '
                         'language</div>\n'
                         '<p class="mw-empty-elt">\n'
                         '</p>\n'
                         '<style '
                         'data-mw-deduplicate="TemplateStyles:r1129693374">.mw-parser-output '
                         '.hlist dl,.mw-parser-output .hlist '
                         'ol,.mw-parser-output .hlist '
                         'ul{margin:0;padding:0}.mw-parser-output .hlist '
                         'dd,.mw-parser-output .hlist dt,.mw-parser-output '
                         '.hlist li{margin:0;display:inline}.mw-parser-output '
                         '.hlist.inline,.

## 3. Get the number of languages of this entry

In [5]:
print('Number of languages:', len(data_en['parse']['langlinks']))

Number of languages: 107


In [6]:
data_en['parse']['langlinks']

[{'lang': 'af',
  'url': 'https://af.wikipedia.org/wiki/Python_(programmeertaal)',
  'langname': 'Afrikaans',
  'autonym': 'Afrikaans',
  '*': 'Python (programmeertaal)'},
 {'lang': 'als',
  'url': 'https://als.wikipedia.org/wiki/Python_(Programmiersprache)',
  'langname': 'Alemannic',
  'autonym': 'Alemannisch',
  '*': 'Python (Programmiersprache)'},
 {'lang': 'ar',
  'url': 'https://ar.wikipedia.org/wiki/%D8%A8%D8%A7%D9%8A%D8%AB%D9%88%D9%86_(%D9%84%D8%BA%D8%A9_%D8%A8%D8%B1%D9%85%D8%AC%D8%A9)',
  'langname': 'Arabic',
  'autonym': 'العربية',
  '*': 'بايثون (لغة برمجة)'},
 {'lang': 'an',
  'url': 'https://an.wikipedia.org/wiki/Python',
  'langname': 'Aragonese',
  'autonym': 'aragonés',
  '*': 'Python'},
 {'lang': 'as',
  'url': 'https://as.wikipedia.org/wiki/%E0%A6%AA%E0%A6%BE%E0%A6%87%E0%A6%A5%E0%A6%A8',
  'langname': 'Assamese',
  'autonym': 'অসমীয়া',
  '*': 'পাইথন'},
 {'lang': 'ast',
  'url': 'https://ast.wikipedia.org/wiki/Python',
  'langname': 'Asturian',
  'autonym': 'asturian

## 4. Find the number of top-level sections in the page of each language

Hint:
1. check ```data['parse']['sections']```
2. ```toclevel == 1``` means it is a top-level section

In [7]:
for langlink in data_en['parse']['langlinks']:
    # get data from API
    data = parseWikipedia(langlink['*'], langlink['lang'])

    # calculate the number of top-level sections
    num_top_sections =  sum([ 1 if section['toclevel'] == 1 else 0 for section in data['parse']['sections'] ])

    # print the result nicely
    print(f"Language: {langlink['langname'].ljust(16)} \t \t Number of top-level sections: {num_top_sections}")

Language: Afrikaans        	 	 Number of top-level sections: 6
Language: Alemannic        	 	 Number of top-level sections: 2
Language: Arabic           	 	 Number of top-level sections: 18
Language: Aragonese        	 	 Number of top-level sections: 4
Language: Assamese         	 	 Number of top-level sections: 2
Language: Asturian         	 	 Number of top-level sections: 14
Language: Azerbaijani      	 	 Number of top-level sections: 4
Language: South Azerbaijani 	 	 Number of top-level sections: 2
Language: Balinese         	 	 Number of top-level sections: 1
Language: Bangla           	 	 Number of top-level sections: 10
Language: Min Nan Chinese  	 	 Number of top-level sections: 3
Language: Belarusian       	 	 Number of top-level sections: 5
Language: Bhojpuri         	 	 Number of top-level sections: 3
Language: Bulgarian        	 	 Number of top-level sections: 11
Language: Bosnian          	 	 Number of top-level sections: 5
Language: Breton           	 	 Number of top-level

In [8]:
data_en['parse']['sections']

[{'toclevel': 1,
  'level': '2',
  'line': 'History',
  'number': '1',
  'index': '1',
  'fromtitle': 'Python_(programming_language)',
  'byteoffset': 15048,
  'anchor': 'History',
  'linkAnchor': 'History'},
 {'toclevel': 1,
  'level': '2',
  'line': 'Design philosophy and features',
  'number': '2',
  'index': '2',
  'fromtitle': 'Python_(programming_language)',
  'byteoffset': 24569,
  'anchor': 'Design_philosophy_and_features',
  'linkAnchor': 'Design_philosophy_and_features'},
 {'toclevel': 1,
  'level': '2',
  'line': 'Syntax and semantics',
  'number': '3',
  'index': '3',
  'fromtitle': 'Python_(programming_language)',
  'byteoffset': 31354,
  'anchor': 'Syntax_and_semantics',
  'linkAnchor': 'Syntax_and_semantics'},
 {'toclevel': 2,
  'level': '3',
  'line': 'Indentation',
  'number': '3.1',
  'index': '4',
  'fromtitle': 'Python_(programming_language)',
  'byteoffset': 31915,
  'anchor': 'Indentation',
  'linkAnchor': 'Indentation'},
 {'toclevel': 2,
  'level': '3',
  'line':