# Get data from Wikidata and Wikipedia

In [1]:
import sys
# Add the directory two levels up (the one containing the `bechdelai` package)
# to the import path; this must run before the `bechdelai` import below.
sys.path.append("../../")

import bechdelai.data.wikipedia as wiki
import outputformat as ouf
import importlib
# Reload the module so edits to wikipedia.py are picked up without
# restarting the kernel.
importlib.reload(wiki)



<module 'bechdelai.data.wikipedia' from '../../bechdelai/data/wikipedia.py'>

# Get sections from specific Wikipedia page

It is good practice to always append '(release_year film)' to the movie title. This helps the query resolve to the right page.

In [2]:
# Request only the 'Plot' and 'Cast' sections of the film's page.
query = "The Batman (2022 film)"
sections = wiki.get_section_text(query, ['Plot', 'Cast'], verbose=True)

# Print every retrieved section under its own boxed title.
for section_name, section_text in sections.items():
    ouf.boxtitle(section_name, style="line")
    print(section_text)

 Page sections
╭─────────────
├ Plot.........................: 1
├ Cast.........................: 2
├ Production...................: 3
├ Development..................: 4
├ Ben_Affleck..................: 5
├ Matt_Reeves..................: 6
├ Writing......................: 7
├ Casting......................: 8
├ Design.......................: 9
├ Sets_and_props...............: 10
├ Costumes.....................: 11
├ Filming......................: 12
├ COVID-19_pandemic............: 13
├ Editing......................: 14
├ Visual_effects...............: 15
├ Music........................: 16
├ Marketing....................: 17
├ Release......................: 18
├ Theatrical...................: 19
├ Home_media...................: 20
├ Reception....................: 21
├ Box_office...................: 22
├ Critical_response............: 23
├ Accolades....................: 24
├ Thematic_analysis............: 25
├ Class_conflict_and_inequality: 26
├ Depiction_of_Batman..........: 27
├ Tie-i

# Ambiguous query
If the query is ambiguous, the request is directed to a disambiguation page.

In [3]:
# Same request as before, but with a title that has no '(year film)' suffix.
query = "The Batman"
sections = wiki.get_section_text(query, ['Plot', 'Cast'], verbose=True)

# Collect the links of the page reached by the query and list them as
# candidate, more specific queries.
query_options = wiki.get_links(query)
ouf.showlist(
    query_options,
    title="Disambiguation pages",
    style="line",
)

 Page sections
╭─────────────
├ Art,_entertainment,_and_media: 1
├ Characters...................: 2
├ Print_media..................: 3
├ Films........................: 4
├ Television...................: 5
├ Video_games..................: 6
├ Music........................: 7
├ Toy_line.....................: 8
├ Places.......................: 9
├ Australia....................: 10
├ Iran.........................: 11
├ Turkey.......................: 12
├ People.......................: 13
├ As_nickname_or_stage_name....: 14
├ Other_uses...................: 15
╰ See_also.....................: 16

KeyError: Plot is not a section in the page
KeyError: Cast is not a section in the page
 Disambiguation pages
╭────────────────────
├ Antonis Fotsis
├ B.A.T.M.A.N.
├ Bathmen
├ Batman
├ Batman! (Jan and Dean song)
├ Batman's Treaty
├ Batman, Iran
├ Batman, Turkey
├ Batman: The Animated Series
├ Batman: The Brave and the Bold
├ Batman: The Enemy Within
├ Batman: The Ride
├ Batman: The Telltale Series


# Non-existent pages
A `ValueError` is raised when no page can be found for the given query.

In [4]:
# A query that matches no Wikipedia page.
query = "pjfpk,c"
try:
    sections = wiki.get_section_text(query, ['Plot', 'Cast'], verbose=True)
except ValueError as err:
    # The error is what this cell demonstrates: display it instead of letting
    # it abort a "Restart & Run All" of the notebook.
    print(f"ValueError: {err}")

ValueError: This query does not correspond to a Wikipedia page.

In [5]:
# Looking up links for a query with no matching page raises ValueError as
# well; display the error rather than halting execution of the later cells.
try:
    query_options = wiki.get_links(query)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: This query does not correspond to a Wikipedia page.

# Get Wikidata
1. get QID
2. retrieve json
3. get properties of interest

In [6]:
# One query can match several Wikidata entities, so it may be necessary to
# pick the right QID among the candidates returned here.
query = "Catwoman"
qids = wiki.get_qid_from_query(query, language="en", verbose=True)


Catwoman (Q158952): fictional character associated with DC Comics' Batman franchise
Catwoman (Q115760): 2004 film directed by Pitof
Catwoman (Q5054224): 2004 video game
Catwoman (Q342478): Wikimedia disambiguation page
Holly Robinson (Q5882236): fictional character in DC universe
Patience Phillips (Q52199465): main protagonist of the 2004 film Catwoman
Catwoman (Q70991896): Anti-heroine in the film Batman Returns


In [7]:
# Two ways to fetch the Wikidata JSON: from an explicit QID, or directly from
# the query, which corresponds to using the first of the related QIDs.
# `data` ends up holding the result of the second call.
data = wiki.get_json_from_qid(qids[0])
data = wiki.get_json_from_query(query)
data

{'entities': {'Q158952': {'pageid': 159911,
   'ns': 0,
   'title': 'Q158952',
   'lastrevid': 1613486296,
   'modified': '2022-04-07T16:38:06Z',
   'type': 'item',
   'id': 'Q158952',
   'labels': {'zh-hans': {'language': 'zh-hans', 'value': '猫女'},
    'zh-hant': {'language': 'zh-hant', 'value': '貓女'},
    'zh-hk': {'language': 'zh-hk', 'value': '貓女'},
    'zh-cn': {'language': 'zh-cn', 'value': '猫女'},
    'zh-sg': {'language': 'zh-sg', 'value': '猫女'},
    'zh-tw': {'language': 'zh-tw', 'value': '貓女'},
    'pl': {'language': 'pl', 'value': 'Catwoman'},
    'fr': {'language': 'fr', 'value': 'Catwoman'},
    'he': {'language': 'he', 'value': 'קאטוומן'},
    'ko': {'language': 'ko', 'value': '캣우먼'},
    'es': {'language': 'es', 'value': 'Catwoman'},
    'ta': {'language': 'ta', 'value': 'கேட்வுமன்'},
    'hu': {'language': 'hu', 'value': 'Macskanő'},
    'it': {'language': 'it', 'value': 'Catwoman'},
    'de': {'language': 'de', 'value': 'Catwoman'},
    'ja': {'language': 'ja', 'value':

In [8]:
# 'entities' is keyed by QID; show the claims of the first entity.
keys = list(data['entities'])
data['entities'][keys[0]]['claims']

{'P373': [{'mainsnak': {'snaktype': 'value',
    'property': 'P373',
    'hash': '59c134c615ad9a5777bb19947f294cd32a2608ab',
    'datavalue': {'value': 'Catwoman', 'type': 'string'},
    'datatype': 'string'},
   'type': 'statement',
   'id': 'Q158952$a2448213-4537-0872-8f3e-fad85bca2109',
   'rank': 'normal'}],
 'P345': [{'mainsnak': {'snaktype': 'value',
    'property': 'P345',
    'hash': 'f23a044ebdbf18957388ceb2ef3b388afe33de77',
    'datavalue': {'value': 'ch0000184', 'type': 'string'},
    'datatype': 'external-id'},
   'type': 'statement',
   'qualifiers': {'P2241': [{'snaktype': 'value',
      'property': 'P2241',
      'hash': 'b5a95dd10e545d7e57bfcc86e5f3290b6c1231d3',
      'datavalue': {'value': {'entity-type': 'item',
        'numeric-id': 44374960,
        'id': 'Q44374960'},
       'type': 'wikibase-entityid'},
      'datatype': 'wikibase-item'}]},
   'qualifiers-order': ['P2241'],
   'id': 'Q158952$f1a5ce26-40c4-d342-c11f-8e36ce074bc9',
   'rank': 'deprecated'}],
 'P31

Examples of properties of interest:
- P21: sex or gender
- P27: country of citizenship
- P19: place of birth
- P103: native language
- P1412: languages spoken, written or signed
- P106: occupation
- P345: IMDb ID

In [10]:
# Wikidata property IDs to extract from the entity JSON.
properties = [
    'P21',    # sex or gender
    'P27',    # country of citizenship
    'P19',    # place of birth
    'P103',   # native language
    'P1412',  # languages spoken, written or signed
    'P106',   # occupation
    'P345',   # IMDb ID
]
wiki.dataframe_from_json(data, properties)

Unnamed: 0,property,value
0,sex or gender,[female]
1,country of citizenship,[United States of America]
2,place of birth,[Gotham City]
3,native language,[American English]
4,"languages spoken, written or signed",[American English]
5,occupation,"[supervillain, thief, superhero, fictional vig..."
6,IMDb ID,[ch0000184]


In [11]:
# Same properties, requested with language="fr".
wiki.dataframe_from_json(data, properties, language="fr")

Unnamed: 0,property,value
0,sexe ou genre,[féminin]
1,pays de citoyenneté,[États-Unis]
2,lieu de naissance,[Gotham City]
3,langue maternelle,[anglais américain]
4,"langues parlées, écrites ou signées",[anglais américain]
5,occupation,"[super-vilain, voleur, super-héros, justicier ..."
6,identifiant Internet Movie Database,[ch0000184]
