In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval as leval
import seaborn as sns
from tqdm.auto import tqdm
tqdm.pandas()
from matplotlib import pyplot as plt

In [2]:
# Load the per-reference table (one row per crawled reference) stored under text_extraction/.
reference_text_df = pd.read_csv('text_extraction/reference_html_as_sentences_df.csv')

# Load the per-claim table; it shares the `reference_id` key with the table above.
claim_data_df = pd.read_csv('text_extraction/text_reference_claims_df.csv')

In [3]:
# Summarise the raw reference table before any column is removed.
reference_text_df.info()

# Columns removed from the reference table: crawl diagnostics, sampling weights
# and the extracted-text / sentence columns.
dropped_reference_columns = [
    'error_msg',
    'code',
    'content-type',
    'reason',
    'language_crawl',
    'language_crawl_score',
    'sampling_weight_vb',
    'sampling_weight',
    'extracted_sentences',
    'extracted_text',
    'nlp_sentences',
    'nlp_sentences_slide_2',
]
reference_text_df_trim = reference_text_df.drop(columns=dropped_reference_columns)

# Summarise what is left after trimming.
reference_text_df_trim.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 676 entries, 0 to 675
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   reference_id           676 non-null    object 
 1   reference_property_id  676 non-null    object 
 2   reference_datatype     676 non-null    object 
 3   url                    676 non-null    object 
 4   netloc                 676 non-null    object 
 5   netloc_agg             676 non-null    object 
 6   error_msg              676 non-null    object 
 7   code                   676 non-null    int64  
 8   content-type           676 non-null    object 
 9   final_url              676 non-null    object 
 10  reason                 662 non-null    object 
 11  language_crawl         676 non-null    object 
 12  language_crawl_score   676 non-null    float64
 13  sampling_weight_vb     676 non-null    object 
 14  sampling_weight        676 non-null    float64
 15  html  

In [4]:
# Correct some final_urls in the reference_text_df dataframe before joining
import re

# First archive.ph snapshot URL embedded in a page's HTML.
# The dot is escaped so that only the literal host `archive.ph` matches.
ARCHIVE_PH_URL_RE = re.compile(r'https?://archive\.ph/[a-zA-Z0-9]*')

# Replace by archived page if page was behind paywall when parsed.
# The mask is computed once and reused for both sides of the assignment,
# so the (large) HTML strings are scanned for the marker a single time.
is_archived = reference_text_df_trim['html'].map(lambda html: '://archive.ph/' in html)
reference_text_df_trim.loc[is_archived, 'final_url'] = (
    reference_text_df_trim.loc[is_archived, 'html']
    .map(lambda html: ARCHIVE_PH_URL_RE.findall(html)[0])
)

In [5]:
# Summarise the raw claim table before selecting columns.
claim_data_df.info()

# Columns kept from the claim table: identifiers, the claim value, and the
# label / alias / description of the entity, property and object.
kept_claim_columns = [
    'reference_id', 'claim_id', 'rank', 'datatype', 'datavalue',
    'entity_id', 'property_id',
    'entity_label', 'property_label', 'object_label',
    'entity_alias', 'property_alias', 'object_alias',
    'entity_desc', 'property_desc', 'object_desc',
]
claim_data_df_trim = claim_data_df.loc[:, kept_claim_columns].copy()


def _mock_verbalisation(claim_row):
    """Join the entity, property and object labels of one claim row with '$'."""
    labels = (
        claim_row['entity_label'],
        claim_row['property_label'],
        claim_row['object_label'],
    )
    return '$'.join(labels)


# One "entity$property$object" string per claim.
claim_data_df_trim['verb_mock'] = claim_data_df_trim.apply(_mock_verbalisation, axis=1)

claim_data_df_trim.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1572 entries, 0 to 1571
Data columns (total 25 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   reference_id        1572 non-null   object
 1   entity_id           1572 non-null   object
 2   claim_id            1572 non-null   object
 3   rank                1572 non-null   object
 4   property_id         1572 non-null   object
 5   datatype            1572 non-null   object
 6   datavalue           1572 non-null   object
 7   entity_label        1572 non-null   object
 8   entity_label_lan    1572 non-null   object
 9   property_label      1572 non-null   object
 10  property_label_lan  1572 non-null   object
 11  entity_alias        1572 non-null   object
 12  entity_alias_lan    1572 non-null   object
 13  property_alias      1572 non-null   object
 14  property_alias_lan  1572 non-null   object
 15  entity_desc         1572 non-null   object
 16  entity_desc_lan     1572

In [6]:
# Remove P1448 (official name), P1476 (title) and P1889 (different from), as they
# are redundant and non-informative.
# Also look at the dataset creation for other properties that were deleted and
# delete them too.
BAD_PROPERTIES = [
    'P1448',  # official name
    'P1476',  # title
    'P1889',  # different from
    'P31',    # instance of
    'P279',   # subclass of
    'P373',   # Commons category
    'P910',   # topic's main category
    'P7561',  # category for the interior of the item
    'P5008',  # on focus list of Wikimedia project
    'P2670',  # has parts of the class
    'P1740',  # category for films shot at this location
    'P1612',  # Commons Institution page
    'P8989',  # category for the view of the item
    'P2959',  # permanent duplicated item
    'P7867',  # category for maps
    'P935',   # Commons gallery
    'P1472',  # Commons Creator page
    'P8596',  # category for the exterior of the item
    'P5105',  # Deutsche Bahn station category
    'P8933',  # category for the view from the item
    'P642',   # of
    'P3876',  # category for alumni of educational institution
    'P1791',  # category of people buried here
    'P7084',  # related category
    'P1465',  # category for people who died here
    'P1687',  # Wikidata property
    'P6104',  # maintained by WikiProject
    'P4195',  # category for employees of the organization
    'P1792',  # category of associated people
    'P5869',  # model item
    'P1659',  # see also
    'P1464',  # category for people born here
    'P2354',  # has list
    'P1424',  # topic's main template
    'P7782',  # category for ship name
    'P179',   # part of the series
    'P7888',  # merged into
    'P6365',  # member category
    'P8464',  # content partnership category
    'P360',   # is a list of
    'P805',   # statement is subject of
    'P8703',  # entry in abbreviations table
    'P1456',  # list of monuments
    'P1012',  # including
    'P1151',  # topic's main Wikimedia portal
    'P2490',  # page at OSTIS Belarus Wiki
    'P593',   # HomoloGene ID
    'P8744',  # economy of topic
    'P2614',  # World Heritage criteria
    'P2184',  # history of topic
    'P9241',  # demographics of topic
    'P487',   # Unicode character
    'P1754',  # category related to list
    'P2559',  # Wikidata usage instructions
    'P2517',  # category for recipients of this award
    'P971',   # category combines topics
    'P6112',  # category for members of a team
    'P4224',  # category contains
    'P301',   # category's main topic
    'P1753',  # list related to category
    'P1423',  # template has topic
    'P1204',  # Wikimedia portal's main topic
    'P3921',  # Wikidata SPARQL query equivalent
    'P1963',  # properties for this type
    'P5125',  # Wikimedia outline
    'P3176',  # uses property
    'P8952',  # inappropriate property for this type
    'P2306',  # property
    'P5193',  # Wikidata property example for forms
    'P5977',  # Wikidata property example for senses
    'P1748',  # NCI Thesaurus ID
    'P1692',  # ICD-9-CM
    'P248',   # stated in
]

# Rows with index labels 473 and 593 have no English object label or aliases.
# They are removed by label and the index is then rebuilt, so re-running this
# cell on its own would remove whatever rows carry those labels afterwards.
rows_without_english_object = [473, 593]
claim_data_df_trim = (
    claim_data_df_trim
    .drop(index=rows_without_english_object)
    .reset_index(drop=True)
)

# Keep only the claims whose property is not in the exclusion list.
has_bad_property = claim_data_df_trim['property_id'].isin(BAD_PROPERTIES)
claim_data_df_trim_badpropdrop = claim_data_df_trim[~has_bad_property]

# Report the share and the absolute number of claims removed by the filter.
print('Percentage [Number] of claims dropped due to bad properties')
print(
    f'{100 - 100*claim_data_df_trim_badpropdrop.shape[0]/claim_data_df_trim.shape[0]}',
    f'[{claim_data_df_trim.shape[0] - claim_data_df_trim_badpropdrop.shape[0]}]'
)

Percentage [Number] of claims dropped due to bad properties
15.031847133757964 [236]


In [7]:
# Get URLs from the references df: attach each remaining claim to its reference
# row through the shared `reference_id` key (default inner join).
wtr = reference_text_df_trim.merge(claim_data_df_trim_badpropdrop, on='reference_id')
wtr

Unnamed: 0,reference_id,reference_property_id,reference_datatype,url,netloc,netloc_agg,final_url,html,claim_id,rank,...,entity_label,property_label,object_label,entity_alias,property_alias,object_alias,entity_desc,property_desc,object_desc,verb_mock
0,390d6c6e68a32e11f8d7b0883cda0557db529fe6,P4656,url,https://en.wikipedia.org/w/index.php?title=G._...,en.wikipedia.org,en.wikipedia.org,https://en.wikipedia.org/w/index.php?title=G._...,"<html class=""client-js ve-available"" lang=""en""...",Q5512528$81E8AD02-28AF-4AE3-8ACD-047C30B40B01,normal,...,G. V. Raja,place of death,Kullu Valley,"['G.V. Raja', 'ജി.വി. രാജ']","['deathplace', 'died in', 'death place', 'POD'...","['Kulû', 'Vallee de Kulu', 'Vallée de kulu']",sports official,most specific known (e.g. city instead of coun...,"valley in Himachal Pradesh, India",G. V. Raja$place of death$Kullu Valley
1,49b9aec8e10815611ff0379a34d5fd7c3830566e,P4656,url,https://en.wikipedia.org/w/index.php?title=Seb...,en.wikipedia.org,en.wikipedia.org,https://en.wikipedia.org/w/index.php?title=Seb...,"<html class=""client-js ve-available"" lang=""en""...",Q12149940$C9FE5F0C-78FD-4ECE-B1C2-16A763B8ED4E,normal,...,Sebastian Sabol,religion,Greek catholic church,['Sebastian Stepan Sabol'],"['religious affiliation', 'faith', 'life stanc...","['Iglesia Católica Griega', 'Iglesia catolica ...","Ukrainian Basilian priest, poet and writer (19...","religion of a person, organization or religiou...",group of Eastern Catholic Churches following t...,Sebastian Sabol$religion$Greek catholic church
2,ab3e9ada7246257ffbfb86fa90a54f25e45a704e,P4656,url,https://en.wikipedia.org/w/index.php?title=Ave...,en.wikipedia.org,en.wikipedia.org,https://en.wikipedia.org/w/index.php?title=Ave...,"<html class=""client-js ve-available"" lang=""en""...",Q583556$929DCD2F-3B4A-41B5-89E7-EEC549077834,normal,...,Average White Band,location of formation,Dundee,['Average White Band'],"['originates from', 'comes from', 'place of fo...",['City of Dundee'],Scottish R&B band,location where a group or organization was formed,city in Scotland,Average White Band$location of formation$Dundee
3,ab3e9ada7246257ffbfb86fa90a54f25e45a704e,P4656,url,https://en.wikipedia.org/w/index.php?title=Ave...,en.wikipedia.org,en.wikipedia.org,https://en.wikipedia.org/w/index.php?title=Ave...,"<html class=""client-js ve-available"" lang=""en""...",Q583556$2AD75C92-953E-47CB-93A6-D082674F4231,normal,...,Average White Band,work period (start),1972,['Average White Band'],"['floruit (start)', 'floruit start', 'flourish...",no-alias,Scottish R&B band,start of period during which a person or group...,no-desc,Average White Band$work period (start)$1972
4,76b04346ad57869d9e5ae1007ba8343d708ab6f9,P4656,url,https://en.wikipedia.org/w/index.php?title=Yut...,en.wikipedia.org,en.wikipedia.org,https://en.wikipedia.org/w/index.php?title=Yut...,"<html class=""client-js ve-available"" lang=""en""...",Q7586053$AD344012-A522-4A8A-AF64-16F859E2EA9A,normal,...,Yutaka Higuchi,work period (start),1983,['Ютака Хигути'],"['floruit (start)', 'floruit start', 'flourish...",no-alias,Japanese musician,start of period during which a person or group...,no-desc,Yutaka Higuchi$work period (start)$1983
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1329,c714ca621d1fe1ca35f20e01a72587d23bdd8e1e,P854,url,https://indiancine.ma/BTW,indiancine.ma,indiancine.ma,https://indiancine.ma/BTW,"<html><head>\n <meta charset=""utf-8"">\n...",Q58485125$0B087946-63A0-4D69-BAEA-AD52DFBE2C6F,normal,...,Navjeevan,original language of film or TV show,Hindi,no-alias,"['original language', 'language of the origina...","['Hindi language', 'Modern Standard Hindi', 'hi']",1935 film,language in which a film or a performance work...,Indo-Aryan language,Navjeevan$original language of film or TV show...
1330,52968f905e88f2484e3cb43e6ab4e76f120bc3c3,P854,url,https://indiancine.ma/BTD,indiancine.ma,indiancine.ma,https://indiancine.ma/BTD,"<html><head>\n <meta charset=""utf-8"">\n...",Q58485081$AF144153-1EF3-4494-BBCC-90742E594482,normal,...,Mantra Shakti,original language of film or TV show,Bengali,no-alias,"['original language', 'language of the origina...","['Bengali language', 'Bangla', 'bn', 'ben']",1935 film,language in which a film or a performance work...,Indo-Aryan language mainly spoken in Banglades...,Mantra Shakti$original language of film or TV ...
1331,8bb93e520f7ca8208f24078434b8f5f2ba8b520b,P854,url,https://indiancine.ma/BKF,indiancine.ma,indiancine.ma,https://indiancine.ma/BKF,"<html><head>\n <meta charset=""utf-8"">\n...",Q58484448$9611AF89-1BD2-4D49-9CBB-E3EC52C7765D,normal,...,Gul Sanobar,original language of film or TV show,Hindi,no-alias,"['original language', 'language of the origina...","['Hindi language', 'Modern Standard Hindi', 'hi']",1934 film,language in which a film or a performance work...,Indo-Aryan language,Gul Sanobar$original language of film or TV sh...
1332,8bb93e520f7ca8208f24078434b8f5f2ba8b520b,P854,url,https://indiancine.ma/BKF,indiancine.ma,indiancine.ma,https://indiancine.ma/BKF,"<html><head>\n <meta charset=""utf-8"">\n...",Q58484448$E03C8ED8-0593-4D47-B002-6371900A238E,normal,...,Gul Sanobar,color,black and white,no-alias,"['colour', 'colors', 'colours', 'has color', '...","['B/W', 'B&W', 'colorless cinema', 'black-and-...",1934 film,color of subject,monochrome form in visual arts,Gul Sanobar$color$black and white


In [8]:
# Shape of the array of distinct reference ids present in the merged table.
wtr['reference_id'].unique().shape

(651,)

In [9]:
# Remove duplicates of reference and verbalisation, as duplicates arise from
# qualifier dependency. The first row of each (verb_mock, final_url) pair is kept.
dedup_keys = ['verb_mock', 'final_url']
wtr_duplidrop = wtr.drop_duplicates(subset=dedup_keys, keep='first')

# Row with index label 806 is removed by hand: verb will be equal.
wtr_duplidrop = wtr_duplidrop.drop(index=806)

# Report the share and the absolute number of rows removed in this step.
print('Percentage [Number] of claims dropped due to duplicated verbalisation and url pair')
print(
    f'{100 - 100*wtr_duplidrop.shape[0]/wtr.shape[0]}',
    f'[{wtr.shape[0] - wtr_duplidrop.shape[0]}]'
)

Percentage [Number] of claims dropped due to duplicated verbalisation and url pair
2.0989505247376314 [28]


In [10]:
# Remove the three cases in archinform.net written in German
german_archinform_ids = ['19632', '11996', '45859']


def _is_german_archinform(final_url):
    """True when the URL is on www.archinform.net and contains one of the listed ids."""
    if 'www.archinform.net' not in final_url:
        return False
    return any(page_id in final_url for page_id in german_archinform_ids)


wtr_duplidrop = wtr_duplidrop[~wtr_duplidrop['final_url'].map(_is_german_archinform)]

In [11]:
# Rebuild a contiguous 0..n-1 index, discarding the old labels left over from the row removals.
wtr_duplidrop = wtr_duplidrop.reset_index(drop=True)

In [12]:
# Display the de-duplicated table after the index has been rebuilt.
wtr_duplidrop

Unnamed: 0,reference_id,reference_property_id,reference_datatype,url,netloc,netloc_agg,final_url,html,claim_id,rank,...,entity_label,property_label,object_label,entity_alias,property_alias,object_alias,entity_desc,property_desc,object_desc,verb_mock
0,390d6c6e68a32e11f8d7b0883cda0557db529fe6,P4656,url,https://en.wikipedia.org/w/index.php?title=G._...,en.wikipedia.org,en.wikipedia.org,https://en.wikipedia.org/w/index.php?title=G._...,"<html class=""client-js ve-available"" lang=""en""...",Q5512528$81E8AD02-28AF-4AE3-8ACD-047C30B40B01,normal,...,G. V. Raja,place of death,Kullu Valley,"['G.V. Raja', 'ജി.വി. രാജ']","['deathplace', 'died in', 'death place', 'POD'...","['Kulû', 'Vallee de Kulu', 'Vallée de kulu']",sports official,most specific known (e.g. city instead of coun...,"valley in Himachal Pradesh, India",G. V. Raja$place of death$Kullu Valley
1,49b9aec8e10815611ff0379a34d5fd7c3830566e,P4656,url,https://en.wikipedia.org/w/index.php?title=Seb...,en.wikipedia.org,en.wikipedia.org,https://en.wikipedia.org/w/index.php?title=Seb...,"<html class=""client-js ve-available"" lang=""en""...",Q12149940$C9FE5F0C-78FD-4ECE-B1C2-16A763B8ED4E,normal,...,Sebastian Sabol,religion,Greek catholic church,['Sebastian Stepan Sabol'],"['religious affiliation', 'faith', 'life stanc...","['Iglesia Católica Griega', 'Iglesia catolica ...","Ukrainian Basilian priest, poet and writer (19...","religion of a person, organization or religiou...",group of Eastern Catholic Churches following t...,Sebastian Sabol$religion$Greek catholic church
2,ab3e9ada7246257ffbfb86fa90a54f25e45a704e,P4656,url,https://en.wikipedia.org/w/index.php?title=Ave...,en.wikipedia.org,en.wikipedia.org,https://en.wikipedia.org/w/index.php?title=Ave...,"<html class=""client-js ve-available"" lang=""en""...",Q583556$929DCD2F-3B4A-41B5-89E7-EEC549077834,normal,...,Average White Band,location of formation,Dundee,['Average White Band'],"['originates from', 'comes from', 'place of fo...",['City of Dundee'],Scottish R&B band,location where a group or organization was formed,city in Scotland,Average White Band$location of formation$Dundee
3,ab3e9ada7246257ffbfb86fa90a54f25e45a704e,P4656,url,https://en.wikipedia.org/w/index.php?title=Ave...,en.wikipedia.org,en.wikipedia.org,https://en.wikipedia.org/w/index.php?title=Ave...,"<html class=""client-js ve-available"" lang=""en""...",Q583556$2AD75C92-953E-47CB-93A6-D082674F4231,normal,...,Average White Band,work period (start),1972,['Average White Band'],"['floruit (start)', 'floruit start', 'flourish...",no-alias,Scottish R&B band,start of period during which a person or group...,no-desc,Average White Band$work period (start)$1972
4,76b04346ad57869d9e5ae1007ba8343d708ab6f9,P4656,url,https://en.wikipedia.org/w/index.php?title=Yut...,en.wikipedia.org,en.wikipedia.org,https://en.wikipedia.org/w/index.php?title=Yut...,"<html class=""client-js ve-available"" lang=""en""...",Q7586053$AD344012-A522-4A8A-AF64-16F859E2EA9A,normal,...,Yutaka Higuchi,work period (start),1983,['Ютака Хигути'],"['floruit (start)', 'floruit start', 'flourish...",no-alias,Japanese musician,start of period during which a person or group...,no-desc,Yutaka Higuchi$work period (start)$1983
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1297,c714ca621d1fe1ca35f20e01a72587d23bdd8e1e,P854,url,https://indiancine.ma/BTW,indiancine.ma,indiancine.ma,https://indiancine.ma/BTW,"<html><head>\n <meta charset=""utf-8"">\n...",Q58485125$0B087946-63A0-4D69-BAEA-AD52DFBE2C6F,normal,...,Navjeevan,original language of film or TV show,Hindi,no-alias,"['original language', 'language of the origina...","['Hindi language', 'Modern Standard Hindi', 'hi']",1935 film,language in which a film or a performance work...,Indo-Aryan language,Navjeevan$original language of film or TV show...
1298,52968f905e88f2484e3cb43e6ab4e76f120bc3c3,P854,url,https://indiancine.ma/BTD,indiancine.ma,indiancine.ma,https://indiancine.ma/BTD,"<html><head>\n <meta charset=""utf-8"">\n...",Q58485081$AF144153-1EF3-4494-BBCC-90742E594482,normal,...,Mantra Shakti,original language of film or TV show,Bengali,no-alias,"['original language', 'language of the origina...","['Bengali language', 'Bangla', 'bn', 'ben']",1935 film,language in which a film or a performance work...,Indo-Aryan language mainly spoken in Banglades...,Mantra Shakti$original language of film or TV ...
1299,8bb93e520f7ca8208f24078434b8f5f2ba8b520b,P854,url,https://indiancine.ma/BKF,indiancine.ma,indiancine.ma,https://indiancine.ma/BKF,"<html><head>\n <meta charset=""utf-8"">\n...",Q58484448$9611AF89-1BD2-4D49-9CBB-E3EC52C7765D,normal,...,Gul Sanobar,original language of film or TV show,Hindi,no-alias,"['original language', 'language of the origina...","['Hindi language', 'Modern Standard Hindi', 'hi']",1934 film,language in which a film or a performance work...,Indo-Aryan language,Gul Sanobar$original language of film or TV sh...
1300,8bb93e520f7ca8208f24078434b8f5f2ba8b520b,P854,url,https://indiancine.ma/BKF,indiancine.ma,indiancine.ma,https://indiancine.ma/BKF,"<html><head>\n <meta charset=""utf-8"">\n...",Q58484448$E03C8ED8-0593-4D47-B002-6371900A238E,normal,...,Gul Sanobar,color,black and white,no-alias,"['colour', 'colors', 'colours', 'has color', '...","['B/W', 'B&W', 'colorless cinema', 'black-and-...",1934 film,color of subject,monochrome form in visual arts,Gul Sanobar$color$black and white


In [13]:
# Persist the de-duplicated table without writing the row index.
# `index` is documented as a bool, so pass an explicit False rather than
# relying on None being falsy.
wtr_duplidrop.to_csv('WTR_non_filtered_non_annotated.csv', index=False)

In [14]:
# Keep a single claim per reference, chosen reproducibly at random.
wtr_trim = (
    wtr_duplidrop
    .sample(frac=1, random_state=42)   # shuffle every row with a fixed seed
    .drop_duplicates('reference_id')   # keep the first shuffled row of each reference
    .sort_index()                      # restore the pre-shuffle row order
    .reset_index(drop=True)            # rebuild a contiguous index
)
wtr_trim.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 648 entries, 0 to 647
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   reference_id           648 non-null    object
 1   reference_property_id  648 non-null    object
 2   reference_datatype     648 non-null    object
 3   url                    648 non-null    object
 4   netloc                 648 non-null    object
 5   netloc_agg             648 non-null    object
 6   final_url              648 non-null    object
 7   html                   648 non-null    object
 8   claim_id               648 non-null    object
 9   rank                   648 non-null    object
 10  datatype               648 non-null    object
 11  datavalue              648 non-null    object
 12  entity_id              648 non-null    object
 13  property_id            648 non-null    object
 14  entity_label           648 non-null    object
 15  property_label         

In [15]:
# Aggregated network locations to exclude, each with the reason it is listed.
bad_netloc_aggs = [
    'witches.shca.ed.ac.uk',        # single infobox
    'en.isabart.org',               # single infobox
    'bechdeltest.com',              # has API
    'npg.si.edu',                   # image and single infobox
    'www.guidetopharmacology.org',  # single infobox
    'letterboxd.com',               # single infobox and has API
    'www.discogs.com',              # single infobox
    'vocab.getty.edu',              # has JSON dumps
    'www.isfdb.org',                # single infobox
    'www.npg.org.uk',               # set of infoboxes
    'art.nationalgalleries.org',    # image and single infobox
    'www.tate.org.uk',              # image and single infobox
    'www.getty.edu',                # has JSON dumps
    'memory-beta.fandom.com',       # the portion with the information on claims is just a long list of names and links
    'www.disease-ontology.org',     # a single infobox
    'artgallery.yale.edu',          # image and a single infobox
    'www.imdb.com',                 # author pages: a portrait, an infobox, and lists of movies
    'muckrack.com',                 # a very short infobox
    'live.dbpedia.org',             # it's dbpedia, so mainly a huge infobox, and there are dumps
    'dbpedia.org',                  # same as above
]
# Keep only the rows whose aggregated network location is not in the exclusion list.
is_bad_netloc = wtr_trim['netloc_agg'].isin(bad_netloc_aggs)
wtr_trim_good = wtr_trim.loc[~is_bad_netloc].reset_index(drop=True)

wtr_trim_good.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 409 entries, 0 to 408
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   reference_id           409 non-null    object
 1   reference_property_id  409 non-null    object
 2   reference_datatype     409 non-null    object
 3   url                    409 non-null    object
 4   netloc                 409 non-null    object
 5   netloc_agg             409 non-null    object
 6   final_url              409 non-null    object
 7   html                   409 non-null    object
 8   claim_id               409 non-null    object
 9   rank                   409 non-null    object
 10  datatype               409 non-null    object
 11  datavalue              409 non-null    object
 12  entity_id              409 non-null    object
 13  property_id            409 non-null    object
 14  entity_label           409 non-null    object
 15  property_label         

In [16]:
# Display the table that remains after excluding the listed network locations.
wtr_trim_good

Unnamed: 0,reference_id,reference_property_id,reference_datatype,url,netloc,netloc_agg,final_url,html,claim_id,rank,...,entity_label,property_label,object_label,entity_alias,property_alias,object_alias,entity_desc,property_desc,object_desc,verb_mock
0,390d6c6e68a32e11f8d7b0883cda0557db529fe6,P4656,url,https://en.wikipedia.org/w/index.php?title=G._...,en.wikipedia.org,en.wikipedia.org,https://en.wikipedia.org/w/index.php?title=G._...,"<html class=""client-js ve-available"" lang=""en""...",Q5512528$81E8AD02-28AF-4AE3-8ACD-047C30B40B01,normal,...,G. V. Raja,place of death,Kullu Valley,"['G.V. Raja', 'ജി.വി. രാജ']","['deathplace', 'died in', 'death place', 'POD'...","['Kulû', 'Vallee de Kulu', 'Vallée de kulu']",sports official,most specific known (e.g. city instead of coun...,"valley in Himachal Pradesh, India",G. V. Raja$place of death$Kullu Valley
1,49b9aec8e10815611ff0379a34d5fd7c3830566e,P4656,url,https://en.wikipedia.org/w/index.php?title=Seb...,en.wikipedia.org,en.wikipedia.org,https://en.wikipedia.org/w/index.php?title=Seb...,"<html class=""client-js ve-available"" lang=""en""...",Q12149940$C9FE5F0C-78FD-4ECE-B1C2-16A763B8ED4E,normal,...,Sebastian Sabol,religion,Greek catholic church,['Sebastian Stepan Sabol'],"['religious affiliation', 'faith', 'life stanc...","['Iglesia Católica Griega', 'Iglesia catolica ...","Ukrainian Basilian priest, poet and writer (19...","religion of a person, organization or religiou...",group of Eastern Catholic Churches following t...,Sebastian Sabol$religion$Greek catholic church
2,ab3e9ada7246257ffbfb86fa90a54f25e45a704e,P4656,url,https://en.wikipedia.org/w/index.php?title=Ave...,en.wikipedia.org,en.wikipedia.org,https://en.wikipedia.org/w/index.php?title=Ave...,"<html class=""client-js ve-available"" lang=""en""...",Q583556$2AD75C92-953E-47CB-93A6-D082674F4231,normal,...,Average White Band,work period (start),1972,['Average White Band'],"['floruit (start)', 'floruit start', 'flourish...",no-alias,Scottish R&B band,start of period during which a person or group...,no-desc,Average White Band$work period (start)$1972
3,76b04346ad57869d9e5ae1007ba8343d708ab6f9,P4656,url,https://en.wikipedia.org/w/index.php?title=Yut...,en.wikipedia.org,en.wikipedia.org,https://en.wikipedia.org/w/index.php?title=Yut...,"<html class=""client-js ve-available"" lang=""en""...",Q7586053$AD344012-A522-4A8A-AF64-16F859E2EA9A,normal,...,Yutaka Higuchi,work period (start),1983,['Ютака Хигути'],"['floruit (start)', 'floruit start', 'flourish...",no-alias,Japanese musician,start of period during which a person or group...,no-desc,Yutaka Higuchi$work period (start)$1983
4,1ee71b39caf6df395c64b436fea4895692812d38,P4656,url,https://en.wikipedia.org/w/index.php?title=Dol...,en.wikipedia.org,en.wikipedia.org,https://en.wikipedia.org/w/index.php?title=Dol...,"<html class=""client-js ve-available"" lang=""en""...",Q5289513$B0532490-8FEC-4744-9D74-61238B633898,normal,...,Dolores Delirio,work period (start),1994,no-alias,"['floruit (start)', 'floruit start', 'flourish...",no-alias,grupo musical peruano,start of period during which a person or group...,no-desc,Dolores Delirio$work period (start)$1994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404,e004706cba4821386dc31a86faeb680e8fff4ae7,P854,url,https://indiancine.ma/ATB,indiancine.ma,indiancine.ma,https://indiancine.ma/ATB,"<html><head>\n <meta charset=""utf-8"">\n...",Q58480207$B8B66EE1-CD20-4426-846C-4B37DC5A55A4,normal,...,Laila Majnu,color,black and white,no-alias,"['colour', 'colors', 'colours', 'has color', '...","['B/W', 'B&W', 'colorless cinema', 'black-and-...",1931 film by J. J. Madan,color of subject,monochrome form in visual arts,Laila Majnu$color$black and white
405,c714ca621d1fe1ca35f20e01a72587d23bdd8e1e,P854,url,https://indiancine.ma/BTW,indiancine.ma,indiancine.ma,https://indiancine.ma/BTW,"<html><head>\n <meta charset=""utf-8"">\n...",Q58485125$0B087946-63A0-4D69-BAEA-AD52DFBE2C6F,normal,...,Navjeevan,original language of film or TV show,Hindi,no-alias,"['original language', 'language of the origina...","['Hindi language', 'Modern Standard Hindi', 'hi']",1935 film,language in which a film or a performance work...,Indo-Aryan language,Navjeevan$original language of film or TV show...
406,52968f905e88f2484e3cb43e6ab4e76f120bc3c3,P854,url,https://indiancine.ma/BTD,indiancine.ma,indiancine.ma,https://indiancine.ma/BTD,"<html><head>\n <meta charset=""utf-8"">\n...",Q58485081$AF144153-1EF3-4494-BBCC-90742E594482,normal,...,Mantra Shakti,original language of film or TV show,Bengali,no-alias,"['original language', 'language of the origina...","['Bengali language', 'Bangla', 'bn', 'ben']",1935 film,language in which a film or a performance work...,Indo-Aryan language mainly spoken in Banglades...,Mantra Shakti$original language of film or TV ...
407,8bb93e520f7ca8208f24078434b8f5f2ba8b520b,P854,url,https://indiancine.ma/BKF,indiancine.ma,indiancine.ma,https://indiancine.ma/BKF,"<html><head>\n <meta charset=""utf-8"">\n...",Q58484448$9611AF89-1BD2-4D49-9CBB-E3EC52C7765D,normal,...,Gul Sanobar,original language of film or TV show,Hindi,no-alias,"['original language', 'language of the origina...","['Hindi language', 'Modern Standard Hindi', 'hi']",1934 film,language in which a film or a performance work...,Indo-Aryan language,Gul Sanobar$original language of film or TV sh...
