In [2]:
import json
import os
import re
import heapq
from collections import defaultdict
from collections import Counter
import pandas as pd
import urllib
from pathlib import Path
import pickle

## Step 1 Wikipedia Data Extraction

## Step 2 Dbpedia Sparql Querying

## Step 3 Dbpedia Candiate Class Extraction from Sparql Data

### Use dbpedia_candidate_class_extraction.py 
### input: info_dbpedia_v3.txt
### output: candidate_classes.txt 

## Step 4 Candidate Classes =>  Hierarchy Enrichment, UNERv1 Mapping

In [3]:
!pwd

/mnt/data/group3/wiki-play/wikiquery_v2


In [4]:
class config:
    DATASET_PATH="/mnt/data/group3/wiki-play/wikidbquery"
    WIKIPEDIA_PATH ="/mnt/data/group3/wiki-play/data/wiki"
    OUTPUT_PATH = "/mnt/data/group3/wiki-play/dataset"

In [5]:
# read the mapping file
with open('Dbpedia_UNER_v1.json') as json_file:
    UNER_Dbpedia_v1 = json.load(json_file)
    
# {
# "owl:Thing":"",
# "Activity":"",
# "Game":"Name-Product-Product_Other",
# "BoardGame":"",
# "CardGame":"",
# "Sales":"",
# "Sport":"Name-Product-Doctrine_Method-Sport",
# "Athletics":"",
# "TeamSport":"",
# "Agent":""
# }


In [6]:
UNER_Dbpedia_v1["Person"]

'Name-Person-Name'

In [7]:
with open('dbpedia_hierarchy_priority.json') as priority_file:
    Dbpedia_priority = json.load(priority_file)
    
# {
# "owl:Thing":1,
# "Activity":2,
# "Game":3,
# "BoardGame":4,
# "CardGame":4,
# "Sales":3,
# "Sport":3,
# "Athletics":4,
# "TeamSport":4,
# "Agent":2,
# "Deity":3,
# "Employer":3,
# "Family":3,
# }    


In [8]:
# read the sample file having entities and the candidate classes

# Define a mapping of wikititle with its UNER class
wikiTitle_unerClass_dict = {}

with open(os.path.join(config.DATASET_PATH,"candidate_classes.txt")) as input_file:
    for index, line in enumerate(input_file):
        line = line.strip()
        if line:
            wiki_title, *classes= line.split(",")
#             if wiki_title =="William Golding":print(classes)
            priority_queue = []
            priority_set = set()
            # proces the classes
            for cls in classes:
                # take the last part of the url => usually entity is found as last
                class_name = (cls.split("/")[-1])
                # check if the class in Dbpedia_v1 and if it is check its corresponding mapping exists, 
                # it could be blank and we ignore the blank classes
                if class_name in UNER_Dbpedia_v1 and UNER_Dbpedia_v1[class_name]:
                    #check if the class has heirachy
                    if class_name in Dbpedia_priority:
                        # make sure whatever you are adding doesnt affect the possible selection hence uniques are added
                        if class_name not in priority_set:
                            # push it into queue with its priority
                            heapq.heappush(priority_queue, (Dbpedia_priority[class_name], class_name))
                            priority_set.add(class_name)
            wikiTitle_unerClass_dict[wiki_title] = heapq.nlargest(1,priority_queue)

## Some Tests


In [9]:
"FictionalCharacter" in UNER_Dbpedia_v1 

True

In [10]:
UNER_Dbpedia_v1["FictionalCharacter"]

'Name-Person-Fictional'

In [11]:
"FictionalCharacter" in Dbpedia_priority

True

In [12]:
print("Total length of mapped entities",len(wikiTitle_unerClass_dict.keys()))

Total length of mapped entities 384902


In [13]:
print(wikiTitle_unerClass_dict["Yogi Bear"]) 
print(wikiTitle_unerClass_dict["William Golding"])

[(3, 'Person')]
[(3, 'Person')]


## Step 5 UNERv1 Back Mapping Wikipedia 

#### Make sure not to write lines containing non-Uner-mapped entities

In [14]:
!pip install beautifulsoup4 lxml nltk spacy html5lib



In [15]:
from bs4 import BeautifulSoup

In [16]:
!pip install html5lib lxml



### Split the paragraphs into lines
### Split the paragraphs into lines
### lines checking if there are UNER entities. If no entities are found we skip those sentences
### Write to a file => In line XML 
### Parallalize the process =
### Convert XML to IOB

In [17]:
from spacy.lang.en import English

nlp = English()
sentencizer = nlp.create_pipe("sentencizer")
nlp.add_pipe(sentencizer)


In [66]:
def convert_xml_iob(line):
    ENT = set(UNER_Dbpedia_v1.values())
    ENT.remove('')    
    ne_type_re = re.compile(r'<ne type="([^"]*)">', re.U)
    markup_split_re = re.compile(r'(<[^>]*>)|(\s+)', re.U)

    line = line.strip()
    new_line = u""
    inside = 0
    markup = "O"
    for k, token in enumerate(markup_split_re.split(line)):
        if not token or not token.strip():
            continue
        #print("token",token,inside,markup)
        ne_type = ne_type_re.findall(token)
        if ne_type:
            inside += 1
            if inside== 1:
                if ne_type[0] in ENT:
                    markup = ne_type[0]
                else:
#                     markup = "OTHER"
                    markup = "O"
        elif token == "</ne>":
            inside -= 1
            if inside == 0:
                markup = "O"				
        elif not token.startswith("<") or not token.endswith(">"):
            new_line += u"{}\t{}\n".format(token, markup)
#     print(new_line)
    return new_line

### Main function: Takes in file path and writes the corresponding final value to output folder.

In [60]:
def process(wiki_file_path):
    outputs =[]
    counter_matching= 0
    with open(os.path.join(config.WIKIPEDIA_PATH,wiki_file_path)) as input_file:
        for paragraph in input_file:
            # we get paragraph by iterating through the files
            paragraph = paragraph.strip()
#                 lines = nltk.tokenize.sent_tokenize(paragraph)
            lines = [sent.text for sent in (nlp(paragraph).sents)]
            for line in lines:
                line = line.strip()
                # keep track if there are entities in the line
                any_entities_found=False
                if line :
                    soup = BeautifulSoup(line)

                    for span in soup.select('a[href]'):
                        sup = soup.new_tag('ne')
                        sup.string = span.text
                        # this is the key from wikipedia/dbpedia that will be lookedup in the UNER 
                        uner_lookup_key = urllib.parse.unquote(span.attrs['href'])
                        if  uner_lookup_key in wikiTitle_unerClass_dict and wikiTitle_unerClass_dict[uner_lookup_key] :
#                             print("=",uner_lookup_key)
#                             print("-",wikiTitle_unerClass_dict[uner_lookup_key])
                            any_entities_found=True
                            sup.attrs["type"] = UNER_Dbpedia_v1[wikiTitle_unerClass_dict[uner_lookup_key][0][1]]
                        
                            span.insert_after(sup)
                            span.clear()
                            # replace the span tag with it's contents
                            span.unwrap()

                    if soup.body:
                        soup = "".join([str(x) for x in soup.body])
                        # to keep the bs code platform independent we use the default parser which adds extra p tags to some of the sentences
                        if "<p>" == soup[:3]:
                            soup = soup[3:-4]
                        if any_entities_found:
                            counter_matching = counter_matching+1
#                             print(soup)
                            outputs.append(convert_xml_iob(soup))
    if outputs:
        
        if not os.path.exists(os.path.join(config.OUTPUT_PATH,str(Path(wiki_file_path).parent))):
            os.mkdir(os.path.join(config.OUTPUT_PATH,str(Path(wiki_file_path).parent)))  
        with open(os.path.join(config.OUTPUT_PATH,wiki_file_path),"w") as output_file:
            tokenized_list = []
            for output in outputs:
                output_file.write(output)
                output_file.write("\n")
                token_str = output.split("\n")
#                 print([token.split() for token in token_str])
                tokenized_list.append([token.split() for token in token_str])
            with open(os.path.join(config.OUTPUT_PATH,wiki_file_path)+'.pkl', 'wb') as f:
                pickle.dump(tokenized_list, f)                

#### Sample testing

In [67]:
# !less {os.path.join(config.OUTPUT_PATH,"BC/wiki_86")}

def process_test(wiki_file_path="BC/wiki_86"):
    outputs =[]
    counter_matching= 0
    with open(os.path.join(config.WIKIPEDIA_PATH,wiki_file_path)) as input_file:
        for paragraph in input_file:
            # we get paragraph by iterating through the files
            paragraph = paragraph.strip()
#                 lines = nltk.tokenize.sent_tokenize(paragraph)
            lines = [sent.text for sent in (nlp(paragraph).sents)]
            for line in lines:
                line = line.strip()
                if "Kings Dominion" in line:
#                     print(line)
                    # keep track if there are entities in the line
                    any_entities_found=False
                    if line :
                        soup = BeautifulSoup(line)

                        for span in soup.select('a[href]'):
                            print(span)
                            sup = soup.new_tag('ne')
                            sup.string = span.text
                            # this is the key from wikipedia/dbpedia that will be lookedup in the UNER 
                            uner_lookup_key = urllib.parse.unquote(span.attrs['href'])
                            print(wikiTitle_unerClass_dict[uner_lookup_key])
                            print(uner_lookup_key in wikiTitle_unerClass_dict)
                            if  uner_lookup_key in wikiTitle_unerClass_dict and wikiTitle_unerClass_dict[uner_lookup_key] :
    #                             print("=",uner_lookup_key)
    #                             print("-",wikiTitle_unerClass_dict[uner_lookup_key])
                                any_entities_found=True
                                sup.attrs["type"] = UNER_Dbpedia_v1[wikiTitle_unerClass_dict[uner_lookup_key][0][1]]
                            else:
                                sup.attrs["type"] = ""
                            span.insert_after(sup)
                            span.clear()
                            # replace the span tag with it's contents
                            span.unwrap()

                        if soup.body:
                            soup = "".join([str(x) for x in soup.body])
                            # to keep the bs code platform independent we use the default parser which adds extra p tags to some of the sentences
                            if "<p>" == soup[:3]:
                                soup = soup[3:-4]
                            print(soup)
                            if any_entities_found:
                                counter_matching = counter_matching+1
    #                             print(soup)
                                outputs.append(convert_xml_iob(soup))
        print(outputs)
process_test()     

<a href="Kings%20Dominion">Kings Dominion</a>
[]
True
<a href="Cedar%20Fair">Cedar Fair</a>
[(4, 'Company')]
True
<a href="Meadow%20Event%20Park">Meadow Event Park</a>
[(2, 'EthnicGroup')]
True
<a href="Virginia%20State%20Fair">Virginia State Fair</a>
[]
True
<ne type="">Kings Dominion</ne>, a major amusement park that is owned by <ne type="Name-Organization-Corporation-Company">Cedar Fair</ne>, and <ne type="Name-Organization-Ethnic_Group_other">Meadow Event Park</ne>, home of the <ne type="">Virginia State Fair</ne>, are located in the town.
['Kings\tO\nDominion\tO\n,\tO\na\tO\nmajor\tO\namusement\tO\npark\tO\nthat\tO\nis\tO\nowned\tO\nby\tO\nCedar\tName-Organization-Corporation-Company\nFair\tName-Organization-Corporation-Company\n,\tO\nand\tO\nMeadow\tName-Organization-Ethnic_Group_other\nEvent\tName-Organization-Ethnic_Group_other\nPark\tName-Organization-Ethnic_Group_other\n,\tO\nhome\tO\nof\tO\nthe\tO\nVirginia\tO\nState\tO\nFair\tO\n,\tO\nare\tO\nlocated\tO\nin\tO\nthe\tO\ntown

In [21]:
process("AC/wiki_50")

### Create a list of files to be passed

In [61]:

to_be_processed =[]
# list all the folders containing wikipedia processed files 
wiki_dirs = [name for name in os.listdir(config.WIKIPEDIA_PATH) if os.path.isdir(os.path.join(config.WIKIPEDIA_PATH, name)) ]
# for each folder
for wiki_dir in wiki_dirs:
    full_folder_path = os.path.join(config.WIKIPEDIA_PATH,wiki_dir)
    for wiki_file in os.listdir(full_folder_path):
#         print(os.path.join(wiki_dir,wiki_file))                               
        to_be_processed.append(os.path.join(wiki_dir,wiki_file))

## Step 6 UNERv1 Wikipedia Dataset Generation

In [None]:
# DO not run unncessarily it takes 15 cores of the system *2 threads
from multiprocessing import Pool

with Pool(15) as p:
    print(p.map(process, to_be_processed))







































































































































































































































































































































































































































































































































































































































































































































































In [23]:
#sample example of eating all the cores

```python
    from multiprocessing import Pool

    def f(x):
        return x*x


    with Pool(15) as p:
        print(p.map(f, [1, 2, 3,5,6]))
```

In [24]:
set(UNER_Dbpedia_v1.values())

{'',
 'Name-Color',
 'Name-Disease',
 'Name-Event-Historical_Event',
 'Name-Event-Natural_Phenomenon-Natural-Earthquake',
 'Name-Event-Natural_Phenomenon-Natural-Phenomenon_Other',
 'Name-Event-Occasion-Attack',
 'Name-Event-Occasion-Conference',
 'Name-Event-Occasion-Game',
 'Name-Event-Occasion-MilitaryConflict',
 'Name-Event-Occasion_Other',
 'Name-Event-Personal',
 'Name-Facility-Archaeological_Place-Archaeological_Place_Other',
 'Name-Facility-Facility_Other',
 'Name-Facility-GOE-Airport',
 'Name-Facility-GOE-Amusement_Park',
 'Name-Facility-GOE-GOE_Other',
 'Name-Facility-GOE-Market',
 'Name-Facility-GOE-Museum',
 'Name-Facility-GOE-Park',
 'Name-Facility-GOE-Port',
 'Name-Facility-GOE-School',
 'Name-Facility-GOE-Sports_Facility',
 'Name-Facility-GOE-Station',
 'Name-Facility-GOE-Theater',
 'Name-Facility-GOE-Worship_Place',
 'Name-Facility-GOE-Zoo',
 'Name-Facility-Line-Bridge',
 'Name-Facility-Line-Canal',
 'Name-Facility-Line-Railroad',
 'Name-Facility-Line-Road',
 'Name-Faci

### Test : Check if the pickle file is loadable

In [70]:
with open(os.path.join(config.OUTPUT_PATH,"BC/wiki_86")+'.pkl', "rb") as fp:   # Unpickling
    b = pickle.load(fp)
    for i in enumerate(b):
        print(b[0])

[['The', 'O'], ['Westside', 'O'], ['is', 'O'], ['a', 'O'], ['district', 'O'], ['of', 'O'], ['the', 'O'], ['city', 'O'], ['centre', 'O'], ['of', 'O'], ['Birmingham', 'O'], [',', 'O'], ['England', 'O'], [',', 'O'], ['which', 'O'], ['includes', 'O'], ['many', 'O'], ['new', 'O'], ['and', 'O'], ['planned', 'O'], ['buildings', 'O'], ['such', 'O'], ['as', 'O'], ['The', 'O'], ['Cube', 'O'], [',', 'O'], ['Library', 'Name-Facility-GOE-GOE_Other'], ['of', 'Name-Facility-GOE-GOE_Other'], ['Birmingham', 'Name-Facility-GOE-GOE_Other'], ['and', 'O'], ['Regal', 'O'], ['Tower', 'O'], ['.', 'O'], []]
[['The', 'O'], ['Westside', 'O'], ['is', 'O'], ['a', 'O'], ['district', 'O'], ['of', 'O'], ['the', 'O'], ['city', 'O'], ['centre', 'O'], ['of', 'O'], ['Birmingham', 'O'], [',', 'O'], ['England', 'O'], [',', 'O'], ['which', 'O'], ['includes', 'O'], ['many', 'O'], ['new', 'O'], ['and', 'O'], ['planned', 'O'], ['buildings', 'O'], ['such', 'O'], ['as', 'O'], ['The', 'O'], ['Cube', 'O'], [',', 'O'], ['Library', 

## Step 7 UNERv1 Wikipedia Dataset Class Statistics

In [39]:
### How many sentences do we have ?
some_tokens = []
lines =0
wiki_dirs = [name for name in os.listdir(config.OUTPUT_PATH) if os.path.isdir(os.path.join(config.OUTPUT_PATH, name)) ]
# for each folder
for wiki_dir in wiki_dirs:
    full_folder_path = os.path.join(config.OUTPUT_PATH,wiki_dir)
    for wiki_file in os.listdir(full_folder_path):
        if not wiki_file.endswith(".pkl"):
            #load the file
            with open(os.path.join(full_folder_path,wiki_file)+'.pkl', "rb") as fp:   # Unpickling
                b = pickle.load(fp)
                for i in b:
                    lines +=1
                    for j in i:
                        if len(j)>1:
                            some_tokens.append(j[1])
print(lines)

2683254


In [40]:
from collections import Counter
cnt = Counter(some_tokens)
cnt

Counter({'O': 61351623,
         'Name-Location-Country': 113465,
         'Name-Person-Name': 2452439,
         'Name-Organization-Organization_Other': 375768,
         'Name-Location-Region_Domestic_Region': 3419,
         'Name-Facility-GOE-GOE_Other': 156249,
         'Name-Location-GPE-GPE_Other': 373766,
         'Name-Location-GPE-City': 105016,
         'Name-Event-Occasion-MilitaryConflict': 178345,
         'Name-Product-Product_Other': 130716,
         'Name-Product-Printing-Printing_Other': 39900,
         'Name-Facility-GOE-School': 245879,
         'Name-Product-Art-Book': 152756,
         'Name-Facility-Line-Road': 77803,
         'Name-Product-Drug-Substance': 4799,
         'Name-Disease': 28516,
         'Name-Product-Award': 96317,
         'Name-Product-Language-National_Language': 31468,
         'Name-Organization-Political_Organization-Military': 124452,
         'Name-Product-Printing-Newspaper': 42317,
         'Name-Organization-Political_Organization-Governme

In [None]:
### How many tagged entities do we have ?


In [None]:
### How many O do we have ?


In [None]:
### What is the class distribution ?

## Step 7 UNERv1 Wikipedia Dataset Class Balancing - Pruning of Unwanted Classes

## Observations


A	O
c	O
c	O
o	O
r	O
d	O
i	O
n	O
g	O
t	O
o	O
s	O
o	O
m	O
e	O
a	O
c	O
c	O
o	O
u	O
n	O
t	O
s	O
,	O
h	O
e	O
h	O
a	O
d	O
m	O
a	O
r	O
r	O
i	O
e	O
d	O
M	OTHER
e	OTHER
d	OTHER
e	OTHER
a	OTHER
i	O
n	O
l	O
i	O
f	O
e	O
,	O
s	O
o	O
t	O
h	O
a	O
t	O
a	O
f	O
t	O
e	O
r	O
b	O
o	O
t	O
h	O
t	O
h	O
e	O
i	O
r	O
d	O
e	O
a	O
t	O
h	O
s	O
t	O
h	O
e	O
y	O
w	O
e	O
r	O
e	O
u	O
n	O
i	O
t	O
e	O
d	O
i	O
n	O
t	O
h	O
e	O
E	OTHER
l	OTHER
y	OTHER
s	OTHER
i	OTHER
a	OTHER
n	OTHER
F	OTHER
i	OTHER
e	OTHER
l	OTHER
d	OTHER
s	OTHER
o	O
f	O
H	OTHER
a	OTHER
d	OTHER
e	OTHER
s	OTHER
–	O
a	O
s	O
H	O
e	O
r	O
a	O
p	O
r	O
o	O
m	O
i	O
s	O
e	O
d	O
T	O
h	O
e	O
t	O
i	O
s	O
i	O
n	O
A	OTHER
p	OTHER
o	OTHER
l	OTHER
l	OTHER
o	OTHER
n	OTHER
i	OTHER
u	OTHER
s	OTHER
'	O
"	O
A	OTHER
r	OTHER
g	OTHER
o	OTHER
n	OTHER
a	OTHER
u	OTHER
t	OTHER
i	OTHER
c	OTHER
a	OTHER
"	O
(	O
3	O
r	O
d	O
c	O
e	O
n	O
t	O
u	O
r	O
y	O
B	O
C	O
)	O
.	O



'A\tO\nc\tO\nc\tO\no\tO\nr\tO\nd\tO\ni\tO\nn\tO\ng\tO\nt\tO\no\tO\ns\tO\no\tO\nm\tO\ne\tO\na\tO\nc\tO\nc\tO\no\tO\nu\tO\nn\tO\nt\tO\ns\tO\n,\tO\nh\tO\ne\tO\nh\tO\na\tO\nd\tO\nm\tO\na\tO\nr\tO\nr\tO\ni\tO\ne\tO\nd\tO\nM\tOTHER\ne\tOTHER\nd\tOTHER\ne\tOTHER\na\tOTHER\ni\tO\nn\tO\nl\tO\ni\tO\nf\tO\ne\tO\n,\tO\ns\tO\no\tO\nt\tO\nh\tO\na\tO\nt\tO\na\tO\nf\tO\nt\tO\ne\tO\nr\tO\nb\tO\no\tO\nt\tO\nh\tO\nt\tO\nh\tO\ne\tO\ni\tO\nr\tO\nd\tO\ne\tO\na\tO\nt\tO\nh\tO\ns\tO\nt\tO\nh\tO\ne\tO\ny\tO\nw\tO\ne\tO\nr\tO\ne\tO\nu\tO\nn\tO\ni\tO\nt\tO\ne\tO\nd\tO\ni\tO\nn\tO\nt\tO\nh\tO\ne\tO\nE\tOTHER\nl\tOTHER\ny\tOTHER\ns\tOTHER\ni\tOTHER\na\tOTHER\nn\tOTHER\nF\tOTHER\ni\tOTHER\ne\tOTHER\nl\tOTHER\nd\tOTHER\ns\tOTHER\no\tO\nf\tO\nH\tOTHER\na\tOTHER\nd\tOTHER\ne\tOTHER\ns\tOTHER\n–\tO\na\tO\ns\tO\nH\tO\ne\tO\nr\tO\na\tO\np\tO\nr\tO\no\tO\nm\tO\ni\tO\ns\tO\ne\tO\nd\tO\nT\tO\nh\tO\ne\tO\nt\tO\ni\tO\ns\tO\ni\tO\nn\tO\nA\tOTHER\np\tOTHER\no\tOTHER\nl\tOTHER\nl\tOTHER\no\tOTHER\nn\tOTHER\ni\tOTHER\nu\tOTHER\ns

In [59]:
# https://stackoverflow.com/questions/27006463/beautiful-soup-4-how-to-replace-a-tag-with-text-and-another-tag
line = '<a href="Italic%20type">Italic type</a> is commonly used to mark emphasis or more generally to distinguish one part of a text from the rest (set in Roman type).'
soup = BeautifulSoup(line)

for span in soup.select('a[href]'):
    sup = soup.new_tag('ne')
    sup.string = span.text
    # this is the key from wikipedia/dbpedia that will be lookedup in the UNER 
    uner_lookup_key = urllib.parse.unquote(span.attrs['href'])
    print(wikiTitle_unerClass_dict[uner_lookup_key][0][1])
    if  uner_lookup_key in wikiTitle_unerClass_dict:
        any_entities_found=True
        sup.attrs["type"] = wikiTitle_unerClass_dict[uner_lookup_key][0][1]
    else:
        sup.attrs["type"] = ""
    span.insert_after(sup)
    span.clear()
    # replace the span tag with it's contents
    span.unwrap()
print(soup)
soup = "".join([str(x) for x in soup.body])
print(soup)
# check if there were really any entities
# if there were send it for conversion to IOB
# if any_entities_found:
#     print("found")
#     convert_xml_iob(soup)
    # you can write it to file 

Software
<html><body><ne type="Software">Italic type</ne> is commonly used to mark emphasis or more generally to distinguish one part of a text from the rest (set in Roman type).</body></html>
<ne type="Software">Italic type</ne> is commonly used to mark emphasis or more generally to distinguish one part of a text from the rest (set in Roman type).


In [42]:
"".join([str(x) for x in soup.body.children.next])

AttributeError: 'list_iterator' object has no attribute 'next'

In [116]:

# tmp = ['http://www.w3.org/2002/07/owl#Thing', 'http://xmlns.com/foaf/0.1/Person', 'http://dbpedia.org/ontology/Person', 'http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#Agent', 'http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#NaturalPerson', 'http://www.wikidata.org/entity/Q215627', 'http://www.wikidata.org/entity/Q24229398', 'http://www.wikidata.org/entity/Q5', 'http://www.wikidata.org/entity/Q95074', 'http://dbpedia.org/ontology/Agent', 'http://dbpedia.org/ontology/FictionalCharacter', 'http://schema.org/Person', 'http://umbel.org/umbel/rc/FictionalCharacter', 'http://dbpedia.org/class/yago/WikicatFictionalAnthropomorphicCharacters', 'http://dbpedia.org/class/yago/WikicatFictionalCharactersIntroducedIn1958', 'http://dbpedia.org/class/yago/WikicatFirst-runSyndicatedTelevisionProgramsInTheUnitedStates', 'http://dbpedia.org/class/yago/Ability105616246', 'http://dbpedia.org/class/yago/Abstraction100002137', 'http://dbpedia.org/class/yago/Act100030358', 'http://dbpedia.org/class/yago/Action100037396', 'http://dbpedia.org/class/yago/Beginning100235435', 'http://dbpedia.org/class/yago/Broadcast106619428', 'http://dbpedia.org/class/yago/Change100191142', 'http://dbpedia.org/class/yago/ChangeOfState100199130', 'http://dbpedia.org/class/yago/Cognition100023271', 'http://dbpedia.org/class/yago/Creativity105624700', 'http://dbpedia.org/class/yago/Ending106308765', 'http://dbpedia.org/class/yago/Event100029378', 'http://dbpedia.org/class/yago/FictionalCharacter109587565', 'http://dbpedia.org/class/yago/ImaginaryBeing109483738', 'http://dbpedia.org/class/yago/Imagination105625465', 'http://dbpedia.org/class/yago/Introduction100238022', 'http://dbpedia.org/class/yago/LanguageUnit106284225', 'http://dbpedia.org/class/yago/Morpheme106306233', 'http://dbpedia.org/class/yago/Part113809207', 'http://dbpedia.org/class/yago/PsychologicalFeature100023100', 'http://dbpedia.org/class/yago/Relation100031921', 'http://dbpedia.org/class/yago/Show106619065', 'http://dbpedia.org/class/yago/SocialEvent107288639', 'http://dbpedia.org/class/yago/TelevisionProgram106620579', 'http://dbpedia.org/class/yago/Wikicat1958TelevisionSeriesDebuts', 'http://dbpedia.org/class/yago/Wikicat1961AmericanTelevisionSeriesDebuts', 'http://dbpedia.org/class/yago/Wikicat1963AmericanTelevisionSeriesEndings', 'http://dbpedia.org/class/yago/YagoPermanentlyLocatedEntity', 'http://dbpedia.org/class/yago/WikicatAnimatedCharacters']
tmp = ['http://www.w3.org/2002/07/owl#Thing', 'http://xmlns.com/foaf/0.1/Person', 'http://dbpedia.org/ontology/Person', 'http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#Agent', 'http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#NaturalPerson', 'http://www.wikidata.org/entity/Q215627', 'http://www.wikidata.org/entity/Q24229398', 'http://www.wikidata.org/entity/Q36180', 'http://www.wikidata.org/entity/Q5', 'http://dbpedia.org/ontology/Agent', 'http://dbpedia.org/ontology/Writer', 'http://schema.org/Person', 'http://dbpedia.org/class/yago/WikicatPeopleFromCornwall', 'http://dbpedia.org/class/yago/WikicatRoyalNavySailors', 'http://dbpedia.org/class/yago/Abstraction100002137', 'http://dbpedia.org/class/yago/Acquirer109764201', 'http://dbpedia.org/class/yago/Adult109605289', 'http://dbpedia.org/class/yago/Alumnus109786338', 'http://dbpedia.org/class/yago/CausalAgent100007347', 'http://dbpedia.org/class/yago/CommandingOfficer109941964', 'http://dbpedia.org/class/yago/Communication100033020', 'http://dbpedia.org/class/yago/Communicator109610660', 'http://dbpedia.org/class/yago/Dramatist110030277', 'http://dbpedia.org/class/yago/Educator110045713', 'http://dbpedia.org/class/yago/Fiction106367107', 'http://dbpedia.org/class/yago/Gambler110118844', 'http://dbpedia.org/class/yago/Honoree110183757', 'http://dbpedia.org/class/yago/Intellectual109621545', 'http://dbpedia.org/class/yago/Laureate110249011', 'http://dbpedia.org/class/yago/LiteraryComposition106364329', 'http://dbpedia.org/class/yago/LivingThing100004258', 'http://dbpedia.org/class/yago/MilitaryOfficer110317007', 'http://dbpedia.org/class/yago/Novel106367879', 'http://dbpedia.org/class/yago/Novelette106368962', 'http://dbpedia.org/class/yago/Novelist110363573', 'http://dbpedia.org/class/yago/Object100002684', 'http://dbpedia.org/class/yago/Organism100004475', 'http://dbpedia.org/class/yago/Person100007846', 'http://dbpedia.org/class/yago/PhysicalEntity100001930', 'http://dbpedia.org/class/yago/Poet110444194', 'http://dbpedia.org/class/yago/PrizeWinner109627807', 'http://dbpedia.org/class/yago/Professional110480253', 'http://dbpedia.org/class/yago/Recipient109627906', 'http://dbpedia.org/class/yago/Sailor110546633', 'http://dbpedia.org/class/yago/Scholar110557854', 'http://dbpedia.org/class/yago/Schoolteacher110560352', 'http://dbpedia.org/class/yago/Serviceman110582746', 'http://dbpedia.org/class/yago/SkilledWorker110605985', 'http://dbpedia.org/class/yago/Teacher110694258', 'http://dbpedia.org/class/yago/Whole100003553', 'http://dbpedia.org/class/yago/Wikicat1971Novels', 'http://dbpedia.org/class/yago/Winner110782791', 'http://dbpedia.org/class/yago/Worker109632518', 'http://dbpedia.org/class/yago/Writer110794014', 'http://dbpedia.org/class/yago/Writing106362953', 'http://dbpedia.org/class/yago/WrittenCommunication106349220', 'http://dbpedia.org/class/yago/YagoLegalActor', 'http://dbpedia.org/class/yago/YagoLegalActorGeo', 'http://dbpedia.org/class/yago/WikicatCommandersOfTheOrderOfTheBritishEmpire', 'http://dbpedia.org/class/yago/WikicatCornishDramatistsAndPlaywrights', 'http://dbpedia.org/class/yago/WikicatCornishNovelists', 'http://dbpedia.org/class/yago/WikicatCornishPoets', 'http://dbpedia.org/class/yago/WikicatCornishWriters', 'http://dbpedia.org/class/yago/WikicatEnglish-languageWriters', 'http://dbpedia.org/class/yago/WikicatEnglishNobelLaureates', 'http://dbpedia.org/class/yago/WikicatEnglishNovelists', 'http://dbpedia.org/class/yago/WikicatEnglishPeople', 'http://dbpedia.org/class/yago/WikicatEnglishWriters', 'http://dbpedia.org/class/yago/Wikicat20th-centuryBritishNovelists', 'http://dbpedia.org/class/yago/Wikicat20th-centuryNovelists', 'http://dbpedia.org/class/yago/WikicatAlumniOfBrasenoseCollege', 'Oxford', 'http://dbpedia.org/class/yago/WikicatAlumniOfTheUniversityOfOxford', 'http://dbpedia.org/class/yago/WikicatBookerPrizeWinners', 'http://dbpedia.org/class/yago/WikicatBritishNobelLaureates', 'http://dbpedia.org/class/yago/WikicatBritishNovellas', 'http://dbpedia.org/class/yago/WikicatBritishPeople', 'http://dbpedia.org/class/yago/WikicatBritishSchoolteachers', 'http://dbpedia.org/class/yago/WikicatBritishScienceFictionWriters', 'http://dbpedia.org/class/yago/WikicatBritishWriters', 'http://dbpedia.org/class/yago/WikicatWriters', 'http://dbpedia.org/class/yago/WikicatMaritimeWriters', 'http://dbpedia.org/class/yago/WikicatNobelLaureatesInLiterature', 'http://dbpedia.org/class/yago/WikicatNovelsByWilliamGolding', 'http://dbpedia.org/class/yago/WikicatOfficersOfTheOrderOfTheBritishEmpire', 'http://dbpedia.org/class/yago/WikicatPeopleFromMarlborough', 'http://dbpedia.org/class/yago/WikicatPeopleFromNewquay']
priority_queue = []
priority_set =set()
# proces the classes
for cls in tmp:
    # take the last part of the url => usually entity is found as last
    class_name = (cls.split("/")[-1])
    print(class_name)
    # check if the class in Dbpedia_v1 and if it is check its corresponding mapping exists, 
    # it could be blank and we ignore the blank classes
    if class_name in UNER_Dbpedia_v1 and UNER_Dbpedia_v1[class_name]:
        #check if the class has heirachy
        if class_name in Dbpedia_priority:
#                         make sure whatever you are adding doesnt affect the possible selection hence uniques are added
            if class_name not in priority_set or True:
                #push it into queue with its priority
                priority_set.add(class_name)
                heapq.heappush(priority_queue, (Dbpedia_priority[class_name], class_name))
print(priority_queue)
# For Yogi bear
# [(3, 'FictionalCharacter'), (3, 'FictionalCharacter'), (3, 'Person'), (3, 'Person'), (3, 'Person')]

owl#Thing
Person
Person
DUL.owl#Agent
DUL.owl#NaturalPerson
Q215627
Q24229398
Q36180
Q5
Agent
Writer
Person
WikicatPeopleFromCornwall
WikicatRoyalNavySailors
Abstraction100002137
Acquirer109764201
Adult109605289
Alumnus109786338
CausalAgent100007347
CommandingOfficer109941964
Communication100033020
Communicator109610660
Dramatist110030277
Educator110045713
Fiction106367107
Gambler110118844
Honoree110183757
Intellectual109621545
Laureate110249011
LiteraryComposition106364329
LivingThing100004258
MilitaryOfficer110317007
Novel106367879
Novelette106368962
Novelist110363573
Object100002684
Organism100004475
Person100007846
PhysicalEntity100001930
Poet110444194
PrizeWinner109627807
Professional110480253
Recipient109627906
Sailor110546633
Scholar110557854
Schoolteacher110560352
Serviceman110582746
SkilledWorker110605985
Teacher110694258
Whole100003553
Wikicat1971Novels
Winner110782791
Worker109632518
Writer110794014
Writing106362953
WrittenCommunication106349220
YagoLegalActor
YagoLegalActor