In [27]:
import json
import os
import re
import heapq
from collections import defaultdict
from collections import Counter
import pandas as pd
import urllib
from pathlib import Path


## Step 1 Wikipedia Data Extraction

## Step 2 Dbpedia Sparql Querying

## Step 3 Dbpedia Candiate Class Extraction from Sparql Data

### Use dbpedia_candidate_class_extraction.py 
### input: info_dbpedia_v3.txt
### output: candidate_classes.txt 

## Step 4 Candidate Classes =>  Hierarchy Enrichment, UNERv1 Mapping

In [4]:
!pwd

/mnt/data/group3/wiki-play/wikiquery_v2


In [5]:
class config:
    DATASET_PATH="/mnt/data/group3/wiki-play/wikidbquery"
    WIKIPEDIA_PATH ="/mnt/data/group3/wiki-play/data/wiki"
    OUTPUT_PATH = "/mnt/data/group3/wiki-play/dataset"

In [6]:
# read the mapping file
with open('Dbpedia_UNER_v1.json') as json_file:
    UNER_Dbpedia_v1 = json.load(json_file)
    
# {
# "owl:Thing":"",
# "Activity":"",
# "Game":"Name-Product-Product_Other",
# "BoardGame":"",
# "CardGame":"",
# "Sales":"",
# "Sport":"Name-Product-Doctrine_Method-Sport",
# "Athletics":"",
# "TeamSport":"",
# "Agent":""
# }


In [7]:
UNER_Dbpedia_v1["Person"]

'Name-Person-Name'

In [8]:
with open('dbpedia_hierarchy_priority.json') as priority_file:
    Dbpedia_priority = json.load(priority_file)
    
# {
# "owl:Thing":1,
# "Activity":2,
# "Game":3,
# "BoardGame":4,
# "CardGame":4,
# "Sales":3,
# "Sport":3,
# "Athletics":4,
# "TeamSport":4,
# "Agent":2,
# "Deity":3,
# "Employer":3,
# "Family":3,
# }    


In [9]:
# read the sample file having entities and the candidate classes

# Define a mapping of wikititle with its UNER class
wikiTitle_unerClass_dict = {}

with open(os.path.join(config.DATASET_PATH,"candidate_classes.txt")) as input_file:
    for index, line in enumerate(input_file):
        line = line.strip()
        if line:
            wiki_title, *classes= line.split(",")
#             if wiki_title =="William Golding":print(classes)
            priority_queue = []
            priority_set = set()
            # proces the classes
            for cls in classes:
                # take the last part of the url => usually entity is found as last
                class_name = (cls.split("/")[-1])
                # check if the class in Dbpedia_v1 and if it is check its corresponding mapping exists, 
                # it could be blank and we ignore the blank classes
                if class_name in UNER_Dbpedia_v1 and UNER_Dbpedia_v1[class_name]:
                    #check if the class has heirachy
                    if class_name in Dbpedia_priority:
                        # make sure whatever you are adding doesnt affect the possible selection hence uniques are added
                        if class_name not in priority_set:
                            # push it into queue with its priority
                            heapq.heappush(priority_queue, (Dbpedia_priority[class_name], class_name))
                            priority_set.add(class_name)
            wikiTitle_unerClass_dict[wiki_title] = heapq.nlargest(1,priority_queue)

## Some Tests


In [10]:
"FictionalCharacter" in UNER_Dbpedia_v1 

True

In [11]:
UNER_Dbpedia_v1["FictionalCharacter"]

'Name-Person-Fictional'

In [12]:
"FictionalCharacter" in Dbpedia_priority

True

In [13]:
print("Total length of mapped entities",len(wikiTitle_unerClass_dict.keys()))

Total length of mapped entities 384902


In [14]:
print(wikiTitle_unerClass_dict["Yogi Bear"]) 
print(wikiTitle_unerClass_dict["William Golding"])

[(3, 'Person')]
[(3, 'Person')]


## Step 5 UNERv1 Back Mapping Wikipedia 

#### Make sure not to write lines containing non-Uner-mapped entities

In [15]:
!pip install beautifulsoup4 lxml nltk spacy html5lib



In [16]:
from bs4 import BeautifulSoup

In [17]:
!pip install html5lib lxml



### Split the paragraphs into lines
### Split the paragraphs into lines
### lines checking if there are UNER entities. If no entities are found we skip those sentences
### Write to a file => In line XML 
### Parallalize the process =
### Convert XML to IOB

In [18]:
# import nltk
# nltk.download('punkt')

In [19]:
from spacy.lang.en import English

nlp = English()
sentencizer = nlp.create_pipe("sentencizer")
nlp.add_pipe(sentencizer)


In [20]:
[] == True

False

In [21]:
def process(wiki_file_path):
    with open(os.path.join(config.WIKIPEDIA_PATH,wiki_file_path)) as input_file:
        for paragraph in input_file:
            # we get paragraph by iterating through the files
            paragraph = paragraph.strip()
#                 lines = nltk.tokenize.sent_tokenize(paragraph)
            lines = [sent.text for sent in (nlp(paragraph).sents)]
            for line in lines:
                line = line.strip()
                # keep track if there are entities in the line
                any_entities_found=False
                if line :
                    soup = BeautifulSoup(line)

                    for span in soup.select('a[href]'):
                        sup = soup.new_tag('ne')
                        sup.string = span.text
                        # this is the key from wikipedia/dbpedia that will be lookedup in the UNER 
                        uner_lookup_key = urllib.parse.unquote(span.attrs['href'])
                        if  uner_lookup_key in wikiTitle_unerClass_dict and wikiTitle_unerClass_dict[uner_lookup_key] :
                            print("=",uner_lookup_key)
                            print("-",wikiTitle_unerClass_dict[uner_lookup_key])
                            any_entities_found=True
                            sup.attrs["type"] = UNER_Dbpedia_v1[wikiTitle_unerClass_dict[uner_lookup_key][0][1]]
                        else:
                            sup.attrs["type"] = ""
                        span.insert_after(sup)
                        span.clear()
                        # replace the span tag with it's contents
                        span.unwrap()

                    if soup.body:
                        soup = "".join([str(x) for x in soup.body])
                        # to keep the bs code platform independent we use the default parser which adds extra p tags to some of the sentences
                        if "<p>" == soup[:3]:
                            soup = soup[3:-4]
                        if any_entities_found:
                            counter_matching = counter_matching+1
                            print(soup)
                            convert_xml_iob(soup)
                            if not os.path.exists(os.path.join(config.OUTPUT_PATH,str(Path(wiki_file_path).parent))):
                                os.mkdirs(os.path.join(config.OUTPUT_PATH,str(Path(wiki_file_path).parent)))  
                            with open(os.path.join(config.OUTPUT_PATH,wiki_file_path)) as output_file:
                                
                            # you can write it to file 
#                             print(soup)

In [35]:
os.path.join(config.OUTPUT_PATH,str(Path("AC/wiki_50").parent))


'/mnt/data/group3/wiki-play/dataset/AC'

In [26]:

counter_matching = 0
to_be_processed =[]
# list all the folders containing wikipedia processed files 
wiki_dirs = [name for name in os.listdir(config.WIKIPEDIA_PATH) if os.path.isdir(os.path.join(config.WIKIPEDIA_PATH, name)) ]
# for each folder
for wiki_dir in wiki_dirs:
    full_folder_path = os.path.join(config.WIKIPEDIA_PATH,wiki_dir)
    for wiki_file in os.listdir(full_folder_path):
        print(os.path.join(wiki_dir,wiki_file))                               
#         to_be_processed.append(os.path.join(wiki_dir,wiki_file))
                        
        break
    break

    

AA/wiki_00
AA/wiki_01
AA/wiki_02
AA/wiki_03
AA/wiki_04
AA/wiki_05
AA/wiki_06
AA/wiki_07
AA/wiki_08
AA/wiki_09
AA/wiki_10
AA/wiki_11
AA/wiki_12
AA/wiki_13
AA/wiki_14
AA/wiki_15
AA/wiki_16
AA/wiki_17
AA/wiki_18
AA/wiki_19
AA/wiki_20
AA/wiki_21
AA/wiki_22
AA/wiki_23
AA/wiki_24
AA/wiki_25
AA/wiki_26
AA/wiki_27
AA/wiki_28
AA/wiki_29
AA/wiki_30
AA/wiki_31
AA/wiki_32
AA/wiki_33
AA/wiki_34
AA/wiki_35
AA/wiki_36
AA/wiki_37
AA/wiki_38
AA/wiki_39
AA/wiki_40
AA/wiki_41
AA/wiki_42
AA/wiki_43
AA/wiki_44
AA/wiki_45
AA/wiki_46
AA/wiki_47
AA/wiki_48
AA/wiki_49
AA/wiki_50
AA/wiki_51
AA/wiki_52
AA/wiki_53
AA/wiki_54
AA/wiki_55
AA/wiki_56
AA/wiki_57
AA/wiki_58
AA/wiki_59
AA/wiki_60
AA/wiki_61
AA/wiki_62
AA/wiki_63
AA/wiki_64
AA/wiki_65
AA/wiki_66
AA/wiki_67
AA/wiki_68
AA/wiki_69
AA/wiki_70
AA/wiki_71
AA/wiki_72
AA/wiki_73
AA/wiki_74
AA/wiki_75
AA/wiki_76
AA/wiki_77
AA/wiki_78
AA/wiki_79
AA/wiki_80
AA/wiki_81
AA/wiki_82
AA/wiki_83
AA/wiki_84
AA/wiki_85
AA/wiki_86
AA/wiki_87
AA/wiki_88
AA/wiki_89
AA/wiki_90

BP/wiki_00
BP/wiki_01
BP/wiki_02
BP/wiki_03
BP/wiki_04
BP/wiki_05
BP/wiki_06
BP/wiki_07
BP/wiki_08
BP/wiki_09
BP/wiki_10
BP/wiki_11
BP/wiki_12
BP/wiki_13
BP/wiki_14
BP/wiki_15
BP/wiki_16
BP/wiki_17
BP/wiki_18
BP/wiki_19
BP/wiki_20
BP/wiki_21
BP/wiki_22
BP/wiki_23
BP/wiki_24
BP/wiki_25
BP/wiki_26
BP/wiki_27
BP/wiki_28
BP/wiki_29
BP/wiki_30
BP/wiki_31
BP/wiki_32
BP/wiki_33
BP/wiki_34
BP/wiki_35
BP/wiki_36
BP/wiki_37
BP/wiki_38
BP/wiki_39
BP/wiki_40
BP/wiki_41
BP/wiki_42
BP/wiki_43
BP/wiki_44
BP/wiki_45
BP/wiki_46
BP/wiki_47
BP/wiki_48
BP/wiki_49
BP/wiki_50
BP/wiki_51
BP/wiki_52
BP/wiki_53
BP/wiki_54
BP/wiki_55
BP/wiki_56
BP/wiki_57
BP/wiki_58
BP/wiki_59
BP/wiki_60
BP/wiki_61
BP/wiki_62
BP/wiki_63
BP/wiki_64
BP/wiki_65
BP/wiki_66
BP/wiki_67
BP/wiki_68
BP/wiki_69
BP/wiki_70
BP/wiki_71
BP/wiki_72
BP/wiki_73
BP/wiki_74
BP/wiki_75
BP/wiki_76
BP/wiki_77
BP/wiki_78
BP/wiki_79
BP/wiki_80
BP/wiki_81
BP/wiki_82
BP/wiki_83
BP/wiki_84
BP/wiki_85
BP/wiki_86
BP/wiki_87
BP/wiki_88
BP/wiki_89
BP/wiki_90

DE/wiki_00
DE/wiki_01
DE/wiki_02
DE/wiki_03
DE/wiki_04
DE/wiki_05
DE/wiki_06
DE/wiki_07
DE/wiki_08
DE/wiki_09
DE/wiki_10
DE/wiki_11
DE/wiki_12
DE/wiki_13
DE/wiki_14
DE/wiki_15
DE/wiki_16
DE/wiki_17
DE/wiki_18
DE/wiki_19
DE/wiki_20
DE/wiki_21
DE/wiki_22
DE/wiki_23
DE/wiki_24
DE/wiki_25
DE/wiki_26
DE/wiki_27
DE/wiki_28
DE/wiki_29
DE/wiki_30
DE/wiki_31
DE/wiki_32
DE/wiki_33
DE/wiki_34
DE/wiki_35
DE/wiki_36
DE/wiki_37
DE/wiki_38
DE/wiki_39
DE/wiki_40
DE/wiki_41
DE/wiki_42
DE/wiki_43
DE/wiki_44
DE/wiki_45
DE/wiki_46
DE/wiki_47
DE/wiki_48
DE/wiki_49
DE/wiki_50
DE/wiki_51
DE/wiki_52
DE/wiki_53
DE/wiki_54
DE/wiki_55
DE/wiki_56
DE/wiki_57
DE/wiki_58
DE/wiki_59
DE/wiki_60
DE/wiki_61
DE/wiki_62
DE/wiki_63
DE/wiki_64
DE/wiki_65
DE/wiki_66
DE/wiki_67
DE/wiki_68
DE/wiki_69
DE/wiki_70
DE/wiki_71
DE/wiki_72
DE/wiki_73
DE/wiki_74
DE/wiki_75
DE/wiki_76
DE/wiki_77
DE/wiki_78
DE/wiki_79
DE/wiki_80
DE/wiki_81
DE/wiki_82
DE/wiki_83
DE/wiki_84
DE/wiki_85
DE/wiki_86
DE/wiki_87
DE/wiki_88
DE/wiki_89
DE/wiki_90

ET/wiki_00
ET/wiki_01
ET/wiki_02
ET/wiki_03
ET/wiki_04
ET/wiki_05
ET/wiki_06
ET/wiki_07
ET/wiki_08
ET/wiki_09
ET/wiki_10
ET/wiki_11
ET/wiki_12
ET/wiki_13
ET/wiki_14
ET/wiki_15
ET/wiki_16
ET/wiki_17
ET/wiki_18
ET/wiki_19
ET/wiki_20
ET/wiki_21
ET/wiki_22
ET/wiki_23
ET/wiki_24
ET/wiki_25
ET/wiki_26
ET/wiki_27
ET/wiki_28
ET/wiki_29
ET/wiki_30
ET/wiki_31
ET/wiki_32
ET/wiki_33
ET/wiki_34
ET/wiki_35
ET/wiki_36
ET/wiki_37
ET/wiki_38
ET/wiki_39
ET/wiki_40
ET/wiki_41
ET/wiki_42
ET/wiki_43
ET/wiki_44
ET/wiki_45
ET/wiki_46
ET/wiki_47
ET/wiki_48
ET/wiki_49
ET/wiki_50
ET/wiki_51
ET/wiki_52
ET/wiki_53
ET/wiki_54
ET/wiki_55
ET/wiki_56
ET/wiki_57
ET/wiki_58
ET/wiki_59
ET/wiki_60
ET/wiki_61
ET/wiki_62
ET/wiki_63
ET/wiki_64
ET/wiki_65
ET/wiki_66
ET/wiki_67
ET/wiki_68
ET/wiki_69
ET/wiki_70
ET/wiki_71
ET/wiki_72
ET/wiki_73
ET/wiki_74
ET/wiki_75
ET/wiki_76
ET/wiki_77
ET/wiki_78
ET/wiki_79
ET/wiki_80
ET/wiki_81
ET/wiki_82
ET/wiki_83
ET/wiki_84
ET/wiki_85
ET/wiki_86
ET/wiki_87
ET/wiki_88
ET/wiki_89
ET/wiki_90

GI/wiki_00
GI/wiki_01
GI/wiki_02
GI/wiki_03
GI/wiki_04
GI/wiki_05
GI/wiki_06
GI/wiki_07
GI/wiki_08
GI/wiki_09
GI/wiki_10
GI/wiki_11
GI/wiki_12
GI/wiki_13
GI/wiki_14
GI/wiki_15
GI/wiki_16
GI/wiki_17
GI/wiki_18
GI/wiki_19
GI/wiki_20
GI/wiki_21
GI/wiki_22
GI/wiki_23
GI/wiki_24
GI/wiki_25
GI/wiki_26
GI/wiki_27
GI/wiki_28
GI/wiki_29
GI/wiki_30
GI/wiki_31
GI/wiki_32
GI/wiki_33
GI/wiki_34
GI/wiki_35
GI/wiki_36
GI/wiki_37
GI/wiki_38
GI/wiki_39
GI/wiki_40
GI/wiki_41
GI/wiki_42
GI/wiki_43
GI/wiki_44
GI/wiki_45
GI/wiki_46
GI/wiki_47
GI/wiki_48
GI/wiki_49
GI/wiki_50
GI/wiki_51
GI/wiki_52
GI/wiki_53
GI/wiki_54
GI/wiki_55
GI/wiki_56
GI/wiki_57
GI/wiki_58
GI/wiki_59
GI/wiki_60
GI/wiki_61
GI/wiki_62
GI/wiki_63
GI/wiki_64
GI/wiki_65
GI/wiki_66
GI/wiki_67
GI/wiki_68
GI/wiki_69
GI/wiki_70
GI/wiki_71
GI/wiki_72
GI/wiki_73
GI/wiki_74
GI/wiki_75
GI/wiki_76
GI/wiki_77
GI/wiki_78
GI/wiki_79
GI/wiki_80
GI/wiki_81
GI/wiki_82
GI/wiki_83
GI/wiki_84
GI/wiki_85
GI/wiki_86
GI/wiki_87
GI/wiki_88
GI/wiki_89
GI/wiki_90

In [84]:
counter_matching

8

In [None]:
#                         soup = BeautifulSoup(line,"html.parser")
#                         for span in soup.select('a'):
#                             # insert sup tag after the span
# #                             print(dir(span))
# #                             print(span.text)
#                             if urllib.parse.unquote(span.attrs['href']) in wikiTitle_unerClass_dict:
#                                 any_entities_found=True
#                             sup = soup.new_tag('ne')
#                             sup.string = span.text
#                             sup.attrs["type"] =urllib.parse.unquote(span.attrs['href'])
#                             span.insert_after(sup)
#                             # replace the span tag with it's contents
#                             span.unwrap()
                # check if there were really any entities
                # if there were send it for conversion to IOB

In [76]:
from multiprocessing import Pool

def f(x):
    return x*x


with Pool(15) as p:
    print(p.map(f, [1, 2, 3,5,6]))

[1, 4, 9, 25, 36]


In [76]:
# TOOD  Replace this dictionary with the Unique values from UNERv1 

def convert_xml_iob(line):
    #we dont need mapping as our diego already gave us a mapper
#     ENT = {"P":"PER","pp":"PER","p_":"PER","pf":"PER","ps":"PER","pb":"PER","pm":"PER","pc":"PER","pd":"PER",
#                  "ia":"ORG","if":"ORG","io":"ORG","ic":"ORG","i_":"ORG",
#                  "G":"LOC","gc":"LOC","gh":"LOC","gr":"LOC","gq":"LOC","gl":"LOC","gu":"LOC","gt":"LOC","gs":"LOC","g_":"LOC",
#              "A":"LOC","a_":"LOC","ah":"LOC","az":"LOC", "lower": "O", "cap": "O"}
    ENT = set(UNER_Dbpedia_v1.values())
    # There was empty '' in the values
#     ENT.remove('')

    ne_type_re = re.compile(r'<ne type="([^"]*)">', re.U)
    markup_split_re = re.compile(r'(<[^>]*>)|(\s+)', re.U)

    line = line.strip()
    new_line = u""
    inside = 0
    markup = "O"
    for k, token in enumerate(markup_split_re.split(line)):
        if not token or not token.strip():
            continue
        #print("token",token,inside,markup)
        ne_type = ne_type_re.findall(token)
        if ne_type:
            inside += 1
            if inside== 1:
                if ne_type[0] in ENT:
                    markup = ne_type[0]
                else:
                    markup = "OTHER"
        elif token == "</ne>":
            inside -= 1
            if inside == 0:
                markup = "O"				
        elif not token.startswith("<") or not token.endswith(">"):
            new_line += u"{}\t{}\n".format(token, markup)
    print(new_line)
    return new_line

In [75]:
set(UNER_Dbpedia_v1.values())

{'',
 'Name-Color',
 'Name-Disease',
 'Name-Event-Historical_Event',
 'Name-Event-Natural_Phenomenon-Natural-Earthquake',
 'Name-Event-Natural_Phenomenon-Natural-Phenomenon_Other',
 'Name-Event-Occasion-Attack',
 'Name-Event-Occasion-Conference',
 'Name-Event-Occasion-Game',
 'Name-Event-Occasion-MilitaryConflict',
 'Name-Event-Occasion_Other',
 'Name-Event-Personal',
 'Name-Facility-Archaeological_Place-Archaeological_Place_Other',
 'Name-Facility-Facility_Other',
 'Name-Facility-GOE-Airport',
 'Name-Facility-GOE-Amusement_Park',
 'Name-Facility-GOE-GOE_Other',
 'Name-Facility-GOE-Market',
 'Name-Facility-GOE-Museum',
 'Name-Facility-GOE-Park',
 'Name-Facility-GOE-Port',
 'Name-Facility-GOE-School',
 'Name-Facility-GOE-Sports_Facility',
 'Name-Facility-GOE-Station',
 'Name-Facility-GOE-Theater',
 'Name-Facility-GOE-Worship_Place',
 'Name-Facility-GOE-Zoo',
 'Name-Facility-Line-Bridge',
 'Name-Facility-Line-Canal',
 'Name-Facility-Line-Railroad',
 'Name-Facility-Line-Road',
 'Name-Faci

In [74]:
data ='Anarchism is a <a href="political philosophy">political philosophy</a> and <a href="Political movement">movement</a> that rejects all involuntary, coercive forms of <a href="hierarchy">hierarchy</a>. It <a href="Radical politics">radically</a> calls for the abolition of the <a href="State (polity)">state</a> which it holds to be undesirable, unnecessary and harmful.'
data = '''According to some accounts, he had married <ne type="">Medea</ne> in life, so that after both their deaths they were united in the <ne type="">Elysian Fields</ne> of <ne type="">Hades</ne> – as Hera promised Thetis in <ne type="Person">Apollonius</ne>' "<ne type="">Argonautica</ne>" (3rd century BC).'''
convert_xml_iob(data)
# soup = BeautifulSoup(data)

# for span in soup.select('a[href]'):
#     sup = soup.new_tag('ne')
#     sup.string = span.text
#     # this is the key from wikipedia/dbpedia that will be lookedup in the UNER 
#     uner_lookup_key = urllib.parse.unquote(span.attrs['href'])
#     if  uner_lookup_key in wikiTitle_unerClass_dict:
#         any_entities_found=True
#         sup.attrs["type"] = wikiTitle_unerClass_dict[uner_lookup_key]
#     else:
#         sup.attrs["type"] = ""
#     span.insert_after(sup)
#     span.clear()
#     # replace the span tag with it's contents
#     span.unwrap()
# print(soup)
# if soup.body:
#     soup = "".join([str(x) for x in soup.body.children])

A	O
c	O
c	O
o	O
r	O
d	O
i	O
n	O
g	O
t	O
o	O
s	O
o	O
m	O
e	O
a	O
c	O
c	O
o	O
u	O
n	O
t	O
s	O
,	O
h	O
e	O
h	O
a	O
d	O
m	O
a	O
r	O
r	O
i	O
e	O
d	O
M	OTHER
e	OTHER
d	OTHER
e	OTHER
a	OTHER
i	O
n	O
l	O
i	O
f	O
e	O
,	O
s	O
o	O
t	O
h	O
a	O
t	O
a	O
f	O
t	O
e	O
r	O
b	O
o	O
t	O
h	O
t	O
h	O
e	O
i	O
r	O
d	O
e	O
a	O
t	O
h	O
s	O
t	O
h	O
e	O
y	O
w	O
e	O
r	O
e	O
u	O
n	O
i	O
t	O
e	O
d	O
i	O
n	O
t	O
h	O
e	O
E	OTHER
l	OTHER
y	OTHER
s	OTHER
i	OTHER
a	OTHER
n	OTHER
F	OTHER
i	OTHER
e	OTHER
l	OTHER
d	OTHER
s	OTHER
o	O
f	O
H	OTHER
a	OTHER
d	OTHER
e	OTHER
s	OTHER
–	O
a	O
s	O
H	O
e	O
r	O
a	O
p	O
r	O
o	O
m	O
i	O
s	O
e	O
d	O
T	O
h	O
e	O
t	O
i	O
s	O
i	O
n	O
A	OTHER
p	OTHER
o	OTHER
l	OTHER
l	OTHER
o	OTHER
n	OTHER
i	OTHER
u	OTHER
s	OTHER
'	O
"	O
A	OTHER
r	OTHER
g	OTHER
o	OTHER
n	OTHER
a	OTHER
u	OTHER
t	OTHER
i	OTHER
c	OTHER
a	OTHER
"	O
(	O
3	O
r	O
d	O
c	O
e	O
n	O
t	O
u	O
r	O
y	O
B	O
C	O
)	O
.	O



'A\tO\nc\tO\nc\tO\no\tO\nr\tO\nd\tO\ni\tO\nn\tO\ng\tO\nt\tO\no\tO\ns\tO\no\tO\nm\tO\ne\tO\na\tO\nc\tO\nc\tO\no\tO\nu\tO\nn\tO\nt\tO\ns\tO\n,\tO\nh\tO\ne\tO\nh\tO\na\tO\nd\tO\nm\tO\na\tO\nr\tO\nr\tO\ni\tO\ne\tO\nd\tO\nM\tOTHER\ne\tOTHER\nd\tOTHER\ne\tOTHER\na\tOTHER\ni\tO\nn\tO\nl\tO\ni\tO\nf\tO\ne\tO\n,\tO\ns\tO\no\tO\nt\tO\nh\tO\na\tO\nt\tO\na\tO\nf\tO\nt\tO\ne\tO\nr\tO\nb\tO\no\tO\nt\tO\nh\tO\nt\tO\nh\tO\ne\tO\ni\tO\nr\tO\nd\tO\ne\tO\na\tO\nt\tO\nh\tO\ns\tO\nt\tO\nh\tO\ne\tO\ny\tO\nw\tO\ne\tO\nr\tO\ne\tO\nu\tO\nn\tO\ni\tO\nt\tO\ne\tO\nd\tO\ni\tO\nn\tO\nt\tO\nh\tO\ne\tO\nE\tOTHER\nl\tOTHER\ny\tOTHER\ns\tOTHER\ni\tOTHER\na\tOTHER\nn\tOTHER\nF\tOTHER\ni\tOTHER\ne\tOTHER\nl\tOTHER\nd\tOTHER\ns\tOTHER\no\tO\nf\tO\nH\tOTHER\na\tOTHER\nd\tOTHER\ne\tOTHER\ns\tOTHER\n–\tO\na\tO\ns\tO\nH\tO\ne\tO\nr\tO\na\tO\np\tO\nr\tO\no\tO\nm\tO\ni\tO\ns\tO\ne\tO\nd\tO\nT\tO\nh\tO\ne\tO\nt\tO\ni\tO\ns\tO\ni\tO\nn\tO\nA\tOTHER\np\tOTHER\no\tOTHER\nl\tOTHER\nl\tOTHER\no\tOTHER\nn\tOTHER\ni\tOTHER\nu\tOTHER\ns

## Step 6 UNERv1 Wikipedia Dataset Generation

## Step 7 UNERv1 Wikipedia Dataset Class Statistics

## Step 7 UNERv1 Wikipedia Dataset Class Balancing - Pruning of Unwanted Classes

## Observations


In [59]:
# https://stackoverflow.com/questions/27006463/beautiful-soup-4-how-to-replace-a-tag-with-text-and-another-tag
line = '<a href="Italic%20type">Italic type</a> is commonly used to mark emphasis or more generally to distinguish one part of a text from the rest (set in Roman type).'
soup = BeautifulSoup(line)

for span in soup.select('a[href]'):
    sup = soup.new_tag('ne')
    sup.string = span.text
    # this is the key from wikipedia/dbpedia that will be lookedup in the UNER 
    uner_lookup_key = urllib.parse.unquote(span.attrs['href'])
    print(wikiTitle_unerClass_dict[uner_lookup_key][0][1])
    if  uner_lookup_key in wikiTitle_unerClass_dict:
        any_entities_found=True
        sup.attrs["type"] = wikiTitle_unerClass_dict[uner_lookup_key][0][1]
    else:
        sup.attrs["type"] = ""
    span.insert_after(sup)
    span.clear()
    # replace the span tag with it's contents
    span.unwrap()
print(soup)
soup = "".join([str(x) for x in soup.body])
print(soup)
# check if there were really any entities
# if there were send it for conversion to IOB
# if any_entities_found:
#     print("found")
#     convert_xml_iob(soup)
    # you can write it to file 

Software
<html><body><ne type="Software">Italic type</ne> is commonly used to mark emphasis or more generally to distinguish one part of a text from the rest (set in Roman type).</body></html>
<ne type="Software">Italic type</ne> is commonly used to mark emphasis or more generally to distinguish one part of a text from the rest (set in Roman type).


In [42]:
"".join([str(x) for x in soup.body.children.next])

AttributeError: 'list_iterator' object has no attribute 'next'

In [116]:

# tmp = ['http://www.w3.org/2002/07/owl#Thing', 'http://xmlns.com/foaf/0.1/Person', 'http://dbpedia.org/ontology/Person', 'http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#Agent', 'http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#NaturalPerson', 'http://www.wikidata.org/entity/Q215627', 'http://www.wikidata.org/entity/Q24229398', 'http://www.wikidata.org/entity/Q5', 'http://www.wikidata.org/entity/Q95074', 'http://dbpedia.org/ontology/Agent', 'http://dbpedia.org/ontology/FictionalCharacter', 'http://schema.org/Person', 'http://umbel.org/umbel/rc/FictionalCharacter', 'http://dbpedia.org/class/yago/WikicatFictionalAnthropomorphicCharacters', 'http://dbpedia.org/class/yago/WikicatFictionalCharactersIntroducedIn1958', 'http://dbpedia.org/class/yago/WikicatFirst-runSyndicatedTelevisionProgramsInTheUnitedStates', 'http://dbpedia.org/class/yago/Ability105616246', 'http://dbpedia.org/class/yago/Abstraction100002137', 'http://dbpedia.org/class/yago/Act100030358', 'http://dbpedia.org/class/yago/Action100037396', 'http://dbpedia.org/class/yago/Beginning100235435', 'http://dbpedia.org/class/yago/Broadcast106619428', 'http://dbpedia.org/class/yago/Change100191142', 'http://dbpedia.org/class/yago/ChangeOfState100199130', 'http://dbpedia.org/class/yago/Cognition100023271', 'http://dbpedia.org/class/yago/Creativity105624700', 'http://dbpedia.org/class/yago/Ending106308765', 'http://dbpedia.org/class/yago/Event100029378', 'http://dbpedia.org/class/yago/FictionalCharacter109587565', 'http://dbpedia.org/class/yago/ImaginaryBeing109483738', 'http://dbpedia.org/class/yago/Imagination105625465', 'http://dbpedia.org/class/yago/Introduction100238022', 'http://dbpedia.org/class/yago/LanguageUnit106284225', 'http://dbpedia.org/class/yago/Morpheme106306233', 'http://dbpedia.org/class/yago/Part113809207', 'http://dbpedia.org/class/yago/PsychologicalFeature100023100', 'http://dbpedia.org/class/yago/Relation100031921', 'http://dbpedia.org/class/yago/Show106619065', 'http://dbpedia.org/class/yago/SocialEvent107288639', 'http://dbpedia.org/class/yago/TelevisionProgram106620579', 'http://dbpedia.org/class/yago/Wikicat1958TelevisionSeriesDebuts', 'http://dbpedia.org/class/yago/Wikicat1961AmericanTelevisionSeriesDebuts', 'http://dbpedia.org/class/yago/Wikicat1963AmericanTelevisionSeriesEndings', 'http://dbpedia.org/class/yago/YagoPermanentlyLocatedEntity', 'http://dbpedia.org/class/yago/WikicatAnimatedCharacters']
tmp = ['http://www.w3.org/2002/07/owl#Thing', 'http://xmlns.com/foaf/0.1/Person', 'http://dbpedia.org/ontology/Person', 'http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#Agent', 'http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#NaturalPerson', 'http://www.wikidata.org/entity/Q215627', 'http://www.wikidata.org/entity/Q24229398', 'http://www.wikidata.org/entity/Q36180', 'http://www.wikidata.org/entity/Q5', 'http://dbpedia.org/ontology/Agent', 'http://dbpedia.org/ontology/Writer', 'http://schema.org/Person', 'http://dbpedia.org/class/yago/WikicatPeopleFromCornwall', 'http://dbpedia.org/class/yago/WikicatRoyalNavySailors', 'http://dbpedia.org/class/yago/Abstraction100002137', 'http://dbpedia.org/class/yago/Acquirer109764201', 'http://dbpedia.org/class/yago/Adult109605289', 'http://dbpedia.org/class/yago/Alumnus109786338', 'http://dbpedia.org/class/yago/CausalAgent100007347', 'http://dbpedia.org/class/yago/CommandingOfficer109941964', 'http://dbpedia.org/class/yago/Communication100033020', 'http://dbpedia.org/class/yago/Communicator109610660', 'http://dbpedia.org/class/yago/Dramatist110030277', 'http://dbpedia.org/class/yago/Educator110045713', 'http://dbpedia.org/class/yago/Fiction106367107', 'http://dbpedia.org/class/yago/Gambler110118844', 'http://dbpedia.org/class/yago/Honoree110183757', 'http://dbpedia.org/class/yago/Intellectual109621545', 'http://dbpedia.org/class/yago/Laureate110249011', 'http://dbpedia.org/class/yago/LiteraryComposition106364329', 'http://dbpedia.org/class/yago/LivingThing100004258', 'http://dbpedia.org/class/yago/MilitaryOfficer110317007', 'http://dbpedia.org/class/yago/Novel106367879', 'http://dbpedia.org/class/yago/Novelette106368962', 'http://dbpedia.org/class/yago/Novelist110363573', 'http://dbpedia.org/class/yago/Object100002684', 'http://dbpedia.org/class/yago/Organism100004475', 'http://dbpedia.org/class/yago/Person100007846', 'http://dbpedia.org/class/yago/PhysicalEntity100001930', 'http://dbpedia.org/class/yago/Poet110444194', 'http://dbpedia.org/class/yago/PrizeWinner109627807', 'http://dbpedia.org/class/yago/Professional110480253', 'http://dbpedia.org/class/yago/Recipient109627906', 'http://dbpedia.org/class/yago/Sailor110546633', 'http://dbpedia.org/class/yago/Scholar110557854', 'http://dbpedia.org/class/yago/Schoolteacher110560352', 'http://dbpedia.org/class/yago/Serviceman110582746', 'http://dbpedia.org/class/yago/SkilledWorker110605985', 'http://dbpedia.org/class/yago/Teacher110694258', 'http://dbpedia.org/class/yago/Whole100003553', 'http://dbpedia.org/class/yago/Wikicat1971Novels', 'http://dbpedia.org/class/yago/Winner110782791', 'http://dbpedia.org/class/yago/Worker109632518', 'http://dbpedia.org/class/yago/Writer110794014', 'http://dbpedia.org/class/yago/Writing106362953', 'http://dbpedia.org/class/yago/WrittenCommunication106349220', 'http://dbpedia.org/class/yago/YagoLegalActor', 'http://dbpedia.org/class/yago/YagoLegalActorGeo', 'http://dbpedia.org/class/yago/WikicatCommandersOfTheOrderOfTheBritishEmpire', 'http://dbpedia.org/class/yago/WikicatCornishDramatistsAndPlaywrights', 'http://dbpedia.org/class/yago/WikicatCornishNovelists', 'http://dbpedia.org/class/yago/WikicatCornishPoets', 'http://dbpedia.org/class/yago/WikicatCornishWriters', 'http://dbpedia.org/class/yago/WikicatEnglish-languageWriters', 'http://dbpedia.org/class/yago/WikicatEnglishNobelLaureates', 'http://dbpedia.org/class/yago/WikicatEnglishNovelists', 'http://dbpedia.org/class/yago/WikicatEnglishPeople', 'http://dbpedia.org/class/yago/WikicatEnglishWriters', 'http://dbpedia.org/class/yago/Wikicat20th-centuryBritishNovelists', 'http://dbpedia.org/class/yago/Wikicat20th-centuryNovelists', 'http://dbpedia.org/class/yago/WikicatAlumniOfBrasenoseCollege', 'Oxford', 'http://dbpedia.org/class/yago/WikicatAlumniOfTheUniversityOfOxford', 'http://dbpedia.org/class/yago/WikicatBookerPrizeWinners', 'http://dbpedia.org/class/yago/WikicatBritishNobelLaureates', 'http://dbpedia.org/class/yago/WikicatBritishNovellas', 'http://dbpedia.org/class/yago/WikicatBritishPeople', 'http://dbpedia.org/class/yago/WikicatBritishSchoolteachers', 'http://dbpedia.org/class/yago/WikicatBritishScienceFictionWriters', 'http://dbpedia.org/class/yago/WikicatBritishWriters', 'http://dbpedia.org/class/yago/WikicatWriters', 'http://dbpedia.org/class/yago/WikicatMaritimeWriters', 'http://dbpedia.org/class/yago/WikicatNobelLaureatesInLiterature', 'http://dbpedia.org/class/yago/WikicatNovelsByWilliamGolding', 'http://dbpedia.org/class/yago/WikicatOfficersOfTheOrderOfTheBritishEmpire', 'http://dbpedia.org/class/yago/WikicatPeopleFromMarlborough', 'http://dbpedia.org/class/yago/WikicatPeopleFromNewquay']
priority_queue = []
priority_set =set()
# proces the classes
for cls in tmp:
    # take the last part of the url => usually entity is found as last
    class_name = (cls.split("/")[-1])
    print(class_name)
    # check if the class in Dbpedia_v1 and if it is check its corresponding mapping exists, 
    # it could be blank and we ignore the blank classes
    if class_name in UNER_Dbpedia_v1 and UNER_Dbpedia_v1[class_name]:
        #check if the class has heirachy
        if class_name in Dbpedia_priority:
#                         make sure whatever you are adding doesnt affect the possible selection hence uniques are added
            if class_name not in priority_set or True:
                #push it into queue with its priority
                priority_set.add(class_name)
                heapq.heappush(priority_queue, (Dbpedia_priority[class_name], class_name))
print(priority_queue)
# For Yogi bear
# [(3, 'FictionalCharacter'), (3, 'FictionalCharacter'), (3, 'Person'), (3, 'Person'), (3, 'Person')]

owl#Thing
Person
Person
DUL.owl#Agent
DUL.owl#NaturalPerson
Q215627
Q24229398
Q36180
Q5
Agent
Writer
Person
WikicatPeopleFromCornwall
WikicatRoyalNavySailors
Abstraction100002137
Acquirer109764201
Adult109605289
Alumnus109786338
CausalAgent100007347
CommandingOfficer109941964
Communication100033020
Communicator109610660
Dramatist110030277
Educator110045713
Fiction106367107
Gambler110118844
Honoree110183757
Intellectual109621545
Laureate110249011
LiteraryComposition106364329
LivingThing100004258
MilitaryOfficer110317007
Novel106367879
Novelette106368962
Novelist110363573
Object100002684
Organism100004475
Person100007846
PhysicalEntity100001930
Poet110444194
PrizeWinner109627807
Professional110480253
Recipient109627906
Sailor110546633
Scholar110557854
Schoolteacher110560352
Serviceman110582746
SkilledWorker110605985
Teacher110694258
Whole100003553
Wikicat1971Novels
Winner110782791
Worker109632518
Writer110794014
Writing106362953
WrittenCommunication106349220
YagoLegalActor
YagoLegalActor