# Imports and Functions

In [1]:
import pandas as pd
import numpy as np
import spacy
import requests
import spacy_dbpedia_spotlight
from bs4 import BeautifulSoup
from geopy.distance import geodesic

In [2]:
def find_lat_long(raw_str):
    """Parse the two numbers out of a ``POINT(a b)`` string.

    The values are returned in the order they appear in the text. Despite the
    function name, the callers in this notebook unpack the result as
    ``long, lat = find_lat_long(...)``, i.e. they treat the first number as the
    longitude and the second as the latitude.

    Surrounding whitespace, an optional space between ``POINT`` and the opening
    parenthesis, and a missing ``POINT`` tag are all tolerated.

    Parameters
    ----------
    raw_str : str
        Text such as ``'POINT(-87.68 41.65)'``.

    Returns
    -------
    tuple of (float, float)
        The first and second number found in the text.

    Raises
    ------
    ValueError
        If one of the first two tokens is not a valid float.
    IndexError
        If fewer than two whitespace-separated tokens are present.
    """
    text = raw_str.strip()

    # Remove the tag as a real prefix; str.strip('POINT(') would treat the
    # argument as a set of characters rather than as a prefix.
    if text.upper().startswith('POINT'):
        text = text[len('POINT'):]

    # Here the character-set semantics of strip() are intended: drop any
    # parentheses left around the coordinates.
    tokens = text.strip().strip('()').split()
    first = float(tokens[0])
    second = float(tokens[1])

    return first, second

In [3]:
def centeroidnp(arr):
    """Return the centroid of a set of 2-D points as ``[mean of column 0, mean of column 1]``.

    ``arr`` is indexed as ``arr[:, 0]`` and ``arr[:, 1]``, so it must be at
    least two-dimensional; a one-dimensional array (for example
    ``np.asarray([])``) raises ``IndexError``. The centroid loops in this
    notebook rely on that error to skip entities without any geo points.
    """
    n_points = arr.shape[0]

    # Sum each coordinate column separately, then divide by the number of rows.
    return [np.sum(arr[:, column]) / n_points for column in (0, 1)]

In [4]:
def find_shortest_geodistance(lat_long_list_1, lat_long_list_2):
    """Return the smallest geodesic distance between any point of the first list and any point of the second.

    Parameters
    ----------
    lat_long_list_1, lat_long_list_2 : list of coordinate pairs
        Every element is passed unchanged to ``geodesic`` (imported from
        ``geopy.distance`` at the top of the notebook).

    Returns
    -------
    The minimum of the ``geodesic(...)`` results over all cross pairs, or
    ``None`` when there are no pairs (either list is empty).
    """
    pairwise_distances = (
        geodesic(point_1, point_2)
        for point_1 in lat_long_list_1
        for point_2 in lat_long_list_2
    )

    # min(..., default=None) handles the "no pairs" case explicitly, without
    # sorting every distance or wrapping an index lookup in a bare except.
    return min(pairwise_distances, default=None)

# Load Data

In [5]:
# Load the prediction table; later cells read its `raw_sentence`, `wikidata`,
# `0_prob` and `1_prob` columns.
df = pd.read_csv(filepath_or_buffer='./data/predicted_label_first_two_sentence_all_label.csv')

# DBpedia Spotlight for SpaCy

In [6]:
# Start from the small English spaCy pipeline ...
nlp = spacy.load(name='en_core_web_sm')
# ... and append the DBpedia Spotlight stage; add_pipe returns the new
# component, which is what this cell displays.
nlp.add_pipe(factory_name='dbpedia_spotlight')

<spacy_dbpedia_spotlight.entity_linker.EntityLinker at 0x4a639d30>

In [7]:
# Sanity check: run one short phrase through the pipeline
doc = nlp('place of birth Taiwan')
# List every entity as (text, label, knowledge-base id), followed by a blank line
print('Entities', [(e.text, e.label_, e.kb_id_) for e in doc.ents], end='\n\n')
# Show the raw DBpedia Spotlight result attached to the first entity
print(doc.ents[0]._.dbpedia_raw_result)

Entities [('Taiwan', 'DBPEDIA_ENT', 'http://dbpedia.org/resource/Taiwan')]

{'@URI': 'http://dbpedia.org/resource/Taiwan', '@support': '70226', '@types': 'Wikidata:Q6256,Schema:Place,Schema:Country,DBpedia:PopulatedPlace,DBpedia:Place,DBpedia:Location,DBpedia:Country', '@surfaceForm': 'Taiwan', '@offset': '15', '@similarityScore': '0.9997260518544173', '@percentageOfSecondRank': '1.2189605528416695E-4'}


In [56]:
# NOTE(review): every line in this cell is commented out, so nothing here
# executes. The snippets below are scratch/debug code that prints the raw
# DBpedia Spotlight results for `doc` and for the first row of `df`.
# for i in range(len(doc.ents)):
#     print(doc.ents[i]._.dbpedia_raw_result)
#     print('')
    
# test_sentence = df.iloc[0]['raw_sentence']
# test_claim = df.iloc[0]['wikidata']
# test_entity_sentence = nlp(test_sentence)
# test_entity_claim = nlp(test_claim)

# print(test_entity_sentence.ents)
# print(test_claim)
# print(test_entity_sentence.ents[1]._.dbpedia_raw_result['@URI'])

# for i in range(len(test_entity_claim.ents)):
#     print(test_entity_claim.ents[i]._.dbpedia_raw_result)
#     print('')
    
# for i in range(len(test_entity_sentence.ents)):
#     print(test_entity_sentence.ents[i]._.dbpedia_raw_result)
#     print('')

# Task

### Find Geo Points

In [13]:
# Keep the 50 rows with the highest `0_prob` and the 50 rows with the highest
# `1_prob`, each renumbered from 0.
# The chain builds each frame in one expression instead of calling
# reset_index(inplace=True) on a slice of a temporary sorted frame.
df_0 = df.sort_values(by=['0_prob'], ascending=False).head(50).reset_index(drop=True)
df_1 = df.sort_values(by=['1_prob'], ascending=False).head(50).reset_index(drop=True)

In [15]:
# Accumulators for the per-row centroid lists (sentence / claim), one pair per label
centroid_sentence_0, centroid_claim_0 = [], []
centroid_sentence_1, centroid_claim_1 = [], []

In [16]:
# label = 0
# For every sentence/claim pair in df_0, link the DBpedia entities in both
# texts and store one [lat, long] centroid per entity whose DBpedia page lists
# geo:geometry points. Entities without a link, without a reachable page or
# without any parsable point are skipped (best effort).

# Upper bound (seconds) for each page request, so a stalled connection cannot hang the cell
REQUEST_TIMEOUT_S = 30

# Rebind to fresh lists so that re-running this cell does not append a second copy of the results
centroid_sentence_0 = []
centroid_claim_0 = []

# run through each sentence-claim pair
for sentence_text, claim_text in zip(df_0['raw_sentence'], df_0['wikidata']):
    # The sentence (position 0) and the claim (position 1) are processed identically
    centroids_per_text = []

    for linked_doc in (nlp(sentence_text), nlp(claim_text)):
        doc_centroids = []

        # run through all dbpedia entities in the text
        for ent in linked_doc.ents:
            raw_result = ent._.dbpedia_raw_result
            if not raw_result or '@URI' not in raw_result:
                continue  # no DBpedia link on this entity

            try:
                response = requests.get(raw_result['@URI'], timeout=REQUEST_TIMEOUT_S)
                response.raise_for_status()
            except requests.RequestException:
                # Only network/HTTP failures are skipped; KeyboardInterrupt and
                # programming errors are no longer swallowed by a bare except.
                continue

            page = BeautifulSoup(response.text, 'lxml')
            points = []

            # find all geometry points
            for literal in page.find_all('span', class_='literal'):
                geometry = literal.find('span', property="geo:geometry")
                if geometry is None:
                    continue
                try:
                    # find_lat_long returns the numbers in textual order; the
                    # first is treated as longitude, the second as latitude
                    long, lat = find_lat_long(geometry.text)
                except (ValueError, IndexError):
                    continue  # malformed point text
                points.append([lat, long])

            # calculate the centroid only if the entity has latitude and longitude information
            if points:
                doc_centroids.append(centeroidnp(np.asarray(points)))

        centroids_per_text.append(doc_centroids)

    centroid_sentence_0.append(centroids_per_text[0])
    centroid_claim_0.append(centroids_per_text[1])

In [19]:
# Attach the collected centroid lists to the label-0 frame as two new columns
df_0 = df_0.assign(
    sentence_centroid=centroid_sentence_0,
    claim_centroid=centroid_claim_0,
)

In [24]:
# label = 1
# For every sentence/claim pair in df_1, link the DBpedia entities in both
# texts and store one [lat, long] centroid per entity whose DBpedia page lists
# geo:geometry points. Entities without a link, without a reachable page or
# without any parsable point are skipped (best effort).

# Upper bound (seconds) for each page request, so a stalled connection cannot hang the cell
REQUEST_TIMEOUT_S = 30

# Rebind to fresh lists so that re-running this cell does not append a second copy of the results
centroid_sentence_1 = []
centroid_claim_1 = []

# run through each sentence-claim pair
for sentence_text, claim_text in zip(df_1['raw_sentence'], df_1['wikidata']):
    # The sentence (position 0) and the claim (position 1) are processed identically
    centroids_per_text = []

    for linked_doc in (nlp(sentence_text), nlp(claim_text)):
        doc_centroids = []

        # run through all dbpedia entities in the text
        for ent in linked_doc.ents:
            raw_result = ent._.dbpedia_raw_result
            if not raw_result or '@URI' not in raw_result:
                continue  # no DBpedia link on this entity

            try:
                response = requests.get(raw_result['@URI'], timeout=REQUEST_TIMEOUT_S)
                response.raise_for_status()
            except requests.RequestException:
                # Only network/HTTP failures are skipped; KeyboardInterrupt and
                # programming errors are no longer swallowed by a bare except.
                continue

            page = BeautifulSoup(response.text, 'lxml')
            points = []

            # find all geometry points
            for literal in page.find_all('span', class_='literal'):
                geometry = literal.find('span', property="geo:geometry")
                if geometry is None:
                    continue
                try:
                    # find_lat_long returns the numbers in textual order; the
                    # first is treated as longitude, the second as latitude
                    long, lat = find_lat_long(geometry.text)
                except (ValueError, IndexError):
                    continue  # malformed point text
                points.append([lat, long])

            # calculate the centroid only if the entity has latitude and longitude information
            if points:
                doc_centroids.append(centeroidnp(np.asarray(points)))

        centroids_per_text.append(doc_centroids)

    centroid_sentence_1.append(centroids_per_text[0])
    centroid_claim_1.append(centroids_per_text[1])

146 161 {'@URI': 'http://dbpedia.org/resource/John_Meyendorff', '@support': '64', '@types': '', '@surfaceForm': 'John Meyendorff', '@offset': '146', '@similarityScore': '1.0', '@percentageOfSecondRank': '0.0'}


In [25]:
# Attach the collected centroid lists to the label-1 frame as two new columns
df_1 = df_1.assign(
    sentence_centroid=centroid_sentence_1,
    claim_centroid=centroid_claim_1,
)

### Find Shortest Distance

In [48]:
# For each label-0 row, the smallest distance between any sentence centroid and
# any claim centroid (None when either side has no centroid)
shortest_0_list = [
    find_shortest_geodistance(sentence_points, claim_points)
    for sentence_points, claim_points in zip(df_0['sentence_centroid'], df_0['claim_centroid'])
]

df_0['shortest'] = shortest_0_list

In [49]:
# For each label-1 row, the smallest distance between any sentence centroid and
# any claim centroid (None when either side has no centroid)
shortest_1_list = [
    find_shortest_geodistance(sentence_points, claim_points)
    for sentence_points, claim_points in zip(df_1['sentence_centroid'], df_1['claim_centroid'])
]

df_1['shortest'] = shortest_1_list

In [50]:
# Preview the first rows only instead of rendering the whole frame
df_0.head(10)

Unnamed: 0,sentence,wikidata,label,raw_sentence,0_prob,1_prob,2_prob,sentence_centroid,claim_centroid,shortest
0,"he was born in blue island, illinois, raised i...",place of birth Tinley Park,,"He was born in Blue Island, Illinois, raised i...",0.989592,0.004328,0.00608,"[[41.657222747803, -87.680000305176], [41.5738...","[[41.573890686035, -87.803886413574]]",0.0 km
1,dong yu (chinese: 董宇; born 15 july 1994 in qin...,place of birth Qingdao,,Dong Yu (Chinese: 董宇; born 15 July 1994 in Qin...,0.988216,0.003877,0.007907,"[[36.066898345947, 120.38269805908]]","[[36.066898345947, 120.38269805908]]",0.0 km
2,elisabeth clara heath-sladen (1 february 1946s...,place of birth Liverpool,,Elisabeth Clara Heath-Sladen (1 February 1946S...,0.987689,0.004745,0.007565,[],"[[53.400001525879, -2.9833333492279]]",
3,fløgstad was born in the industrial city of sa...,place of birth Sauda,,Fløgstad was born in the industrial city of Sa...,0.987566,0.006371,0.006063,"[[59.6875, 6.4372220039368], [59.227500915527,...","[[59.6875, 6.4372220039368]]",0.0 km
4,he was born 10 december 1934 at yunlin county ...,"place of birth Beigang, Yunlin",,He was born 10 December 1934 at Yunlin County ...,0.987284,0.005804,0.006913,"[[23.704889297485, 120.47606658936], [24.53333...","[[23.566667556763, 120.30000305176]]",23.602633904718154 km
5,"he was born in yilan, taiwan.lan cheng-lung at...",place of birth Luodong,,"He was born in Yilan, Taiwan.Lan Cheng-lung at...",0.98724,0.004438,0.008323,[],"[[24.676683425903, 121.76692199707]]",
6,1786 – 11 march 1870) was born at menkhoaneng ...,place of birth Lesotho,,1786 – 11 March 1870) was born at Menkhoaneng ...,0.987205,0.005485,0.00731,"[[-28.885555267334, 28.292499542236], [-29.533...","[[-29.5333337783815, 28.116665840149]]",0.0 km
7,edward ou () was born on 16 october 1980 in ta...,place of birth Taiwan,,Edward Ou () was born on 16 October 1980 in Ta...,0.987157,0.002633,0.01021,"[[24.5333337783815, 121.25833511352499]]","[[24.5333337783815, 121.25833511352499]]",0.0 km
8,"frank johnson goodnow (january 18, 1859 – nove...",place of birth Brooklyn,,"Frank Johnson Goodnow (January 18, 1859 – Nove...",0.987102,0.004501,0.008397,"[[40.658750534058, -73.971252441406], [40.7127...","[[40.658750534058, -73.971252441406]]",0.0 km
9,samir brahimi is an algerian boxer born on 17 ...,place of birth Algiers,,Samir Brahimi is an Algerian boxer born on 17 ...,0.987089,0.004305,0.008606,"[[36.753887176514, 3.0588889122009]]","[[36.753887176514, 3.0588889122009]]",0.0 km


In [51]:
# Preview the first rows only instead of rendering the whole frame
df_1.head(10)

Unnamed: 0,sentence,wikidata,label,raw_sentence,0_prob,1_prob,2_prob,sentence_centroid,claim_centroid,shortest
0,sid ahmed ghozali () (born 31 march 1937 in ma...,place of birth ouedjda,,Sid Ahmed Ghozali () (born 31 March 1937 in Ma...,0.015317,0.961332,0.023351,"[[34.861667633057, -1.7305555343628], [32.3500...",[],
1,"walid bidani (born 11 june 1994 in maghnia, al...",place of birth ouedjda,,"Walid Bidani (born 11 June 1994 in Maghnia, Al...",0.022689,0.948487,0.028825,"[[34.861667633057, -1.7305555343628], [32.3500...",[],
2,ahmed ben bella ( ; 25 december 1916 – 11 apri...,place of birth ouedjda,,Ahmed Ben Bella ( ; 25 December 1916 – 11 Apri...,0.027831,0.92056,0.051609,"[[32.3500003814695, 2.6083333492279]]",[],
3,"peter balakian (, born june 13, 1951) is an ar...",place of birth Teaneck,,"Peter Balakian (, born June 13, 1951) is an Ar...",0.075744,0.874762,0.049494,[],"[[40.890316009521, -74.011474609375]]",
4,"xiong xiling, or hsiung hsi-ling (simplified c...",place of birth fenghuang,,"Xiong Xiling, or Hsiung Hsi-ling (Simplified C...",0.097139,0.836282,0.066578,"[[24.5333337783815, 121.25833511352499]]",[],
5,elisabeth baldauf (born 3 august 1990) is an...,place of birth Egg,,Elisabeth Baldauf (born 3 August 1990) is an...,0.069777,0.78468,0.145542,[],[],
6,"tang feifan (; july 23, 1897 - september 30, 1...",place of birth Liling,,"Tang Feifan (; July 23, 1897 - September 30, 1...",0.066582,0.779012,0.154406,"[[39.916667938232, 116.38333129883]]",[],
7,gurmeet ram rahim singh insan (born 15 august ...,place of birth Shri gurusar modia,,Gurmeet Ram Rahim Singh Insan (born 15 August ...,0.174095,0.77277,0.053135,[],[],
8,patrick john miguel van aanholt (born 29 augus...,place of birth 's-Hertogenbosch,,Patrick John Miguel van Aanholt (born 29 Augus...,0.093064,0.768476,0.13846,"[[52.366664886475, 4.8833332061768]]",[],
9,jennifer tilly (born jennifer ellen chan; sept...,place of birth Harbor City,,Jennifer Tilly (born Jennifer Ellen Chan; Sept...,0.205799,0.724266,0.069936,[],"[[33.790000915527, -118.29694366455]]",


In [52]:
# Write both annotated frames to disk as CSV (target paths carry no file extension).
# NOTE(review): the `shortest` column is displayed above as text like "0.0 km",
# so it is presumably written as a string rather than a number; confirm before
# reading these files back.
df_0.to_csv(path_or_buf='./data/DBSpotlight/EN_0_top_50_db_centroid')
df_1.to_csv(path_or_buf='./data/DBSpotlight/EN_1_top_50_db_centroid')