##Initialization

In [0]:
!pip install SPARQLWrapper

import threading, re, sys, os, time, csv, requests, random, json, tempfile, math, itertools, google.auth, urllib.request
import pandas as pd
import numpy as np
from urllib.request import urlopen 
from six.moves.urllib.request import urlopen
from SPARQLWrapper import SPARQLWrapper, JSON
from lxml import html
from bs4 import BeautifulSoup
from google.colab import drive
from google.cloud import storage
from google.colab import auth
from datetime import datetime
from six import BytesIO
from PIL import Image

# Module-level SPARQL client for the public DBpedia endpoint; used by the
# entity-count cell further down.
sparql = SPARQLWrapper("http://dbpedia.org/sparql") 

#Define locations & mount Google Drive
# `directory` is the Drive folder that every input/output file in this
# notebook is read from or written to; it is relative to the "drive" mount point.
directory = "drive/My Drive/ISE/dbo Classes/"
drive.mount("drive", force_remount=True)

#Authentication & initialization Google Cloud
auth.authenticate_user()

# NOTE(review): `auth_info` is not referenced anywhere else in the visible part
# of the notebook; the read still fails loudly if /content/adc.json is missing.
# Presumably that file is produced by auth.authenticate_user() -- confirm.
with open('/content/adc.json', 'r') as f:
  auth_info = json.load(f)
# NOTE(review): the `project` returned here is not used below; the storage
# client is created with a hard-coded project id instead.
credentials, project = google.auth.default()

# Cloud Storage client and the bucket handle used by the rest of the notebook.
client = storage.Client(credentials=credentials, project='ise-project-259623')
bucket = client.get_bucket('ise-bucket')

#Create Training Dataset

##Load dbo classes and entities
*Only needs to be executed the first time.*

Read dbo classes list


In [0]:
# Load the dbo class names, one per line, from the class list file.
# Surrounding whitespace is stripped and blank lines are skipped so that no
# empty or padded class name ends up in the SPARQL queries built from this list.
dbo_classes = []
with open(directory + "dbo Ontology Classes List.txt", "r") as f:
  dbo_classes = [line.strip() for line in f if line.strip()]

Check number of resources in dbo classes via Sparql


In [0]:
# Count, for every dbo class, how many resources are typed with it.
# Result: `entity_counts`, a list of (class name, count) tuples in the order
# of `dbo_classes`.
entity_counts = []

# The return format is the same for every request, so it is set once.
sparql.setReturnFormat(JSON)

# Sparql request
for dbo_class in dbo_classes:
  # The aggregate gets an explicit name (?entity_count) so the value can be
  # read back without depending on a column name generated by the endpoint.
  # The dbo: prefix is declared so the query does not rely on prefixes that
  # happen to be predefined on the server.
  query = """PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
  PREFIX dbo: <http://dbpedia.org/ontology/>
  select (count(?entity) as ?entity_count)
  where {
  ?entity rdf:type dbo:""" + dbo_class + "}"
  sparql.setQuery(query)
  results = sparql.query().convert()
  count = results["results"]["bindings"][0]["entity_count"]["value"]
  entity_counts.append((dbo_class, int(count)))
  # Progress indicator: one line per 100 processed classes.
  if len(entity_counts) % 100 == 0:
    print(len(entity_counts))

In [0]:
# Turn the (class, count) tuples into a table, largest classes first, and
# persist it as a TSV so the counting step does not have to be repeated.
# Building the frame directly from the list avoids creating one DataFrame per
# row and also works when `entity_counts` is empty (pd.concat would raise).
class_counts = pd.DataFrame(entity_counts, columns=["class", "entity_count"])
class_counts = class_counts.sort_values(by="entity_count", ascending=False)
class_counts.to_csv(directory + "dbo_class_entity_counts.tsv", sep="\t", index=False)

---

##**Get X random resources** for top 100 classes

 Select top 100 classes

In [0]:
# Reload the per-class entity counts from the tab-separated file in `directory`.
class_counts = pd.read_csv(f"{directory}dbo_class_entity_counts.tsv", sep="\t")

Filtering dbo classes...
* containing many resources without images (only <5% images)
* that are too general to infer ("Image","Agent",...)


In [0]:
# dbo classes that are dropped from the class list before the top classes
# are selected.
rejected_classes = [
    "Image",
    "Agent",
    "CareerStation",
    "OrganisationMember",
    "SportsSeason",
    "SportsEvent",
    "SportsTeamMember",
    "SportsTeamSeason",
    "TimePeriod",
    "NCAATeamSeason",
    "FootballLeagueSeason",
    "MotorsportSeason",
    "Engine",
    "AutomobileEngine",
    "RadioStation",
    "PersonFunction",
    "SoccerManager",
    "AmericanFootballPlayer",
]

In [0]:
# Boolean mask over the rows of class_counts: True for every class that is
# not on the rejection list.
class_filter = [name not in rejected_classes for name in class_counts["class"]]
class_counts = class_counts[class_filter]

# First 100 / 200 remaining class names, taken positionally in the current
# row order of class_counts.
top100_dbo_classes = class_counts["class"].iloc[:100].tolist()
top200_dbo_classes = class_counts["class"].iloc[:200].tolist()

In [0]:
# Show the selected class names as the cell output for a quick visual check.
top100_dbo_classes

Up to 10k resources:

In [0]:
def get_random_resources(number, top_classes):
  """Fetch random DBpedia resources per class with one request each and save them.

  For every class name in `top_classes` a single SPARQL query
  (ORDER BY RAND() LIMIT `number`) is sent to the DBpedia endpoint. The
  returned entity URIs are collected in a dict keyed by class name, which is
  written as JSON to
  `directory` + "top100_dbo_<number>_random_resources.txt".

  Args:
    number: value used as the LIMIT of each query.
    top_classes: iterable of dbo class names (strings).

  Returns:
    None; the result is only written to the file.
  """
  sampled = {}
  for class_name in top_classes:
    endpoint = SPARQLWrapper("http://dbpedia.org/sparql")
    endpoint.setQuery(
        "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>\n"
        "      select distinct ?entity\n"
        "      where {\n"
        "      ?entity rdf:type dbo:" + class_name + "} ORDER BY RAND() LIMIT " + str(number)
    )
    endpoint.setReturnFormat(JSON)
    bindings = endpoint.query().convert()["results"]["bindings"]
    sampled[class_name] = [str(row["entity"]["value"]) for row in bindings]

    # Progress indicator: one line per 10 collected classes.
    finished = len(sampled)
    if finished % 10 == 0:
      print(finished)

  out_path = directory + "top100_dbo_" + str(number) + "_random_resources.txt"
  with open(out_path, 'w') as out_file:
    out_file.write(json.dumps(sampled))

More than 10k resources

In [0]:
def get_more_random_resources(number, top_n_classes, n):
  """Collect random DBpedia resources per class in repeated batches and save them.

  For each of the first `n` class names in `top_n_classes`, random batches of
  entity URIs (ORDER BY RAND() LIMIT ...) are requested from the DBpedia
  endpoint and merged into a set until it holds at least `number` entries or
  stops growing. The resulting dict (class name -> list of URIs) is written as
  JSON to `directory` + "top<n>_dbo_<number>_random_resources.txt".

  Notes:
    * A class may end up with more than `number` URIs (the last batch is not
      truncated) or with fewer (collection stops once three consecutive
      requests add nothing new).
    * Nothing is returned on purpose: the call is used as the last expression
      of a notebook cell, where a returned dict would be printed in full.

  Args:
    number: minimum number of distinct resources wanted per class.
    top_n_classes: sequence of dbo class names (strings).
    n: how many classes from the start of `top_n_classes` to process; also
      used in the output file name.
  """
  rand_resources = {}
  for i, dbo_class in enumerate(top_n_classes[:n]):
    print(str(i) + ": " + dbo_class)
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    sparql.setReturnFormat(JSON)
    results = set()
    check_lengths = []  # size of `results` after each request
    while len(results) < number:
      # Batch size: at most 10000, reduced by a random 0-100.
      # NOTE(review): the 10000 cap presumably mirrors the endpoint's result
      # limit and the jitter presumably varies the query text between
      # requests -- confirm. The floor of 1 prevents "LIMIT 0" requests,
      # which return nothing, when `number` is 100 or less.
      request_limit = max(1, min(10000, number) - random.randint(0, 100))
      query = """PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        select distinct ?entity
        where {
        ?entity rdf:type dbo:""" + str(dbo_class) + "} ORDER BY RAND() LIMIT " + str(request_limit)
      sparql.setQuery(query)
      results_json = sparql.query().convert()
      results.update(str(row["entity"]["value"]) for row in results_json["results"]["bindings"])
      check_lengths.append(len(results))
      print(len(results))
      # Stop when the last three requests added no new resource (the class is
      # exhausted or the endpoint keeps returning known entities). The length
      # check replaces a bare try/except around the index lookup.
      if len(check_lengths) >= 4 and check_lengths[-4] == check_lengths[-1]:
        break
    rand_resources[dbo_class] = list(results)
  with open(directory + "top" + str(n) + "_dbo_" + str(number) + "_random_resources.txt", 'w') as file:
    json.dump(rand_resources, file)

###**Run** random resource collection

In [0]:
# Number of resources requested per class.
number_of_resources = 50000
# Batched collection for the first 100 entries of top100_dbo_classes; the
# function writes its result to a JSON file under `directory`.
get_more_random_resources(number_of_resources,top100_dbo_classes,100)
#get_random_resources(number_of_resources,top100_dbo_classes) #up to 10k