## Initialization

In [0]:
import threading, re, sys, os, time, csv, requests, random, json, tempfile, math, itertools, google.auth, urllib.request
import pandas as pd
import numpy as np
from urllib.request import urlopen 
from six.moves.urllib.request import urlopen
from SPARQLWrapper import SPARQLWrapper, JSON
from lxml import html
from bs4 import BeautifulSoup
from google.colab import drive
from google.cloud import storage
from google.colab import auth
from datetime import datetime
from six import BytesIO
from PIL import Image

# SPARQL client pointed at the public DBpedia endpoint.
sparql = SPARQLWrapper("http://dbpedia.org/sparql") 

#Define locations & mount Google Drive
# `directory` is the Drive folder that resource files are read from and the
# training CSV is written to (see load_resources_file / run_scraping).
directory = "drive/My Drive/ISE/dbo type hierarchy/"
drive.mount("drive", force_remount=True)

#Authentication & initialization Google Cloud
# Interactive Colab login; must run before the credential lookups below.
auth.authenticate_user()

# NOTE(review): /content/adc.json is presumably the credentials file written by
# authenticate_user() - confirm. `auth_info` is not referenced anywhere else in
# the visible notebook.
with open('/content/adc.json', 'r') as f:
  auth_info = json.load(f)
# `project` returned here is not used below; the client is created with a
# hard-coded project id instead.
credentials, project = google.auth.default()

# Module-level storage client and bucket handle used by upload_image / create_csv.
client = storage.Client(credentials=credentials, project='ise-project-259623')
bucket = client.get_bucket('ise-bucket')

# Create Training Dataset

### Load 1k/10k/40k random resources for images


In [0]:
def load_resources_file(filename):
  """Read a JSON resource file located in the module-level `directory`.

  Args:
    filename: File name appended verbatim to `directory`.

  Returns:
    The parsed JSON content.
  """
  resources_path = directory + filename
  with open(resources_path) as handle:
    return json.load(handle)

### Helper & Image transfer functions to Google Cloud Storage

In [0]:
def download_image(url,filename,ignore_small = True):
  """Fetch the image at `url` and store it as a JPEG in /tmp/<filename>.

  Args:
    url: Address of the image to fetch.
    filename: File name (no directory part) used for the copy under /tmp/.
    ignore_small: When truthy, images whose shorter side is below 45 pixels
      are rejected.

  Returns:
    True if the image was written to /tmp/, False if it could not be
    fetched, decoded or saved, or if it was rejected as too small.
  """
  try:
    # Close the HTTP response as soon as the payload has been read.
    with urlopen(url) as response:
      image_data = BytesIO(response.read())
    pil_image = Image.open(image_data)
  except Exception:
    print("Error in 'download image': " + url)
    return False

  #Check image size (PIL reports size as (width, height))
  width, height = pil_image.size
  # Logical `and` rather than bitwise `&`, so any truthy flag enables the check.
  if ignore_small and min(width, height) < 45:
    return False

  try:
    # JPEG has no alpha channel / palette support, hence the RGB conversion.
    pil_image_rgb = pil_image.convert("RGB")
    pil_image_rgb.save('/tmp/'+filename, format="JPEG", quality=60)
    return True
  except Exception:
    print("Error in 'download image': " + url)
    return False

def upload_image(filename,upload_path):
  """Send /tmp/<filename> to the bucket as object `upload_path + filename`."""
  target_blob = bucket.blob("{}{}".format(upload_path, filename))
  local_copy = "{}{}".format('/tmp/', filename)
  target_blob.upload_from_filename(local_copy)

def read_tsv(file_name, quotechar=None):
  """Parse a tab-separated file into a list of rows.

  Args:
    file_name: Path of the file to read.
    quotechar: Passed through to csv.reader.

  Returns:
    A list with one list of string fields per line of the file.
  """
  with open(file_name, "r") as tsv_file:
    return list(csv.reader(tsv_file, delimiter="\t", quotechar=quotechar))

def get_thumbnail_url(wikimedia_url):
  """Return the URL of the first JPEG <img> found on a file description page.

  Args:
    wikimedia_url: Address of the page to scan.

  Returns:
    The image's `src`, with "http:" prepended unless `src` already starts
    with an http:// or https:// scheme.

  Raises:
    IndexError: If the page contains no <img> whose `src` has a .jpg/.jpeg
      extension. Callers in this notebook rely on an exception to fall back.
  """
  # Close the HTTP response once the page has been parsed.
  with urlopen(wikimedia_url) as html_data:
    bs = BeautifulSoup(html_data, 'html.parser')

  # Escaped dot: an unescaped '.jpg' would also match e.g. "xjpg" in a path.
  jpeg_src = re.compile(r'\.jpe?g', re.IGNORECASE)
  img_lines = bs.find_all('img', {'src': jpeg_src})
  if not img_lines:
    raise IndexError("No JPEG image found on " + wikimedia_url)

  src = img_lines[0]["src"]
  if src.startswith(("http://", "https://")):
    # Already absolute; prefixing "http:" would produce an invalid URL.
    return src
  return "http:" + src

def get_image_usage(wikimedia_url, timeout=30):
  """Count the entries in the "links to image" list of a file description page.

  The count is used as the document-frequency term of the tf-idf ranking.

  Args:
    wikimedia_url: Address of the page to scan.
    timeout: Seconds to wait for the HTTP request before giving up, so a
      stalled connection cannot block a scraping thread indefinitely.

  Returns:
    Number of child elements of the first <ul> inside the element with id
    "mw-imagepage-section-linkstoimage".

  Raises:
    IndexError: If the page has no such list. Callers in this notebook rely
      on an exception to skip the image.
  """
  pageContent = requests.get(wikimedia_url, timeout=timeout)
  tree = html.fromstring(pageContent.content)
  links_to_image = tree.xpath('//*[@id="mw-imagepage-section-linkstoimage"]/ul')
  if not links_to_image:
    raise IndexError("No file-usage section found on " + wikimedia_url)
  return len(links_to_image[0])

Create a CSV with the input (image URL in the bucket) and the output (rdf_type) for training


In [0]:
def create_csv(prefix,export_path):
  """Write a training CSV listing every bucket object under `prefix`.

  Each row holds the object's gs:// URL ("Input") and the label taken from
  the part of its file name before the first underscore ("Output").

  Args:
    prefix: Object-name prefix used to select blobs in the bucket.
    export_path: Destination path of the CSV file.

  Returns:
    The pandas DataFrame that was written.
  """
  rows = [
      ["gs://ise-bucket/" + blob.name, blob.name.split("/")[-1].split("_")[0]]
      for blob in bucket.list_blobs(prefix=prefix)
  ]
  training_frame = pd.DataFrame(rows, index=None, columns=["Input", "Output"])
  training_frame.to_csv(export_path, index=False)
  return training_frame

### Threading Wrapper

In [0]:
def get_imageurl_threading(rdf_type,resources,result,idx,limit,upload_path,img_per_article):
  """Scrape, download and upload one image per resource for a single rdf type.

  Intended as a thread target: one call handles all resources of one type.

  Args:
    rdf_type: Label of the type; printed and used as file-name prefix.
    resources: Iterable of resource URIs; the part after the last "/" is
      used as entity name.
    result: Unused; kept for interface compatibility.
    idx: Unused; kept for interface compatibility.
    limit: Stop once this many images have been uploaded.
    upload_path: Bucket path prefix handed to upload_image.
    img_per_article: 1 selects get_one_imageurl; any other value selects
      get_top_imageurl with that many candidate images.

  Returns:
    True, always.
  """
  url_count = 0
  print(rdf_type)

  # NOTE(review): send_notification is not defined in the visible part of the
  # notebook. It is looked up lazily so a missing helper skips the (best-effort)
  # notification instead of killing the thread with a NameError.
  notify = globals().get("send_notification")

  for resource in resources:

    #Choose scraping approach
    if img_per_article == 1:
      scraping_result = get_one_imageurl(resource)   #Scrape one image per Wikipedia article
    else:
      scraping_result = get_top_imageurl(resource,img_per_article)  #Scrape the top ranked of x images in Wikipedia article

    if scraping_result is not False:
      # Keep only word characters so the entity name is safe as a file name.
      resource_name = re.sub(r'\W+', '', resource.split("/")[-1])
      filename = "{}_{}.jpg".format(rdf_type,resource_name)

      if download_image(scraping_result,filename):
        upload_image(filename,upload_path)
        url_count += 1

        # Report only right after the counter changed; checking on every
        # iteration would repeat the message for each failed resource while
        # the counter stays at 1, 201, 401, ...
        if url_count%200 == 1:
          print("{}: {} images scraped from: {}".format(datetime.now(), url_count, rdf_type))

        if url_count%400 == 1 and callable(notify):
          notify("{} images scraped".format(url_count), rdf_type)

    if url_count >= limit:
      break

  return True

### Get First Image URL

In [0]:
def get_one_imageurl(resource):
  """Return the URL of the first JPEG image of a resource's Wikipedia article.

  Args:
    resource: Resource URI; the part after the last "/" is used as the
      English Wikipedia article name.

  Returns:
    The thumbnail URL resolved via get_thumbnail_url, or "http:" + the raw
    <img> src if no thumbnail could be resolved. False if the article cannot
    be loaded or parsed, contains no JPEG image, or its first JPEG image has
    a declared height of 45 pixels or less.
  """
  entity_name = resource.split("/")[-1]
  wikipedia_url = "https://en.wikipedia.org/wiki/" + entity_name

  #Open Wikipedia URL and find the first JPEG image
  try:
    with urlopen(wikipedia_url) as html_data:
      bs = BeautifulSoup(html_data, 'html.parser')
    # Escaped dot: an unescaped '.jpg' would also match e.g. "xjpg" in a path.
    image = bs.find('img', {'src': re.compile(r'\.jpe?g', re.IGNORECASE)})
  except Exception:
    return False

  #Check, if images were found
  if image is None:
    return False

  #Remove too small images by wiki-info
  # has_attr() inspects the tag's attributes; `"height" in tag` would test the
  # tag's children instead and therefore never be true for an <img>.
  if image.has_attr("height"):
    try:
      too_small = int(image["height"]) <= 45
    except (TypeError, ValueError):
      too_small = False  # non-numeric height: size unknown, keep the image
    if too_small:
      return False

  #Get reasonable standard wiki-thumbnail size of image
  # The file name sits at path segment 8 or 7 of the src, depending on its form.
  for segment in (8, 7):
    try:
      filename = image['src'].split("/")[segment]
      wikimedia_url = "https://en.wikipedia.org/wiki/File:" + filename
      return get_thumbnail_url(wikimedia_url)
    except Exception:
      continue

  #if thumbnail is not available, use original image
  return "http:" + image['src']

### Get Top Image URL (ranking approach)

In [0]:
def get_top_imageurl(resource,img_per_article):
  """Pick the best-scoring image URL among an article's first images.

  Candidates come from get_first_imageurls and are scored with
  calculate_tfidf_scores; the URL with the highest score wins.

  Args:
    resource: Resource URI forwarded to get_first_imageurls.
    img_per_article: Maximum number of candidate images to consider.

  Returns:
    The winning image URL, or False if no candidates were found.
  """
  candidates = get_first_imageurls(resource, img_per_article)
  if candidates == False:
    return False

  imgurl_list, metadata = candidates
  ranking = calculate_tfidf_scores(len(imgurl_list), metadata)
  best = np.argmax(ranking)
  winner = imgurl_list[best]
  # A non-zero index means the ranking preferred an image other than the first.
  if best != 0:
    print("Image was reranked: " + winner + " " + str(metadata[best]))
  return winner

Get the first X image URLs of an article

In [0]:
def get_first_imageurls(resource,img_per_article):
  """Collect thumbnail URLs and ranking metadata for an article's first images.

  Args:
    resource: Resource URI; the part after the last "/" is used as the
      English Wikipedia article name.
    img_per_article: Maximum number of images to collect.

  Returns:
    A tuple (imgurl_list, metadata) where metadata[i] is
    [position, usage]: the 1-based position of the image among the article's
    sufficiently large JPEG images and the value returned by get_image_usage.
    False if the article cannot be loaded or parsed, or no image could be
    resolved.
  """
  entity_name = resource.split("/")[-1]
  wikipedia_url = "https://en.wikipedia.org/wiki/" + entity_name

  #Open Wikipedia URL and find all jpg images in Beautiful Soup
  try:
    with urlopen(wikipedia_url) as html_data:
      bs = BeautifulSoup(html_data, 'html.parser')
    # Escaped dot: an unescaped '.jpg' would also match e.g. "xjpg" in a path.
    images = bs.find_all('img', {'src': re.compile(r'\.jpe?g', re.IGNORECASE)})
  except Exception:
    return False

  #Remove small images (declared height of 45px or less)
  large_images = []
  for image in images:
    # has_attr() inspects the tag's attributes; `"height" in tag` would test
    # the tag's children instead and therefore never be true for an <img>.
    if image.has_attr("height"):
      try:
        if int(image["height"]) <= 45:
          continue
      except (TypeError, ValueError):
        pass  # non-numeric height: size unknown, keep the image
    large_images.append(image)

  if not large_images:
    return False

  #get image URLs
  imgurl_list = []
  metadata = []
  for img_number, image in enumerate(large_images):
    if len(imgurl_list) >= img_per_article:
      break  # enough candidates collected; avoid further requests
    # The file name sits at path segment 8 or 7 of the src, depending on its form.
    for segment in (8, 7):
      try:
        filename = image['src'].split("/")[segment]
        wikimedia_url = "https://en.wikipedia.org/wiki/File:" + filename
        imgurl = get_thumbnail_url(wikimedia_url)
        img_usage = get_image_usage(wikimedia_url)
      except Exception:
        continue
      imgurl_list.append(imgurl)
      metadata.append([img_number + 1,img_usage])
      break
    else:
      # Neither path segment produced a usable file page; log and skip.
      print(image['src'])

  if not imgurl_list:
    return False
  return imgurl_list, metadata

Calculate top image via ranking

In [0]:
def calculate_tfidf_scores(number_of_images, metadata, corpus_size=5968914):
  """Compute a tf-idf style score for each candidate image.

  score = tf * idf, with tf = 1 / sqrt(position) and
  idf = log(corpus_size / usage). When there is exactly one image the
  position term is omitted.

  Args:
    number_of_images: Number of candidate images.
    metadata: Sequence of [position, usage] entries, one per image.
    corpus_size: Numerator of the idf term.
      NOTE(review): the default 5968914 is presumably the number of English
      Wikipedia articles at the time of writing - confirm.

  Returns:
    A list of float scores, in the order of `metadata`.
  """
  def idf(usage):
    # A usage count of 0 is clamped to 1 so it cannot cause a
    # ZeroDivisionError.
    return math.log(corpus_size / max(int(usage), 1))

  if number_of_images != 1:
    return [(1 / math.sqrt(int(entry[0]))) * idf(entry[1]) for entry in metadata]
  return [idf(metadata[0][1])]

### Scraping function

In [0]:
def run_scraping(random_resources_file,start_type,no_of_types,limit,img_per_article):
  """Scrape images for a slice of rdf types in parallel, then write the CSV.

  One thread per rdf type runs get_imageurl_threading; after all threads
  have finished, create_csv writes the training file into `directory`.
  The dataset name is built from the module-level `name_prefix` and the
  arguments.

  Args:
    random_resources_file: File name handed to load_resources_file; the
      loaded mapping goes from rdf type to a list of resources.
    start_type: Index of the first rdf type to process.
    no_of_types: Number of rdf types to process.
    limit: Per-type image limit forwarded to the worker.
    img_per_article: Scraping mode forwarded to the worker.
  """
  random_resources = load_resources_file(random_resources_file)

  dataset_name = "".join([
      name_prefix, "_", str(img_per_article),
      "-img_", str(limit),
      "-ent_", str(no_of_types), "-type",
  ])
  upload_path = "efficientnet/" + dataset_name + "/"

  last_type = start_type + no_of_types
  types_list = list(random_resources.keys())[start_type:last_type]
  results = [{} for _ in types_list]
  workers = []

  start = datetime.now()
  print (dataset_name)
  print(start)

  # Launch one worker thread per rdf type; each gets its own copy of the list.
  for idx, rdf_type in enumerate(types_list):
    worker_args = (rdf_type, random_resources[rdf_type][:], results, idx,
                   limit, upload_path, img_per_article)
    worker = threading.Thread(target=get_imageurl_threading, args=worker_args)
    worker.start()
    workers.append(worker)

  # Block the main thread until every worker has finished.
  for i, worker in enumerate(workers):
    print("{} - {} - {}".format(i,worker,datetime.now()))
    worker.join()

  end = datetime.now()
  print("{} to scrape {} types.".format(end-start, no_of_types))

  #Training csv
  create_csv(upload_path, directory + dataset_name + "_training.csv")

## Run Scraping


Set Parameters

In [0]:
# Run parameters (the trailing "#@param" comments are form annotations - keep as is).
# name_prefix:            leading part of the dataset name; read as a global by run_scraping.
# start_type/no_of_types: slice [start_type : start_type + no_of_types] of the rdf types in the resource file.
# entity_limit:           passed as `limit`; each type stops once this many images have been uploaded.
# random_resources_file:  file name inside `directory`, loaded with load_resources_file.
# images_per_article:     1 -> get_one_imageurl, any other value -> get_top_imageurl with that many candidates.
name_prefix = "dataset-20-01-31"  #@param {type:"string"}
start_type =  0#@param {type:"number"}
no_of_types =  20#@param ["100", "20"] {type:"raw", allow-input: true}
entity_limit = 10000 #@param ["10000", "50000"] {type:"raw", allow-input: true}
random_resources_file = "top100_dbo_2000_random_resources_specific" #@param ["top100_dbo_10000_random_resources.txt", "top100_dbo_1000_random_resources.txt", "top20_dbo_50000_random_resources.txt", "top100_dbo_50000_random_resources.txt", "top100_dbo_2000_random_resources_specific"]
images_per_article =  1           #@param {type:"integer"}

# Preview of the dataset name; built the same way as dataset_name in run_scraping.
print(name_prefix + "_" + str(images_per_article) + "-img_" + str(entity_limit) + "-ent_" + str(no_of_types) + "-type")

**Run Scraping**


In [0]:
# Start the scraping run with the parameters set in the cell above; blocks until
# all worker threads have been joined and the training CSV has been written.
run_scraping(random_resources_file,start_type,no_of_types,entity_limit,images_per_article)