# CLASSIFICATION OF DBPEDIA DESCRIPTIONS

In [45]:
from __future__ import print_function

import tensorflow as tf

import os
import sys
import numpy as np
import pickle5 as pickle
import tempfile

from keras.models import Model
from keras.layers import Input

from stellargraph.data import BiasedRandomWalk
from stellargraph import StellarGraph, IndexedArray
from gensim.models import Word2Vec

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

import pandas as pd
import matplotlib.pyplot as plt

import logging, csv
from determine_topics import *

np.random.seed(100)

MAX_NUM_WORDS = 20000
VALIDATION_SPLIT = 0.1
TEST_SPLIT = 0.2

NUM_RELATIONS_PER_CLUSTER = 67
NUM_ENTITIES_PER_CLUSTER = 400
NUM_CLUSTERS = 20


class HashTable:

    # Create empty bucket list of given size
    def __init__(self, size):
        self.size = size
        self.hash_table = self.create_buckets()

    def create_buckets(self):
        return [[] for _ in range(self.size)]

    # Insert values into hash map
    def set_val(self, key, val):

        # Get the index from the key
        # using hash function
        hashed_key = hash(key) % self.size

        # Get the bucket corresponding to index
        bucket = self.hash_table[hashed_key]

        found_key = False
        for index, record in enumerate(bucket):
            record_key, record_val = record

            # check if the bucket has same key as
            # the key to be inserted
            if record_key == key:
                found_key = True
                break

        # If the bucket has same key as the key to be inserted,
        # Update the key value
        # Otherwise append the new key-value pair to the bucket
        if found_key:
            bucket[index] = (key, val)
        else:
            bucket.append((key, val))

    # Return searched value with specific key
    def get_val(self, key):

        # Get the index from the key using
        # hash function
        hashed_key = hash(key) % self.size

        # Get the bucket corresponding to index
        bucket = self.hash_table[hashed_key]

        found_key = False
        for index, record in enumerate(bucket):
            record_key, record_val = record

            # check if the bucket has same key as
            # the key being searched
            if record_key == key:
                found_key = True
                break

        # If the bucket has same key as the key being searched,
        # Return the value found
        # Otherwise indicate there was no record found
        if found_key:
            return record_val
        else:
            raise ValueError('No record found.')

    # Remove a value with specific key
    def delete_val(self, key):

        # Get the index from the key using
        # hash function
        hashed_key = hash(key) % self.size

        # Get the bucket corresponding to index
        bucket = self.hash_table[hashed_key]

        found_key = False
        for index, record in enumerate(bucket):
            record_key, record_val = record

            # check if the bucket has same key as
            # the key to be deleted
            if record_key == key:
                found_key = True
                break
        if found_key:
            bucket.pop(index)
        return

    # To print the items of hash map
    def __str__(self):
        return "".join(str(item) for item in self.hash_table)



def pickler(path, pkl_name, obj):
    with open(os.path.join(path, pkl_name), 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


def unpickler(path, pkl_name):
    with open(os.path.join(path, pkl_name), 'rb') as f:
        obj = pickle.load(f)
    return obj

def get_labels(data):
    labels = []
    for d in data:
        labels.append(d['label']) if d['label'] not in labels else None

    return labels


def get_x_and_y(data):
    x, y = [], []
    for d in data:
        for dd in d['data']:
            tmp = dd['text'].replace('\n', '').replace('_', '')  # clean
            x.append({'label': d['label'], 'dbpedia_uri': dd['dbpedia_uri'], 'context_data': dd['context_data'], 'text': tmp, 'graph': dd['graph']}) if len(tmp) > 0 else None
            y.append(d['label']) if len(tmp) > 0 else None

    return x, y


def get_label_index(label):
    return [index for index, _label in enumerate(unique_labels) if label == _label][0]

def get_context_data(data):
    for d in data:
        for dd in d['data']:
            dd['context_data'] = [c for c in set(find_clusters(dd['context_graph']['nodes']))]


def get_context_data_from_nodes(nodes):
    return find_clusters([node.replace('http://dbpedia.org/resource/','').replace('_',' ') for node in nodes])


def find_clusters(arr):
    result = []
    for el in arr:
        all_mappings = node_cluster_map.get_val(el) if el in node_cluster_mapping_with_count else []
        filtered_mappings = [e[0] for e in all_mappings[:min(len(all_mappings), 3)]]
        result.extend(filtered_mappings)

    return result


### PROCESS BATCH FUNCTIONS

In [33]:
def get_context_data_from_dbpedia_properties_v2(properties, dbpedia_uris):
    context_data = [['Other'] for i in range(len(dbpedia_uris))]

    logging.info("Extracting context data from dbpedia properties...")
    for i,property in enumerate(properties):
        try:
            index = dbpedia_uris.index(property['source'])
            context_data[index] = get_context_data_from_nodes([target for target in property['targets'] if target != 'http://www.w3.org/2002/07/owl#Thing'])
        except ValueError:
            pass

    pickler('data','context_data_all.pkl',context_data)

    return context_data

def process_batch_v2(file_index, tfidf, model):
    logging.info('Processing file %d...' % (file_index))
    with open(os.path.join('data/dbpedia_properties', "{}.pkl".format(file_index)), 'rb') as f:
        properties = pickle5.load(f)

    new_texts,new_dbpedia_uris,new_properties = [], [], []
    for i,property in enumerate(properties):
        try:
            # logging.info('Processing property (%d-%d)' % (file_index,i)) if i % 1000 == 0 else None
            index = dbpedia_uris_map.get_val(property['source'])
            new_texts.append(texts[index])
            new_properties.append(property)
            new_dbpedia_uris.append(property['source'])
        except Exception:
            pass

    context_data_input = get_context_data_from_dbpedia_properties_v2(new_properties, new_dbpedia_uris)
    context_data_input = pd.DataFrame(context_data_input).values

    logging.info('Encoding context data...')
    for c in context_data_input:
        for i, v in enumerate(c):
            c[i] = 'Other' if not v else v

    cluster_names = unique_labels + ['None']

    BATCH_SIZE, result_labels = 10000, []
    for i in range(0,len(new_texts), BATCH_SIZE):
        logging.info('Processing batch %d...' % i)
        batch_texts = new_texts[i: min(i + BATCH_SIZE, len(new_texts))]
        batch_context_data_input = context_data_input[i: min(i + BATCH_SIZE, len(new_texts))]

        context_data_encodings = m.predict(batch_context_data_input)

        for encoding in context_data_encodings:
            encoding[78] = 0.0  # other

        x_vectors = tfidf.transform(batch_texts)
        x_svc = np.concatenate((x_vectors.toarray(), context_data_encodings), axis=1)

        logging.info("Running the algorithm... %d" % (file_index))
        batch_result_labels = model.predict(x_svc)
        result_labels.extend(batch_result_labels)

    logging.info("Writing results...")
    with open('determine_topics_results_v2/{}.csv'.format(file_index), 'w', encoding='UTF8') as f:
        writer = csv.writer(f)
        header = ['Dbpedia Uri', 'Topic', 'Category', 'SubCategory']
        writer.writerow(header)

        i = 0
        for uri, label in zip(new_dbpedia_uris, result_labels):
            i += 1

            splitted_label = label.split('-')
            topic_label = splitted_label[0]
            category_label = splitted_label[1] if len(splitted_label) > 1 else None
            subcategory_label = splitted_label[2] if len(splitted_label) > 2 else None

            topic = [topic for topic in topics if topic['name'] == topic_label]
            topic = topic[0] if len(topic) > 0 else None
            category = [c for c in topic['categories'] if c['name'] == category_label]
            category = category[0] if len(category) > 0 else None
            subcategory = None

            result_topic = topic['name'] if topic else topic_label
            result_category = category['name'] if category else category_label
            result_subcategory = subcategory['name'] if subcategory else subcategory_label
            writer.writerow([uri, result_topic, result_category, result_subcategory])


### LOAD DATA

In [3]:
import pickle5 as pickle5
import os
with open(os.path.join('data', 'classification_data_with_graphs_v5.pkl'), 'rb') as f:
    data = pickle5.load(f)
with open(os.path.join('data', 'node_cluster_mapping_v5_with_count.pkl'), 'rb') as f:
    node_cluster_mapping_with_count = pickle5.load(f)

In [4]:
node_cluster_map = HashTable(len(node_cluster_mapping_with_count.keys()))
for key in node_cluster_mapping_with_count.keys():
    mapping = node_cluster_mapping_with_count[key]
    node_cluster_map.set_val(key,mapping)

get_context_data(data)
x,y = get_x_and_y(data)
unique_labels = get_labels(data)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
x_val, y_val,x_test, y_test = x_test[:500], y_test[:500], x_test[500:], y_test[500:]
y_train_int, y_test_int = [get_label_index(label) for label in y_train],[get_label_index(label) for label in y_test]
x_train_text,x_val_text,x_test_text = [xx['text'] for xx in x_train],[xx['text'] for xx in x_val], [xx['text'] for xx in x_test]
x_train_context, x_val_context, x_test_context = [xx['context_data'] for xx in x_train],[xx['context_data'] for xx in x_val], [xx['context_data'] for xx in x_test]

### LOAD DBPEDIA TEXTS

In [9]:
dbpedia_uris, texts = read_dbpedia()

logging.info('Processing dbpedia uris...')
dbpedia_uris_map = HashTable(len(dbpedia_uris))
for i,dbpedia_uri in enumerate(dbpedia_uris):
    logging.info('Processing dbpediauri %d...' % i) if i % 100000 == 0 else None
    dbpedia_uris_map.set_val(dbpedia_uri, i)

2023-01-22 13:40:54,935:INFO:Reading dbpedia articles...
2023-01-22 13:43:27,738:INFO:Processing dbpedia uris...
2023-01-22 13:43:31,154:INFO:Processing dbpediauri 0...
2023-01-22 13:43:31,253:INFO:Processing dbpediauri 100000...
2023-01-22 13:43:31,354:INFO:Processing dbpediauri 200000...
2023-01-22 13:43:31,455:INFO:Processing dbpediauri 300000...
2023-01-22 13:43:31,558:INFO:Processing dbpediauri 400000...
2023-01-22 13:43:31,663:INFO:Processing dbpediauri 500000...
2023-01-22 13:43:31,765:INFO:Processing dbpediauri 600000...
2023-01-22 13:43:31,869:INFO:Processing dbpediauri 700000...
2023-01-22 13:43:31,973:INFO:Processing dbpediauri 800000...
2023-01-22 13:43:32,084:INFO:Processing dbpediauri 900000...
2023-01-22 13:43:32,194:INFO:Processing dbpediauri 1000000...
2023-01-22 13:43:32,304:INFO:Processing dbpediauri 1100000...
2023-01-22 13:43:32,415:INFO:Processing dbpediauri 1200000...
2023-01-22 13:43:32,526:INFO:Processing dbpediauri 1300000...
2023-01-22 13:43:32,638:INFO:Proce

### TRAINING

In [19]:
cluster_names = unique_labels + ['None']

context_input = Input(shape=(1,), dtype='string', name='graph_input')
context_layer = tf.keras.layers.StringLookup(vocabulary=cluster_names, output_mode='multi_hot')(context_input)

m = tf.keras.models.Model(inputs=[context_input], outputs=context_layer)

context_data_train_input = pd.DataFrame(x_train_context).values

for c in context_data_train_input:
    for i, v in enumerate(c):
        c[i] = 'Other' if not v else v

context_train_encodings = m.predict(context_data_train_input)

for encoding in context_train_encodings:
    encoding[78] = 0.0 # other

vectorizer = TfidfVectorizer(max_features=10000)
tfidf = vectorizer.fit(x_train_text)
x_train_vectors = tfidf.transform(x_train_text)
x_train_svc = np.concatenate((x_train_vectors.toarray(), context_train_encodings), axis=1)

model = LinearSVC()
model.fit(x_train_svc, y=y_train)




LinearSVC()

In [20]:
model_text = LinearSVC()
model_text.fit(x_train_vectors, y=y_train)

LinearSVC()

### TEST

In [111]:
with open(os.path.join('data/dbpedia_properties', "{}.pkl".format(1)), 'rb') as f:
    properties = pickle5.load(f)

new_texts,new_dbpedia_uris,new_properties = [], [], []
for i,property in enumerate(properties):
    try:
        logging.info('Processing property (%d-%d)' % (file_index,i)) if i % 1000 == 0 else None
        index = dbpedia_uris_map.get_val(property['source'])
        new_texts.append(texts[index])
        new_properties.append(property)
        new_dbpedia_uris.append(property['source'])
    except Exception:
        pass

context_data_input = get_context_data_from_dbpedia_properties_v2(new_properties, new_dbpedia_uris)
context_data_input = pd.DataFrame(context_data_input).values

logging.info('Encoding context data...')
for c in context_data_input:
    for i, v in enumerate(c):
        c[i] = 'Other' if not v else v

cluster_names = unique_labels + ['None']

2023-01-22 17:04:36,625:INFO:Extracting context data from dbpedia properties...
2023-01-22 17:05:06,476:INFO:Encoding context data...


In [13]:
BATCH_SIZE, result_labels = 10000, []
i = 0
logging.info('Processing batch %d...' % i)
batch_texts = new_texts[i: min(i + BATCH_SIZE, len(new_texts))]
batch_context_data_input = context_data_input[i: min(i + BATCH_SIZE, len(new_texts))]

context_data_encodings = m.predict(batch_context_data_input)

for encoding in context_data_encodings:
    encoding[78] = 0.0  # other

x_vectors = tfidf.transform(batch_texts)
x_svc = np.concatenate((x_vectors.toarray(), context_data_encodings), axis=1)

logging.info("Running the algorithm...")
batch_result_labels = model.predict(x_svc)
result_labels.extend(batch_result_labels)

2023-01-22 13:46:09,409:INFO:Processing batch 0...
2023-01-22 13:46:19,428:INFO:Running the algorithm...


In [27]:
i = 5
new_texts[i], new_properties[i], context_data_input[i]

('!Women Art Revolution is a 2010 documentary film directed by Lynn Hershman Leeson and distributed by Zeitgeist Films. It tracks the feminist art movement over 40 years through interviews with artists, curators, critics, and historians."@e',
 {'source': 'http://dbpedia.org/resource/!Women_Art_Revolution',
  'targets': ['http://dbpedia.org/resource/Lynn_Hershman_Leeson',
   'http://dbpedia.org/resource/Zeitgeist_Films',
   'http://dbpedia.org/resource/Lynn_Hershman_Leeson',
   'http://dbpedia.org/resource/Carrie_Brownstein',
   'http://dbpedia.org/resource/Cláudia_Pascoal']},
 array(['Other', 'Other', 'Other', 'Other', 'Other', 'Other', 'Other',
        'Other', 'Other', 'Other', 'Other', 'Other', 'Other', 'Other',
        'Other', 'Other', 'Other', 'Other', 'Other', 'Other', 'Other',
        'Other', 'Other', 'Other', 'Other', 'Other', 'Other', 'Other',
        'Other', 'Other', 'Other', 'Other', 'Other', 'Other', 'Other',
        'Other', 'Other', 'Other', 'Other', 'Other', 'Other', 

In [17]:
for dbpedia_uri, label in zip(new_dbpedia_uris[:50], batch_result_labels[:50]):
    print(dbpedia_uri,label)

http://dbpedia.org/resource/!!!_(album) Art-Music
http://dbpedia.org/resource/!Action_Pact! Art-Music
http://dbpedia.org/resource/!Hero_(album) Religion
http://dbpedia.org/resource/!Oka_Tokat Mythology
http://dbpedia.org/resource/!PAUS3 Art-Music
http://dbpedia.org/resource/!Women_Art_Revolution Science-Social Sciences
http://dbpedia.org/resource/$1.99_Romances Art-Fashion
http://dbpedia.org/resource/$100,000_Fortune_Hunt Philosophy
http://dbpedia.org/resource/$24_in_24 Culture-Country
http://dbpedia.org/resource/$25_Million_Dollar_Hoax Media-TV Series & Shows
http://dbpedia.org/resource/$9.99 Nature-Animal
http://dbpedia.org/resource/$O$ Art-Fashion-Designer
http://dbpedia.org/resource/$_(Mark_Sultan_album) Art-Photography-Photographer
http://dbpedia.org/resource/$ell_Out Art-Music
http://dbpedia.org/resource/$h*!_My_Dad_Says Art-Cinema
http://dbpedia.org/resource/$pent Art-Theatre
http://dbpedia.org/resource/%22900%22,_Cahiers_d'Italie_et_d'Europe Science-Social Sciences
http://dbped

In [130]:
node_cluster_map.get_val('Netherlands')

[('Art-Photography-Photographer', 13),
 ('Art-Painting-Artist', 12),
 ('Art-Sculpting-Artist', 11),
 ('Media-Documentary', 7),
 ('Art-Painting', 5),
 ('Art-Literature-Writer', 5),
 ('Technology-Video Game', 5),
 ('Media-TV Series & Shows', 5),
 ('Culture-Historical Figure', 4),
 ('Sports', 4),
 ('Transportation-Railway', 4),
 ('Art-Cinema', 3),
 ('Art-Cinema-Actor', 3),
 ('Art-Fashion-Designer', 3),
 ('Science-Politics', 3),
 ('Media-News', 3),
 ('Military-Aviation', 3),
 ('Transportation-Land', 3),
 ('Transportation-Naval', 3),
 ('Art-Theatre-Actor', 2),
 ('Art-Fashion', 2),
 ('Art-Fashion-Model', 2),
 ('Art-Dance-Dancer', 2),
 ('Technology-Electronics', 2),
 ('Religion', 2),
 ('Mythology', 2),
 ('Media', 2),
 ('Military-Weapon', 2),
 ('Transportation-Aviation', 2),
 ('Art-Music', 1),
 ('Art-Music-Instrument', 1),
 ('Art-Literature', 1),
 ('Art-Dance', 1),
 ('Science-Physics', 1),
 ('Science-Mathematics', 1),
 ('Science-Agriculture', 1),
 ('Science-Archeology', 1),
 ('Science-Antropol

In [None]:
dbpedia_uri = "https://dbpedia.org/page/Eilema_tricolorana"
index = [i for i, uri in enumerate(dbpedia_uris) if uri == dbpedia_uri][0]
index, len(result_labels)
properties_ = [p for p in properties if p['source'] == dbpedia_uri]
# properties_,get_context_data_from_dbpedia_properties_v2(properties_, [dbpedia_uri])
properties_,get_context_data_from_nodes([p['targets'] for p in properties if p['source'] == dbpedia_uri][0])

### RUN ON BATCHES

In [34]:
for i in range(1,39):
    process_batch_v2(i, tfidf, model)

2023-01-22 14:46:39,157:INFO:Processing file 1...
2023-01-22 14:46:39,549:INFO:Extracting context data from dbpedia properties...
2023-01-22 14:47:03,854:INFO:Encoding context data...
2023-01-22 14:47:05,387:INFO:Processing batch 0...
2023-01-22 14:47:13,043:INFO:Running the algorithm... 1
2023-01-22 14:47:13,601:INFO:Processing batch 10000...
2023-01-22 14:47:22,531:INFO:Running the algorithm... 1
2023-01-22 14:47:22,933:INFO:Processing batch 20000...
2023-01-22 14:47:30,709:INFO:Running the algorithm... 1
2023-01-22 14:47:31,055:INFO:Processing batch 30000...
2023-01-22 14:47:39,926:INFO:Running the algorithm... 1
2023-01-22 14:47:40,313:INFO:Processing batch 40000...
2023-01-22 14:47:44,773:INFO:Running the algorithm... 1
2023-01-22 14:47:45,021:INFO:Writing results...
2023-01-22 14:47:45,383:INFO:Processing file 2...
2023-01-22 14:47:46,998:INFO:Extracting context data from dbpedia properties...
2023-01-22 14:48:01,803:INFO:Encoding context data...
2023-01-22 14:48:02,546:INFO:Proc

### CONCAT BATCH RESULTS

In [36]:
import csv
concatenated,header_row = [], None
for i in range(1,39):
    print('Reading file %d' % i)
    with open('determine_topics_results_v2/{}.csv'.format(i)) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                header_row = row
                line_count += 1
            else:
                concatenated.append(row)
                line_count += 1

with open('determine_topics_results_v2/determine_topics_results_v2.csv'.format(i),mode='w') as csv_file:
    csv_writer = csv.writer(csv_file, delimiter=',')
    csv_writer.writerow(header_row)
    for row in concatenated:
        csv_writer.writerow(row)

Reading file 1
Reading file 2
Reading file 3
Reading file 4
Reading file 5
Reading file 6
Reading file 7
Reading file 8
Reading file 9
Reading file 10
Reading file 11
Reading file 12
Reading file 13
Reading file 14
Reading file 15
Reading file 16
Reading file 17
Reading file 18
Reading file 19
Reading file 20
Reading file 21
Reading file 22
Reading file 23
Reading file 24
Reading file 25
Reading file 26
Reading file 27
Reading file 28
Reading file 29
Reading file 30
Reading file 31
Reading file 32
Reading file 33
Reading file 34
Reading file 35
Reading file 36
Reading file 37
Reading file 38


### Concatenating v1 and v2

In [52]:
v1, v2, header_row = [], [], None
print('Processing v1...')
with open('determine_topics_results_v2/determine_topics_results_v1.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if line_count == 0:
            header_row = row
            line_count += 1
        else:
            v1.append(row)
            line_count += 1

print('Processing v2...')
with open('determine_topics_results_v2/determine_topics_results_v2.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if line_count == 0:
            header_row = row
            line_count += 1
        else:
            v2.append(row)
            line_count += 1

print('Creating index map...')
v2_dbpedia_uri_map = HashTable(len(v2))
for i,row in enumerate(v2):
    v2_dbpedia_uri_map.set_val(row[0],i)

print('Concatenating v1 and v2...')
concatenated = []
for i,row1 in enumerate(v1):
    print('Processing row %d' % i) if i % 1000000 == 0 else None
    try:
        index = v2_dbpedia_uri_map.get_val(row1[0])
        concatenated.append(v2[index])
    except ValueError:
        concatenated.append(row1)

print('Sorting...')
concatenated = sorted(concatenated, key=lambda row: row[0])

print('Writing results...')
with open('determine_topics_results_v2/determine_topics_results_v2_concatenated.csv'.format(i),mode='w') as csv_file:
    csv_writer = csv.writer(csv_file, delimiter=',')
    csv_writer.writerow(header_row)
    for row in concatenated:
        csv_writer.writerow(row)

Processing v1...
Processing v2...
Creating index map...
Concatenating v1 and v2...
Processing row 0
Processing row 1000000
Processing row 2000000
Processing row 3000000
Processing row 4000000
Processing row 5000000
Sorting...
Writing results...


In [77]:
by_category = [{'label': c, 'rows': []} for c in cluster_names]
for row in concatenated:
    category_name = row[1]
    category_name += '-' + row[2] if row[2] != '' else ''
    category_name += '-' + row[3] if row[3] != '' else ''
    category = [c for c in by_category if c['label'] == category_name]
    if len(category) > 0:
        category = category[0]
        category['rows'].append(row)

In [80]:
for category in by_category:
    with open('determine_topics_results_v2/by_category/{}.csv'.format(category['label']),mode='w') as csv_file:
        csv_writer = csv.writer(csv_file, delimiter=',')
        csv_writer.writerow(header_row)
        for row in category['rows']:
            csv_writer.writerow(row)

In [91]:
def get_instance_of(dbpedia_uris):
    results, meta = db.cypher_query('MATCH(e:Entity)-[rel:`https://www.wikidata.org/wiki/Property:P31`]-(e2:Entity) ' 
                                    ' where e.dbpedia_uri in $dbpedia_uris return e.dbpedia_uri, e2.dbpedia_uri, e2.wikidata_id, e2.name', {'dbpedia_uris': dbpedia_uris})
    
    before_uri, instances = None, []
    for row in results:
        instanceof_type = 'dbpedia' if row[1] else 'name' if row[3] else 'wikidata'
        instanceof = row[1] if row[1] else row[3] if row[3] else row[2]
        instances.append((before_uri, instanceof_type, instanceof))
        
    return instances

In [103]:
index = [i for i, uri in enumerate(dbpedia_uris) if uri == "http://dbpedia.org/resource/'s-Graveland"][0]
index, len(result_labels)

(380066, 10000)

In [98]:
by_category[0]['rows'][:10]

[['http://dbpedia.org/resource/%22As_the_Old_Sing,_So_Pipe_the_Young%22_(Jan_Steen)',
  'Art',
  'Painting',
  ''],
 ['http://dbpedia.org/resource/%22Holy...%22', 'Art', 'Painting', ''],
 ["http://dbpedia.org/resource/'Tronie'_of_a_Young_Man_with_Gorget_and_Beret",
  'Art',
  'Painting',
  ''],
 ["http://dbpedia.org/resource/'s-Graveland", 'Art', 'Painting', ''],
 ["http://dbpedia.org/resource/'s-Gravenpolder", 'Art', 'Painting', ''],
 ["http://dbpedia.org/resource/'s-Heer_Abtskerke", 'Art', 'Painting', ''],
 ["http://dbpedia.org/resource/'s-Heer_Arendskerke", 'Art', 'Painting', ''],
 ["http://dbpedia.org/resource/'s-Heer_Hendrikskinderen",
  'Art',
  'Painting',
  ''],
 ["http://dbpedia.org/resource/'s-Heerenberg", 'Art', 'Painting', ''],
 ["http://dbpedia.org/resource/'s-Heerenhoek", 'Art', 'Painting', '']]

In [132]:
for category in by_category:
    print('Processing category %s...' % category['label'])
    rows = []
    with open('determine_topics_results_v2/by_category/{}.csv'.format(category['label']),mode='r') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                header_row = row
                line_count += 1
            else:
                rows.append(row)
                line_count += 1
        
        instances, BATCH_SIZE = [], 100
        for i in range(0,len(rows),BATCH_SIZE):
            batch = [row[0] for row in rows[i: min(i + BATCH_SIZE, len(rows))]]
            instances.extend(get_instance_of(batch))

        instances_ = [instance[1] + '-' + instance[2] for instance in instances]
        counts = [(instance,instances_.count(instance)) for instance in set(instances_)]
        counts = sorted(counts, key=lambda c: c[1], reverse=True)
        
        print('Writing results...')
        
    with open('determine_topics_results_v2/instance_counts/{}.csv'.format(category['label']),mode='w') as csv_file:
        csv_writer = csv.writer(csv_file, delimiter=',')
        csv_writer.writerow(['Instance', 'Count'])
        for row in counts:
            csv_writer.writerow(row)
                         

Processing category Art-Painting...
Writing results...
Processing category Art-Painting-Artist...
Writing results...
Processing category Art-Sculpting...
Writing results...
Processing category Art-Sculpting-Artist...
Writing results...
Processing category Art-Music...
Writing results...
Processing category Art-Music-Instrument...
Writing results...
Processing category Art-Cinema...
Writing results...
Processing category Art-Cinema-Actor...
2023-01-22 22:41:21,317:ERROR:Failed to read from defunct connection ResolvedIPv4Address(('151.106.35.64', 7687)) (IPv4Address(('151.106.35.64', 7687)))
2023-01-22 22:41:21,384:ERROR:Unable to retrieve routing information


ServiceUnavailable: Unable to retrieve routing information