In [1]:
import os, sys, rdflib
from sklearn import metrics
import numpy as np, pandas as pd
from sklearn.manifold import TSNE
from gensim.models import Word2Vec
from SPARQLWrapper import SPARQLWrapper, JSON
import rdflib.plugins.sparql as sparql, matplotlib.pyplot as plt
sys.path.append(os.path.abspath("./util"))
from misc import *

plt.style.use('ggplot')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from sklearn.svm import SVC
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

In [2]:
%%time
%store -r uri_type
if "uri_type" not in locals():
    dbpedia_type_graph = rdflib.Graph()
    dbpedia_type_graph.parse('./dataset/instance_types_en_uris_it.nt', format="nt" )
    dbpedia_type_graph.parse('./dataset/instance_types_en.nt', format="nt" )

CPU times: user 2.49 s, sys: 450 ms, total: 2.94 s
Wall time: 4.92 s


In [3]:
# File holding the pre-computed walks that serve as Word2Vec "sentences".
walks_path = "./dataset/embedding_5_7_dbpedia_3.9_infobox_properties_no_literal.nt.txt"

# Untrained skip-gram/CBOW model; vocabulary and weights are filled in below.
model = Word2Vec(min_count=1, window=5, negative=5, size=100)

# The corpus is read twice through two separate `get_sequences` results:
# one pass to build the vocabulary, one pass to train.
# NOTE(review): presumably `get_sequences` returns a one-shot iterator, which
# is why it is called once per pass -- confirm in util/misc.
vocab_sequences = get_sequences(walks_path)
model.build_vocab(vocab_sequences)

train_sequences = get_sequences(walks_path)
model.train(train_sequences)

9023728

In [4]:
%store -r uri_type
if "uri_type" not in locals():
    uri_type = dict()
    for uri in model.vocab:
        types = {row[0].n3() for row in get_types(uri, dbpedia_type_graph)}
        if len(types) > 0:
            uri_type[uri] = types
    %store uri_type

In [5]:
%%time

embeds = np.asarray([model[uri] for uri in uri_type])
uris   = np.asarray([uri for uri in uri_type])
labels = get_ground_truth(uris, uri_type) # np.asarray([1 if is_person(mid, mid2types) else 0 for mid in mid2types])
###
random_idx = np.random.choice(range(len(embeds)), len(embeds), replace=False)
split_point = len(embeds) * 70 / 100
training_idx, test_idx = random_idx[:split_point], random_idx[split_point:]

embeds_train, embeds_test = embeds[training_idx], embeds[test_idx]
uris_train,   uris_test   = uris[training_idx],   uris[test_idx]
labels_train, labels_test = labels[training_idx], labels[test_idx]

tsne = TSNE(n_components=2)
# %store -r twodims_dbpedia
# if "twodims_dbpedia" not in locals():
#     twodims_dbpedia = tsne.fit_transform(embeds)
#     %store twodims_dbpedia
# 
# ground_truth = get_ground_truth(mids, mid2types)
# colors = [get_color(i) for i in labels]

CPU times: user 2.33 s, sys: 269 ms, total: 2.6 s
Wall time: 2.88 s


In [6]:
# plt.rcParams["figure.figsize"] = [14, 10]
# x = [point[0] for point in twodims_dbpedia]
# y = [point[1] for point in twodims_dbpedia]
# plt.scatter(x, y, color=colors)
# plt.show()

In [None]:
%%time
ovo_rbf    = OneVsOneClassifier(SVC(gamma=2, C=1))
# ovo_forest = OneVsOneClassifier(RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1))
# ovr_rbf    = OneVsRestClassifier(SVC(gamma=2, C=1))
# ovr_forest = OneVsRestClassifier(RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1))

ovo_rbf.fit(embeds_train, labels_train)
# ovo_forest.fit(embeds_train, labels_train)
# ovr_rbf.fit(embeds_train, labels_train)
# ovr_forest.fit(embeds_train, labels_train)

ovo_rbf_pred    = np.array([ovo_rbf.predict([emb]) for emb in embeds_test])
# ovo_forest_pred = np.array([ovo_forest.predict([emb]) for emb in embeds_test])
# ovr_rbf_pred    = np.array([ovr_rbf.predict([emb]) for emb in embeds_test])
# ovr_forest_pred = np.array([ovr_forest.predict([emb]) for emb in embeds_test])

In [None]:
# Number of samples in the held-out test split.
len(labels_test)

In [None]:
plt.rcParams["figure.figsize"] = [10, 7]

# Confusion matrix of the One-vs-One RBF SVM predictions on the test split.
# np.ravel accepts both a 1-D prediction vector and an (n, 1) column.
cnf_matrix = metrics.confusion_matrix(labels_test, np.ravel(ovo_rbf_pred))
plt.figure()
# The title names the classifier that produced `ovo_rbf_pred` (it previously
# said "RF", although no random forest is evaluated here).
plot_confusion_matrix(cnf_matrix, classes=[0, 1, 2, 3, 4, 5], title='Confusion matrix - OnevOne RBF SVM')
plt.show()

In [None]:
%store -r twodims_dbpedia_test
if "twodims_dbpedia_test" not in locals():
    twodims_dbpedia_test = tsne.fit_transform(embeds_test)
    %store twodims_dbpedia_test

colors_true = [get_color(i) for i in labels_test]
colors_pred = [get_color(i) for i in ovo_rbf_pred]

plt.rcParams["figure.figsize"] = [20, 20]
fig, ax = plt.subplots(nrows=1,ncols=2)
x_test = [point[0] for point in twodims_dbpedia_test]
y_test = [point[1] for point in twodims_dbpedia_test]

plt.subplot(2, 2, 1)
plt.scatter(x_test, y_test, color=colors_true)

plt.subplot(2, 2, 2)
plt.scatter(x_test, y_test, color=colors_pred)
plt.show()

In [None]:
# (display name, variable holding that classifier's test predictions).
# Only the predictions that actually exist in the namespace are evaluated, so
# the cell no longer raises NameError while some classifiers are disabled in
# the training cell; enabling them there adds their rows here automatically.
prediction_vars = [
    ("OnevOne RBF SVM",  "ovo_rbf_pred"),
    ("OnevOne Forest",   "ovo_forest_pred"),
    ("OnevRest RBF SVM", "ovr_rbf_pred"),
    ("OnevRest Forest",  "ovr_forest_pred"),
]
notebook_ns = globals()

# One metrics frame per available classifier, stacked into a single table.
metrics_class_df = pd.concat([
    multiclass_metrics(labels_test, notebook_ns[var_name], average="weighted", name=display_name)
    for display_name, var_name in prediction_vars
    if var_name in notebook_ns
])

metrics_class_df