## Tab2Onto: Unsupervised Semantification of FB15k-237

In [None]:
import pickle
import numpy as np
import pandas as pd
from datetime import datetime

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import os 
# Make sure the results directory exists; the path is resolved against the
# working directory in effect at this point (i.e. before the chdir below).
os.makedirs('../results', exist_ok=True) 

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Move the working directory two levels up; every relative path used after
# this line (data/..., ../results/...) is resolved from the new location.
# NOTE(review): save_results() writes to '../results/' after this chdir, which
# is not the same directory as the one created above -- confirm which is intended.
os.chdir('../../')

# Project-local purity metric. It is imported after the chdir above, presumably
# so that the module is found from the new working directory -- TODO confirm.
from clustering_evaluation import ClusterPurity
evaluator=ClusterPurity()

### a) Data Preprocessing:
We skip the data preprocessing step, since the FB15k-237 dataset is already in linked-data format (RDF triples). This step is only required if your input is tabular data.

In [None]:
# Helper function
def save_results(method, list_of_results):
    """Append one evaluation record for ``method`` to its results file.

    The record goes to ``../results/<method>_evaluation.txt`` (resolved
    against the current working directory). It consists of a header line
    carrying the current timestamp, followed by the values, each terminated
    by a tab character.

    Args:
        method: Name of the clustering method; used in the file name and
            in the header line.
        list_of_results: Iterable of metric values in the order
            (accuracy, precision, recall, f1, purity). Values may be numbers
            or strings; each one is converted with ``str`` before writing.
    """
    results_dir = '../results/'
    # The directory may not exist relative to the *current* working directory
    # (the notebook changes directory after creating it), so create it on demand.
    os.makedirs(results_dir, exist_ok=True)

    # datetime objects cannot be concatenated to str directly; convert explicitly.
    header = ('\n evaluation results of ' + method
              + ' (accuracy, precision, recall, f1, purity) on '
              + str(datetime.now()) + '\n')

    # The context manager closes the file even if a write fails.
    with open(results_dir + method + '_evaluation.txt', 'a') as out_file:
        out_file.write(header)
        # Metrics arrive as floats; str() keeps the write from raising TypeError.
        out_file.write(''.join(str(value) + '\t' for value in list_of_results))

### b) KG Embedding:
In this step, we train different KG embedding models, namely TransE, DistMult, RotatE, and ConEx, to represent entities and their relations as low-dimensional semantic vectors. We use the DAIKIRI-embedding library to compute the KG embedding models. For our experiments, we provide our pre-trained models in 'data/pre-trained'.

In [None]:
# Input locations, both relative to the current working directory:
# the FB15k-237 triple files and the pickled pre-trained TransE model.
BASE_PATH_TRUTH = 'data/FB15k-237'
PATH_TRANS_E = 'data/pre-trained/transE_fb15k_256dim.pkl'

# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load model files that come from a trusted source.
with open(PATH_TRANS_E, "rb") as fin:
    model = pickle.load(fin)

# Name -> id lookup tables; entity2id values are later used to index
# rows of entity_embeddings.
entity2id, relation2id = model.graph.entity2id, model.graph.relation2id

# Learned embeddings held by the model's solver.
entity_embeddings, relation_embeddings = (
    model.solver.entity_embeddings,
    model.solver.relation_embeddings,
)

entity_embeddings.shape

# Extract ground-truth types (e.g., film, person, organization, country, actor).
# Each split is read as a headerless, tab-separated file whose first column
# becomes the index; column 1 is the field the type is derived from.
fb_train=pd.read_csv(BASE_PATH_TRUTH + '/train.txt', sep='\t', header=None, index_col=0)
fb_valid=pd.read_csv(BASE_PATH_TRUTH + '/valid.txt', sep='\t', header=None, index_col=0)
fb_test=pd.read_csv(BASE_PATH_TRUTH + '/test.txt', sep='\t', header=None, index_col=0)

fb_df=pd.concat([fb_train, fb_valid, fb_test])
# The type is the second '/'-separated piece of column 1; for a value such as
# '/film/film/genre' this yields 'film'.
fb_df['type']= fb_df[1].apply(lambda x: x.split('/')[1])

# Combine entities with their types.
# An entity that occurs several times in the index takes the type of its first
# row (train rows come before valid and test rows). Building the lookup table
# once avoids a per-entity .loc on a non-unique index and the deprecated
# positional Series[0] access on a label-based index.
first_type_by_entity = fb_df.loc[~fb_df.index.duplicated(keep='first'), 'type'].to_dict()

# Entities that never occur in the index get the placeholder type 'unknown'.
ground_truth = {
    entity_id: first_type_by_entity.get(entity_id, 'unknown')
    for entity_id in entity2id
}

##### Filter the top-k types from the FB15k-237 dataset; we set k=6 in our experiments:

In [None]:
# Keep only entities whose ground-truth type is one of the common types.
top_types=['people', 'film', 'location', 'music', 'soccer', 'education']

# (embedding, type) pairs for the retained entities, in ground_truth order
selected_pairs = [
    (entity_embeddings[entity2id[entity_name]], entity_type)
    for entity_name, entity_type in ground_truth.items()
    if entity_type in top_types
]
entity_embedding_filter = [embedding for embedding, _ in selected_pairs]
y_true_filter = [entity_type for _, entity_type in selected_pairs]

X_all = np.asarray(entity_embedding_filter)

# Encode the type names as integer class ids (LabelEncoder, not one-hot);
# `labels` lists the type names ordered by their class id.
encoder = LabelEncoder()
y_all = encoder.fit_transform(y_true_filter)
labels = encoder.classes_.tolist()

### c) Entities Clustering:
* In this step, we group entities with similar properties (i.e., based on their embedding representations) into clusters. Each cluster should contain similar entities, and therefore entities of similar types.

* We employ different clustering approaches (k-means, HDBSCAN, agglomerative) in our experiments to find which method performs best on the entity typing task.

#### HDBSCAN:

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

HDBSCAN requires two main hyper-parameters: 1) epsilon, which specifies the radius of the neighbourhood around a point, and 2) min_samples, the minimum number of points that must lie within that neighbourhood for the point to be considered a core point. We use the elbow approach to find a good value for epsilon.

In [None]:
from sklearn.neighbors import NearestNeighbors

# Nearest-neighbour distance curve used to pick the cluster epsilon by eye:
# query the 5 nearest neighbours of every embedding within X_all itself.
neigh = NearestNeighbors(n_neighbors=5)
nbrs = neigh.fit(X_all)
distances, indices = nbrs.kneighbors(X_all)

# Keep only the last distance column and order it ascending, so the plot
# shows a monotone curve.
# (Presumably the last column is the farthest of the 5 neighbours -- TODO confirm.)
distances = np.sort(distances[:, -1])
plt.plot(distances)

In [None]:
%%time
import hdbscan
from sklearn.metrics.pairwise import pairwise_distances

# Pairwise cosine distances between all entity embeddings, computed on a
# double-precision copy of X_all.
X_all_double=X_all.astype(np.double)
distance_matrix = pairwise_distances(X_all_double, metric='cosine')

# HDBSCAN is run on the precomputed distance matrix rather than on the raw vectors.
hdbscan_settings = dict(
    algorithm='best',
    alpha=0.1,
    metric='precomputed',
    cluster_selection_method='leaf',
    min_samples=10,
    min_cluster_size=700,
    core_dist_n_jobs=-1,
    allow_single_cluster=True,
    cluster_selection_epsilon=0.9,
)
hdbscan_clusterer=hdbscan.HDBSCAN(**hdbscan_settings)

hdbscan_clusterer.fit(distance_matrix)

# Per-entity cluster assignment and cluster-membership strength.
cluster_labels, cluster_probabilities = (
    hdbscan_clusterer.labels_,
    hdbscan_clusterer.probabilities_,
)

##### Entity Typing: 

In the following, we present our strategy for selecting entities based on their distance to the cluster centroid:
* We compute the cluster membership probabilities for all entities (cluster_probabilities). For each cluster, we select the entities with high values (>= 0.9) for labeling.
* We present the selected entities (with their RDF triples) to human experts for labeling.
* Finally, we propagate the most frequent type in each cluster to all entities of that cluster.

In [None]:
# Propagate the most frequent ground-truth type of each HDBSCAN cluster to all
# entities assigned to that cluster.
# The cluster assignments come from `cluster_labels` (hdbscan_clusterer.labels_);
# the previously referenced `y_hdbscan` is never defined in this notebook.
# NOTE(review): HDBSCAN is expected to mark noise points with the label -1; they
# form their own group here and receive that group's majority type -- confirm
# that this is intended.
df_tmp = pd.DataFrame({'pred_hdbscan': cluster_labels, 'y_all': y_all})
# Series.mode() may return several values on a tie; .iloc[0] picks the first.
pred_hdbscan = df_tmp.groupby('pred_hdbscan').transform(lambda x: x.mode().iloc[0]).to_numpy().reshape(-1)

In [None]:
# t-SNE visualization of entity typing.
# The 2-D projection, label ids and colours are computed once here and reused
# by the plots of the other clustering methods further down.
X_2d = TSNE(random_state=42).fit_transform(X_all)
label_ids = range(len(labels))
colors = [
    'tab:blue',
    'tab:orange',
    'tab:green',
    'tab:red',
    'tab:purple',
    'tab:brown',
]

plt.figure(figsize=(6, 5))

# One scatter layer per type so that each type gets its own colour and legend entry.
for type_id, color, type_name in zip(label_ids, colors, labels):
    members = pred_hdbscan == type_id
    plt.scatter(X_2d[members, 0], X_2d[members, 1], c=color, label=type_name, s=1)

plt.legend()
plt.savefig('/src/Figures/fb15k-transE-hdbscan.png', dpi=600, bbox_inches='tight', pad_inches=0)
plt.show()

In [None]:
# Evaluation of HDBSCAN clustering: compare the propagated types with the
# ground truth. Precision, recall and F1 use weighted averaging over classes.
weighted = {'average': 'weighted'}
accuracy, precision, recall, f1, purity = (
    accuracy_score(y_all, pred_hdbscan),
    precision_score(y_all, pred_hdbscan, zero_division=0, **weighted),
    recall_score(y_all, pred_hdbscan, **weighted),
    f1_score(y_all, pred_hdbscan, **weighted),
    evaluator.purity_score(y_true=y_all, y_pred=pred_hdbscan),
)

# Append the scores to the results file of this method.
save_results(method='hdbscan', list_of_results=[accuracy, precision, recall, f1, purity])

#### K-Means:

In [None]:
from sklearn.cluster import KMeans

%%time
kmeans = KMeans(n_clusters=len(top_types), random_state=35).fit(X_all)
pred_kmeans = kmeans.predict(X_all)


##### Entity Typing: 
- Similar to hdbscan, we repeat the same evaluation of entity typing for K-means

In [None]:
# Majority voting per cluster: every entity takes the most frequent
# ground-truth type found in its K-means cluster.
df_tmp = pd.DataFrame({'pred_Kmeans': pred_kmeans, 'y_all': y_all})
majority_by_cluster = df_tmp.groupby('pred_Kmeans')['y_all'].agg(lambda votes: votes.mode().iloc[0])
# From here on pred_kmeans holds propagated type ids instead of raw cluster ids.
pred_kmeans = df_tmp['pred_Kmeans'].map(majority_by_cluster).to_numpy()

In [None]:
# t-SNE visualization of the K-means entity typing
# (reuses X_2d, label_ids and colors from the HDBSCAN plot).
plt.figure(figsize=(6, 5))

# One scatter layer per type so that each type gets its own colour and legend entry.
for type_id, color, type_name in zip(label_ids, colors, labels):
    members = pred_kmeans == type_id
    plt.scatter(X_2d[members, 0], X_2d[members, 1], c=color, label=type_name, s=1)

plt.legend()
plt.savefig('/src/Figures/fb15k-transE-Kmeans.png', dpi=600, bbox_inches='tight', pad_inches=0)
plt.show()

In [None]:
# Evaluation of K-means clustering: compare the propagated types with the
# ground truth. Precision, recall and F1 use weighted averaging over classes.
weighted = {'average': 'weighted'}
accuracy, precision, recall, f1, purity = (
    accuracy_score(y_all, pred_kmeans),
    precision_score(y_all, pred_kmeans, zero_division=0, **weighted),
    recall_score(y_all, pred_kmeans, **weighted),
    f1_score(y_all, pred_kmeans, **weighted),
    evaluator.purity_score(y_true=y_all, y_pred=pred_kmeans),
)

# Append the scores to the results file of this method.
save_results(method='kmeans', list_of_results=[accuracy, precision, recall, f1, purity])

#### Agglomerative:

In [None]:
from sklearn.cluster import AgglomerativeClustering

%%time
aggClustering = AgglomerativeClustering(n_clusters=len(top_types))
y_aggClustering=aggClustering.fit_predict(X_all)

##### Entity Typing: 
- Similar to hdbscan, we repeat the same evaluation of entity typing for Agglomerative

In [None]:
# Majority voting per cluster: every entity takes the most frequent
# ground-truth type found in its agglomerative cluster.
df_tmp = pd.DataFrame({'pred_agglom': y_aggClustering, 'y_all': y_all})
majority_by_cluster = df_tmp.groupby('pred_agglom')['y_all'].agg(lambda votes: votes.mode().iloc[0])
pred_agglom = df_tmp['pred_agglom'].map(majority_by_cluster).to_numpy()

In [None]:
# t-SNE visualization of the agglomerative entity typing
# (reuses X_2d, label_ids and colors from the HDBSCAN plot).
plt.figure(figsize=(6, 5))

for i, c, label in zip(label_ids, colors, labels):
    # Both coordinates must be selected with the same (agglomerative) mask;
    # mixing in the K-means mask pairs x and y values of different entities.
    members = pred_agglom == i
    plt.scatter(X_2d[members, 0], X_2d[members, 1], c=c, label=label, s=1)

plt.legend()    
plt.savefig('/src/Figures/fb15k-transE-Agglomerative.png', dpi=600, bbox_inches='tight',pad_inches=0)
plt.show()

In [None]:
# Evaluation of agglomerative clustering: compare the propagated types with the
# ground truth. Precision, recall and F1 use weighted averaging over classes.
weighted = {'average': 'weighted'}
accuracy, precision, recall, f1, purity = (
    accuracy_score(y_all, pred_agglom),
    precision_score(y_all, pred_agglom, zero_division=0, **weighted),
    recall_score(y_all, pred_agglom, **weighted),
    f1_score(y_all, pred_agglom, **weighted),
    evaluator.purity_score(y_true=y_all, y_pred=pred_agglom),
)

# Append the scores to the results file of this method.
save_results(method='Agglomerative', list_of_results=[accuracy, precision, recall, f1, purity])