In [1]:
import os 
os.chdir('../../')

In [13]:
import pandas as pd

os.chdir('./Clustering')
from clustering_evaluation import ClusterPurity
evaluator=ClusterPurity()

from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt 

## Load pre-trained dbpedia embeddings using rdf model: 

In [3]:
os.chdir('../')
#--- http://data.dws.informatik.uni-mannheim.de/rdf2vec/models/DBpedia/2016-04/GlobalVectors/9_pageRank/ --
dbpedia_rdf2vec = pd.read_csv('DBpediaVecotrs200_20Shuffle.txt', sep=" ", header=None, index_col=0)

# -- sample the dataset
dbpedia_sampled= dbpedia_rdf2vec.sample(frac=0.1)
dbpedia_sampled.sample(frac=0.1)
dbpedia_sampled.shape

(887668, 200)

# Clustering DBpedia:

## Query DBpedia to retrieve entity types:

In [4]:
from rdflib import Graph

g = Graph()
g.parse("./instance_types_en.ttl", format="turtle")


# query dbpedia to get the type of entities 
def query_dbpedia(entity_uri):
    qres=g.query(""" SELECT ?o WHERE { """+entity_uri+""" ?p ?o .} """)
    for row in qres:
        return str(row.o).split("/")[-1]
    
entities_uri=dbpedia_sampled.index.values.tolist()
entities_types=[]

for index, row in  dbpedia_sampled.iterrows():
    type_="NoType" # default value    
    type_=query_dbpedia(index)        
    
    entities_types.append(type_)        
    
entities_types2=['None' if ent_type is None else ent_type for ent_type in entities_types]        

In [5]:
print (len(entities_types2))

887668


# Unsupervisied Approaches (Clustering): 

## K-Mean:

In [None]:
kmeans = KMeans(n_clusters=400, random_state=42).fit(dbpedia_sampled.values)
y_predicted=kmeans.predict(dbpedia_sampled.values)

In [15]:
# binarize string_labels in ['type']
lb=LabelEncoder()
entitiy_Types=lb.fit_transform(entities_types2)

In [12]:
#------------- Evaluation based on cluster_purity metric: -----------------#
purity_score=evaluator.purity_score(y_true=entitiy_Types, y_pred=y_predicted)
print('Clustering Purity Score: ', purity_score)

#----------- Evaluation based on Precision, Recall, Accuracy and F1-score: -------#
accuracy = accuracy_score(entitiy_Types, y_predicted)
print('Accuracy: %f' % accuracy)

precision = precision_score(entitiy_Types, y_predicted, average='macro')
print('Precision: %f' % precision)

recall = recall_score(entitiy_Types, y_predicted, average='macro')
print('Recall: %f' % recall)

f1 = f1_score(entitiy_Types, y_predicted, average='macro')
print('F1 score: %f' % f1)

Clustering Purity Score:  0.7212707904306566
Accuracy: 0.001626
Precision: 0.003165
Recall: 0.001351
F1 score: 0.000687


In [16]:
# ------------ plot confusion matrix -------------#
#labels=[0, 1]
cm= confusion_matrix(event_labels, y_predicted, event_labels, normalize='all')
ax= plt.subplot()

sns.heatmap(cm, annot=True, ax = ax, cmap="YlGnBu"); #annot=True to annotate cells

# # labels, title and ticks
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels') 
ax.set_title('Confusion Matrix') 



MemoryError: Unable to allocate 5.73 TiB for an array with shape (887668, 887668) and data type int64

---

## KNN: 

In [10]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split


In [13]:
X_train, X_test, y_train, y_test = train_test_split( dbpedia_sampled.values, entitiy_Types, test_size=0.33, random_state=42)

In [14]:

neigh = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)

y_predicted = neigh.predict(X_test)

In [15]:
#------------- Evaluation based on cluster_purity metric: -----------------#
purity_score=evaluator.purity_score(y_true=y_test, y_pred=y_predicted)
print('Clustering Purity Score: ', purity_score)

#----------- Evaluation based on Precision, Recall, Accuracy and F1-score: -------#
accuracy = accuracy_score(y_test, y_predicted)
print('Accuracy: %f' % accuracy)

precision = precision_score(y_test, y_predicted, average='macro')
print('Precision: %f' % precision)

recall = recall_score(y_test, y_predicted, average='macro')
print('Recall: %f' % recall)

f1 = f1_score(y_test, y_predicted, average='macro')
print('F1 score: %f' % f1)

Clustering Purity Score:  0.8515350031236025
Accuracy: 0.846049
Precision: 0.410845


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Recall: 0.403252
F1 score: 0.400816


---

## DBSCAN:

In [6]:
import hdbscan

In [None]:
##-- Cluster the data using HDBSCAN --### -- Consider Hyperparameter tuning later --#
clusterer = hdbscan.HDBSCAN(algorithm='best', alpha=1.3, approx_min_span_tree=True, metric='euclidean',
                            gen_min_span_tree=True, min_cluster_size=10000, min_samples=1000, cluster_selection_epsilon= 0.5, 
                            core_dist_n_jobs=4,allow_single_cluster=False).fit(dbpedia_sampled)

In [None]:
y_predicted = clusterer.labels_

In [11]:
print ('Check if the above cell finished?!')

Check if the above cell finished?!


In [16]:
#------------- Evaluation based on cluster_purity metric: -----------------#
purity_score=evaluator.purity_score(y_true=entitiy_Types, y_pred=y_predicted)
print('Clustering Purity Score: ', purity_score)

#----------- Evaluation based on Precision, Recall, Accuracy and F1-score: -------#
accuracy = accuracy_score(entitiy_Types, y_predicted)
print('Accuracy: %f' % accuracy)

precision = precision_score(entitiy_Types, y_predicted, average='macro')
print('Precision: %f' % precision)

recall = recall_score(entitiy_Types, y_predicted, average='macro')
print('Recall: %f' % recall)

f1 = f1_score(entitiy_Types, y_predicted, average='macro')
print('F1 score: %f' % f1)

Clustering Purity Score:  0.46439659872835337
Accuracy: 0.000095


  _warn_prf(average, modifier, msg_start, len(result))


Precision: 0.000008


  _warn_prf(average, modifier, msg_start, len(result))


Recall: 0.001520
F1 score: 0.000017


# Supervisied Approaches:

## Logistic Regression:

In [None]:
logistic_clf = LogisticRegression(solver='liblinear',random_state=42).fit(X_train, y_train)
y_predicted = logistic_clf.predict(X_test)

In [None]:
#------------- Evaluation based on cluster_purity metric: -----------------#
purity_score=evaluator.purity_score(y_true=event_labels, y_pred=y_predicted)
print('Clustering Purity Score: ', purity_score)

#----------- Evaluation based on Precision, Recall, Accuracy and F1-score: -------#
accuracy = accuracy_score(event_labels, y_predicted)
print('Accuracy: %f' % accuracy)

precision = precision_score(event_labels, y_predicted, average='macro')
print('Precision: %f' % precision)

recall = recall_score(event_labels, y_predicted, average='macro')
print('Recall: %f' % recall)

f1 = f1_score(event_labels, y_predicted, average='macro')
print('F1 score: %f' % f1)

## RandomForest: 

In [None]:
random_clf = RandomForestClassifier(max_depth=2, random_state=42).fit(X_train, y_train)
y_predicted= random_clf.predict(X_test)

In [None]:
#------------- Evaluation based on cluster_purity metric: -----------------#
purity_score=evaluator.purity_score(y_true=event_labels, y_pred=y_predicted)
print('Clustering Purity Score: ', purity_score)

#----------- Evaluation based on Precision, Recall, Accuracy and F1-score: -------#
accuracy = accuracy_score(event_labels, y_predicted)
print('Accuracy: %f' % accuracy)

precision = precision_score(event_labels, y_predicted, average='macro')
print('Precision: %f' % precision)

recall = recall_score(event_labels, y_predicted, average='macro')
print('Recall: %f' % recall)

f1 = f1_score(event_labels, y_predicted, average='macro')
print('F1 score: %f' % f1)

## SVM:

In [None]:
from sklearn.svm import SVC

In [None]:
svn_classifier=SVC(gamma='auto')