In [1]:
import re, pandas, pickle, argparse
import numpy as np

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from sklearn.metrics import   hamming_loss

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping

from collections import Counter

In [2]:
from skmultilearn.adapt import MLkNN
import sklearn.metrics as metrics

from skmultilearn.problem_transform import BinaryRelevance
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

## Hyper-parameters:

In [None]:
# Number of most frequent entity types kept as labels
# (the experiments use k = {3, 5, 10}).
top_types = 10

# Fraction of the data used as labeled examples
# (the experiments try 0.01, 0.1, 0.2 and 0.9).
labeled_size = 0.01

# Random seed used to reproduce the results.
seed = 42

# Dataset directory and the pre-trained ConnectE embedding file.
Path = './data/YAGO43K-ET/'
Embedding_Path = './data/YAGO43K-ET/Preprocessed Files/YAGO43K-ConnectE.txt'

## Load Pre-trained Embeddings of the YAGO43k Dataset:

In [None]:
# Loads the pre-trained ConnectE embedding file of the YAGO43K dataset.
def load_embedding_model(Embedding_Path):
    """Read entity embeddings from a tab-separated text file.

    Every non-blank line is split on tabs: field 0 is the entity identifier
    and field 1 is a Python literal (e.g. a list of floats) holding its
    embedding vector. Any further fields are ignored.

    Parameters
    ----------
    Embedding_Path : str
        Path of the embedding file to read.

    Returns
    -------
    dict
        Maps each entity identifier to its parsed embedding. If an identifier
        occurs on several lines, the last one wins.

    Raises
    ------
    ValueError, SyntaxError
        If the second field of a line is not a valid Python literal.
    IndexError
        If a non-blank line contains no tab-separated second field.
    """
    # Function-scope import so the notebook's import cell needs no change.
    import ast

    ConnectE_embedding = {}

    with open(Embedding_Path, 'r') as inf:
        for line in inf:
            if not line.strip():
                # Tolerate blank lines (e.g. an empty last line).
                continue
            lisplit = line.split('\t')
            # literal_eval only accepts literals, so a corrupted or tampered
            # file cannot execute arbitrary code the way eval() would.
            ConnectE_embedding[lisplit[0]] = ast.literal_eval(lisplit[1].strip())
    return ConnectE_embedding

In [3]:
# Load the pre-trained YAGO43K entity embeddings.
entity_embeddings = load_embedding_model(Embedding_Path)

## Lookup Entity Types in YAGO43K:
def _read_entity_type_pairs(file_path):
    """Return the tab-separated fields of every non-blank line of ``file_path``."""
    # The context manager closes the file; the original comprehension left
    # the handle open. Blank lines are skipped, which the original
    # ``filter(None, ...)`` never did because splitting always yields [''].
    with open(file_path) as type_file:
        return [line.strip('\n').split('\t') for line in type_file if line.strip()]


# `Path` is the dataset directory set in the hyper-parameter cell; the
# previously referenced name `PATH` is not defined anywhere in the notebook.
YAGO_ttrain = _read_entity_type_pairs(Path + '/YAGO43k_Entity_Type_train_clean.txt')
YAGO_ttest = _read_entity_type_pairs(Path + '/YAGO43k_Entity_Type_test_clean.txt')
YAGO_tvalid = _read_entity_type_pairs(Path + '/YAGO43k_Entity_Type_valid_clean.txt')

# The train/test/valid files are merged into one list of rows.
YAGO_types = YAGO_ttrain + YAGO_ttest + YAGO_tvalid

# Column 0 (first field of each row) becomes the index; column 1 is the second field.
Types_df = pandas.DataFrame(YAGO_types)
Types_df = Types_df.set_index([0])

# One row per entity; each cell holds the list of values collected for that
# entity (entities with multiple types).
Entities_Groups = Types_df.groupby(0).agg(lambda x: list(x))

In [None]:
def filter_entities_topTypes(embedding_vec, y_dict, top_types):
    """Keep only the entities that carry at least one of the most frequent types.

    Parameters
    ----------
    embedding_vec : dict
        Maps entity -> embedding vector. Must contain every entity that is kept.
    y_dict : dict
        Maps entity -> list of type labels (multi-label).
    top_types : int
        Number of most frequent types to keep.

    Returns
    -------
    X_all : numpy.ndarray
        Embeddings of the kept entities, one row per entity, in ``y_dict`` order.
    y_all : list of list
        For each row of ``X_all``, that entity's types restricted to the top
        types. (The previous version returned the unfiltered label lists of
        all entities, which did not line up with ``X_all``.)

    Raises
    ------
    KeyError
        If a kept entity has no entry in ``embedding_vec``.
    """
    # Count how often each type occurs over all entities. Feeding the counter
    # from a generator avoids the quadratic ``sum(list_of_lists, [])`` flatten.
    type_counts = Counter(t for ent_types in y_dict.values() for t in ent_types)

    # A set gives constant-time membership tests in the loop below.
    frequent_types = {key for key, _ in type_counts.most_common(top_types)}

    kept_vectors = []
    kept_labels = []
    for ent, ent_types in y_dict.items():
        top_only = [t for t in ent_types if t in frequent_types]
        if top_only:
            # Vectors and labels are appended together so rows stay aligned.
            kept_vectors.append(embedding_vec[ent])
            kept_labels.append(top_only)

    X_all = np.array(kept_vectors)
    return X_all, kept_labels

In [None]:
# Keep only the embedded entities that also have type annotations
# (the original note mentions 42k out of 123k YAGO entities).
embedding_vec = {}
y_dict = {}

# Plain dict entity -> list of types, taken from column 1 of the grouped
# frame. This replaces the per-entity `.loc[[ent]][1][0]` lookup, which
# depended on positional indexing of a label-indexed Series.
entity_types = Entities_Groups[1].to_dict()

for ent, vector in entity_embeddings.items():
    # The previous guard `ent in entity_embeddings` was always true inside
    # this loop; the membership that matters is whether the entity has types.
    if ent in entity_types:
        embedding_vec[ent] = vector
        y_dict[ent] = entity_types[ent]

# `embedding_vec` is the dict built above; the previously passed name
# `embedding_filter` is not defined anywhere in the notebook.
X_all, y_all = filter_entities_topTypes(embedding_vec, y_dict, top_types)

***

## Preprocess labels as one-hot

In [9]:
# Encode the per-entity type lists as a binary indicator matrix
# (one column per type).
label_encoder = preprocessing.MultiLabelBinarizer()

# `y_all` is the label list returned together with `X_all` by
# filter_entities_topTypes. The previously used `y_true_filter` only exists
# as a local variable inside that function, so it raised a NameError here.
y_encoded = label_encoder.fit_transform(y_all)

# Type names in the column order of `y_encoded`.
labels = label_encoder.classes_.tolist()

## Helper Function: Print Evaluation Results:

In [10]:
def evaluation_results(y_test, y_pred):
    """Print accuracy, precision, recall, F1 and Hamming loss for predictions.

    Precision, recall and F1 are computed with ``average='samples'``. All five
    numbers are printed on one line with three decimals, separated by " & ".

    Parameters
    ----------
    y_test : array-like
        True label indicator matrix.
    y_pred : array-like
        Predicted label indicator matrix.

    Returns
    -------
    None
        The scores are only printed.
    """
    scores = [metrics.accuracy_score(y_test, y_pred)]
    for scorer in (metrics.precision_score, metrics.recall_score, metrics.f1_score):
        scores.append(scorer(y_test, y_pred, average='samples'))
    scores.append(hamming_loss(y_test, y_pred))

    header = "Evaluation results (acc. prec. rec. F1 & Hloss)-- Examples-based Averaged:"
    row = " & ".join("{:.3f}".format(value) for value in scores)
    print(header + "\n\n" + row)

***

## Dataset Split: train-valid-test:

In [136]:
# Split the data into a labeled set A and an unlabelled pool B.
# The modelling cells below refer to these arrays as A_x / A_y / B_valid /
# B_test, so the split results are bound to exactly those names.
A_x, B_x, A_y, B_y = train_test_split(X_all, y_encoded, train_size=labeled_size, random_state=seed)

# Split pool B into validation and test sets; the validation set takes the
# same `labeled_size` fraction of B.
B_valid, B_test, y_valid, y_test = train_test_split(B_x, B_y, train_size=labeled_size, random_state=seed)

# Keep the earlier short names (labeled / unlabelled) as aliases.
x_l, x_u, y_l, y_u = A_x, B_x, A_y, B_y
x_valid, x_test = B_valid, B_test

print("Size of data: train-valid-test", A_x.shape[0], B_valid.shape[0], B_test.shape[0])

((3143, 200), (2829, 200), (25466, 200))

## Baselines: (Logistic Regressions, RandomForest, Embeddings, and DNN)

In [None]:
# Training settings shared by the DNN baseline and the student model.
BATCH_SIZE = 128
EPOCHS = 100
n_patience = 3  # epochs without val_loss improvement before training stops

# Apply the notebook-wide `seed` to NumPy and TensorFlow as well. It was
# previously passed only to the scikit-learn calls, so the Keras models were
# not seeded. (Bit-exact repeatability can still depend on hardware/backend.)
np.random.seed(seed)
tf.random.set_seed(seed)

# Stop when the validation loss stops improving and restore the best weights.
early = EarlyStopping(monitor='val_loss', mode='min', patience=n_patience, restore_best_weights=True)

### 1) DNN baseline:

In [None]:
# Baseline network: one hidden ReLU layer, then one sigmoid output per type.
DNN_model = keras.Sequential(name="DNNModel")
DNN_model.add(layers.Dense(128, activation='relu', input_shape=(X_all.shape[1],)))
DNN_model.add(layers.Dense(y_encoded.shape[1], activation='sigmoid'))
DNN_model.summary()

# Compile model
DNN_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the DNN baseline on the labeled set, early-stopping on the validation set.
DNN_model.fit(A_x, A_y, batch_size=BATCH_SIZE, epochs=EPOCHS, callbacks=[early], validation_data=(B_valid, y_valid), verbose=0)

# Predict on the test set. The original cell thresholded `DNN_pred` without
# ever assigning it, which raised a NameError.
DNN_pred = DNN_model.predict(B_test)

# Binarise the sigmoid outputs: values above 0.5 become 1, the rest 0.
DNN_pred = (DNN_pred > 0.5).astype(np.int64)

# print evaluation of DNN
evaluation_results(y_test, DNN_pred)

### 2) Logistic Regression:

In [None]:
# Logistic regression (liblinear solver) wrapped in BinaryRelevance so it can
# be fitted on the multi-label targets.
Lr_MCLF = BinaryRelevance(classifier=LogisticRegression(solver="liblinear"))
Lr_MCLF.fit(A_x, A_y)

# Predict the test set; the result is converted with .toarray() before scoring.
y_lr = Lr_MCLF.predict(B_test)
evaluation_results(y_test, y_lr.toarray())

### 3) RandomForest Baseline:

In [None]:
# Random forest (seeded with the notebook-wide `seed`) wrapped in
# BinaryRelevance so it can be fitted on the multi-label targets.
rf_MCLF = BinaryRelevance(classifier=RandomForestClassifier(random_state=seed))
rf_MCLF.fit(A_x, A_y)

# Predict the test set; the result is converted with .toarray() before scoring.
y_rf = rf_MCLF.predict(B_test)
evaluation_results(y_test, y_rf.toarray())

### 4) Embedding (KNN) Baseline:

In [None]:
# Grid of MLkNN settings to search: `k` and `s` are MLkNN's constructor
# parameters. range(1, 1) is an empty sequence, which GridSearchCV rejects as
# a parameter grid, so k is searched over {1, 2}.
# NOTE(review): widen the k range if larger neighbourhoods should be tried.
parameters = {'k': range(1, 3), 's': [0.5, 0.7, 1.0]}

# Cross-validated search using all available cores.
clf = GridSearchCV(MLkNN(), parameters, n_jobs=-1)
clf.fit(A_x, A_y)

# Predict the test set with the search result and score it.
y_KNN = clf.predict(B_test)
evaluation_results(y_test, y_KNN.toarray())

--- 

## Our Approach (Teacher-Student Algorithm)

- We use the best baseline as an initial teacher model to generate pseudo-labels for the unlabeled data
- We train the student model on the labeled and pseudo-labeled data.
- Replace the teacher model with student model and generate new pseudo-labeled data. 
- Repeat until the student model has converged.

### The structure of student model:

In [111]:
# Student network: hidden ReLU layer, dropout, then one sigmoid output per type.
studentModel = keras.Sequential(
    [
        layers.Dense(128, activation='relu', input_shape=(X_all.shape[1],)),
        layers.Dropout(rate=0.25),
        layers.Dense(y_encoded.shape[1], activation='sigmoid'),
    ],
    name="StudentModel",
)
studentModel.summary()

# Same optimizer, loss and metric as the DNN baseline.
studentModel.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [112]:
n_iterations = 5

# The DNN baseline is the initial teacher: its predictions on B_test are the
# first set of pseudo-labels.
teacher_pred = DNN_model.predict(B_test)

for iteration in range(n_iterations):

    # Each round fits the student first on the labeled data (A_x, A_y) and
    # then on the pseudo-labeled data (B_test, teacher_pred).
    for features, targets in ((A_x, A_y), (B_test, teacher_pred)):
        studentModel.fit(
            features,
            targets,
            batch_size=BATCH_SIZE,
            epochs=EPOCHS,
            callbacks=[early],
            validation_data=(B_valid, y_valid),
            verbose=0,
        )

    # The student now takes the teacher's role: its predictions become the
    # pseudo-labels for the next round.
    teacher_pred = studentModel.predict(B_test)


# Binarise the final predictions at 0.5 and evaluate them on the test labels.
teacher_pred[teacher_pred <= 0.5] = 0
teacher_pred[teacher_pred > 0.5] = 1
teacher_pred = teacher_pred.astype(np.int64)
evaluation_results(y_test, teacher_pred)