In [1]:
!cp "/content/drive/MyDrive/Ecole/Internship/data/aikg.zip" "/content/"
!unzip aikg.zip

Archive:  aikg.zip
  inflating: aikg.ttl                


In [2]:
# Retrieving relations from ttl file
!grep rdf:predicate aikg.ttl -A 1 -B 1 > aikg_relations.txt

In [1]:
# Preview the first 10 lines of the extracted file to check the line layout
# (object / predicate / subject, groups separated by "--").
!head -n 10 aikg_relations.txt

    rdf:object aikg:information_overload ;
    rdf:predicate aikg-ont:supportsOtherEntity ;
    rdf:subject aikg:recommender_system ;
--
    rdf:object aikg:km ;
    rdf:predicate aikg-ont:taskSupportedBy ;
    rdf:subject aikg:knowledge_sharing ;
--
    rdf:object aikg:boosting ;
    rdf:predicate aikg-ont:methodUsedBy ;


In [1]:
objects = []
subjects = []
predicates = []

# Each grep group is expected to hold one rdf:object, one rdf:predicate and one
# rdf:subject line. Lines are routed by the property name they start with
# instead of by their position in the file, and a triple is only recorded once
# all three parts have been seen. A short or unexpected group is therefore
# dropped on its own instead of shifting every following triple by one slot.
triple_columns = {
    'rdf:object': objects,
    'rdf:predicate': predicates,
    'rdf:subject': subjects,
}
pending_triple = {}

with open('aikg_relations.txt', 'r') as fp:
    for line in fp:
        if line.startswith('--'):
            # grep's group separator: forget any incomplete triple.
            pending_triple = {}
            continue

        tokens = line.split()
        field = tokens[0] if tokens else None
        if field not in triple_columns:
            # Blank line or a context line that is not part of a triple.
            continue

        # Keep the text after the second ':' and drop the 3-character tail
        # (space, ';' or '.', newline). Values written as full IRIs keep their
        # '//host/path#name>' remainder; the hypernym rename further down
        # matches on exactly that string, so the extraction is left as is.
        pending_triple[field] = line.split(':')[2][:-3]

        if len(pending_triple) == len(triple_columns):
            for part, column in triple_columns.items():
                column.append(pending_triple[part])
            pending_triple = {}

In [2]:
# Sanity check: one entry is appended to each list per triple, so the three
# lengths should be identical.
len(objects), len(subjects), len(predicates)

(2235820, 2235820, 2235820)

In [3]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

In [4]:
# Assemble the parsed triples into a single frame, one column per component.
df = pd.DataFrame(dict(object=objects, subject=subjects, predicate=predicates))
df.head()

Unnamed: 0,object,subject,predicate
0,information_overload,recommender_system,supportsOtherEntity
1,km,knowledge_sharing,taskSupportedBy
2,boosting,adaboost,methodUsedBy
3,image_segmentation,object_recognition,otherEntityUsedBy
4,knowledge_representation,semantic_network,methodUsedBy


In [5]:
# Frequency of every predicate among the parsed triples.
df['predicate'].value_counts()

methodUsedBy                                460724
usesMethod                                  460724
usesOtherEntity                             241004
includesOtherEntity                         144755
otherEntityUsedBy                           136310
OtherEntityIncludedBy                       113678
//www.w3.org/2004/02/skos/core#narrower>    107812
//www.w3.org/2004/02/skos/core#broader>     107812
OtherEntityUsedBy                           104694
materialUsedBy                               41075
usesMaterial                                 41075
includesMethod                               34616
methodIncludedBy                             30332
OtherEntityPredictedBy                       29382
taskUsedBy                                   22341
usesTask                                     22341
methodEvaluatedBy                            18622
evaluatesMethod                              17954
includesMaterial                             14317
materialIncludedBy             

In [6]:
# Rename hypernym values. The skos "broader" predicate is stored as the tail of
# its IRI because the parser keeps only the text after the second ':'.
df.loc[df['predicate']=='//www.w3.org/2004/02/skos/core#broader>', 'predicate'] = 'hypernym'

# Filter only right predicates
# NOTE(review): the data contains both 'otherEntityUsedBy' and
# 'OtherEntityUsedBy'; only the lower-case spelling is kept here — confirm that
# dropping the capitalised variant is intended.
good_predicates = ['methodUsedBy', 'otherEntityUsedBy', 'OtherEntityIncludedBy',
                   'hypernym', 'materialUsedBy', 'methodIncludedBy', 'taskUsedBy', 
                   'methodEvaluatedBy', 'materialIncludedBy', 'OtherEntityPredictedBy']
# .copy() detaches the filtered rows from the original frame, so the column
# assignments made in later cells act on a real DataFrame rather than on a
# slice (which triggers pandas' SettingWithCopyWarning).
df = df[df['predicate'].isin(good_predicates)].copy()

In [7]:
# Class distribution after keeping only the selected predicates.
df['predicate'].value_counts()

methodUsedBy              460724
otherEntityUsedBy         136310
OtherEntityIncludedBy     113678
hypernym                  107812
materialUsedBy             41075
methodIncludedBy           30332
OtherEntityPredictedBy     29382
taskUsedBy                 22341
methodEvaluatedBy          18622
materialIncludedBy         11295
Name: predicate, dtype: int64

In [8]:
def clean_word(word):
    """Turn an underscore-separated identifier into a space-separated phrase.

    Every '_' becomes exactly one ' '; nothing else is altered.
    """
    return ' '.join(word.split('_'))

In [9]:
# Space-separated versions of both entity columns (underscores replaced).
df['clean_subject'] = df['subject'].apply(clean_word)
df['clean_object'] = df['object'].apply(clean_word)

In [10]:
# Rows kept after predicate filtering, and the current number of columns.
df.shape

(971571, 5)

In [11]:
# Map each predicate name to an integer class id; `le` holds the fitted mapping.
le = LabelEncoder().fit(df['predicate'].values)
df['predicate_num'] = le.transform(df['predicate'].values)
df.head()

Unnamed: 0,object,subject,predicate,clean_subject,clean_object,predicate_num
2,boosting,adaboost,methodUsedBy,adaboost,boosting,7
3,image_segmentation,object_recognition,otherEntityUsedBy,object recognition,image segmentation,8
4,knowledge_representation,semantic_network,methodUsedBy,semantic network,knowledge representation,7
8,decision_making_process,collaborative_bi,methodUsedBy,collaborative bi,decision making process,7
10,model_oriented_formalism,safety_constraint,otherEntityUsedBy,safety constraint,model oriented formalism,8


ConceptNet Numberbatch embeddings

In [12]:
!wget https://conceptnet.s3.amazonaws.com/downloads/2019/numberbatch/numberbatch-en-19.08.txt.gz
!gunzip numberbatch-en-19.08.txt.gz

--2022-05-10 21:24:46--  https://conceptnet.s3.amazonaws.com/downloads/2019/numberbatch/numberbatch-en-19.08.txt.gz
Resolving conceptnet.s3.amazonaws.com (conceptnet.s3.amazonaws.com)... 52.217.111.188
Connecting to conceptnet.s3.amazonaws.com (conceptnet.s3.amazonaws.com)|52.217.111.188|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 325403502 (310M) [application/x-gzip]
Saving to: ‘numberbatch-en-19.08.txt.gz’


2022-05-10 21:25:06 (16.6 MB/s) - ‘numberbatch-en-19.08.txt.gz’ saved [325403502/325403502]

gzip: numberbatch-en-19.08.txt already exists; do you wish to overwrite (y or n)? ^C


In [20]:
def get_coefs(word, *arr):
    """Pair a token with its embedding vector.

    Args:
        word: the token (first field of an embedding line).
        *arr: the remaining fields, one numeric string per dimension.

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a 1-D float32 array.
    """
    return word, np.asarray(arr, dtype='float32')

def load_embeddings(path):
    """Load a space-separated text embedding file into a ``{token: vector}`` dict.

    Each data line is ``token v1 v2 ... vN``. A word2vec-style header on the
    first line (two integers: vocabulary size and dimension) is skipped so it
    does not end up in the dict as a 1-dimensional "word". Blank lines are
    ignored. The file is read as UTF-8 regardless of the platform default.

    Args:
        path: path to the uncompressed text embedding file.

    Returns:
        dict mapping each token to a 1-D float32 numpy array. If a token occurs
        more than once, the last occurrence wins.
    """
    embeddings = {}
    with open(path, encoding='utf-8') as f:
        for line_no, line in enumerate(f):
            fields = line.strip().split(' ')
            if len(fields) < 2:
                # Blank line or a token without any coefficients.
                continue
            if line_no == 0 and len(fields) == 2 and all(tok.isdigit() for tok in fields):
                # "<vocab_size> <dim>" header, not an embedding.
                continue
            word, vector = get_coefs(*fields)
            embeddings[word] = vector
    return embeddings

def apply_embed_model(model, line, dim=300):
    """Average the embeddings of the space-separated words in ``line``.

    Args:
        model: mapping from word to vector; must support ``in`` and ``[]``.
        line: phrase whose words are separated by single spaces.
        dim: length of the accumulator vector.

    Returns:
        The mean of the word vectors as a float64 array of length ``dim``, or
        ``None`` as soon as one word is not found in ``model``.
    """
    tokens = line.split(' ')
    total = np.zeros((dim,))

    for token in tokens:
        if token not in model:
            # A single out-of-vocabulary word invalidates the whole phrase.
            return None
        total += model[token]

    return total / len(tokens)

In [14]:
# Load the Numberbatch vectors into memory as a {token: vector} dict.
conceptnet_model = load_embeddings("numberbatch-en-19.08.txt")

In [15]:
# Embed both cleaned entity columns; a phrase containing an out-of-vocabulary
# word yields None, which shows up as a missing value in the frame.
for source_col, target_col in (('clean_object', 'object_cn_embed'),
                               ('clean_subject', 'subject_cn_embed')):
    df[target_col] = df[source_col].apply(
        lambda phrase: apply_embed_model(conceptnet_model, phrase)
    )

df.head()

Unnamed: 0,object,subject,predicate,clean_subject,clean_object,predicate_num,object_cn_embed,subject_cn_embed
2,boosting,adaboost,methodUsedBy,adaboost,boosting,7,"[-0.12929999828338623, -0.0052999998442828655,...",
3,image_segmentation,object_recognition,otherEntityUsedBy,object recognition,image segmentation,8,"[-0.08009999990463257, 0.06159999954979867, -0...","[-0.11924999952316284, -0.0519499983638525, -0..."
4,knowledge_representation,semantic_network,methodUsedBy,semantic network,knowledge representation,7,"[-0.09414999932050705, -0.037450000643730164, ...","[-0.056700000539422035, 0.02445000084117055, -..."
8,decision_making_process,collaborative_bi,methodUsedBy,collaborative bi,decision making process,7,"[-0.12440000226100285, -0.047600001096725464, ...","[-0.018150000367313623, 0.023049998097121716, ..."
10,model_oriented_formalism,safety_constraint,otherEntityUsedBy,safety constraint,model oriented formalism,8,"[-0.07413333406051, 0.003700000544389089, -0.1...","[-0.10555000230669975, -0.04729999974370003, -..."


In [16]:
# Missing values per column: apply_embed_model returns None whenever one word
# of the phrase is not in the embedding vocabulary.
df.isna().sum()

object                   0
subject                  0
predicate                0
clean_subject            0
clean_object             0
predicate_num            0
object_cn_embed      69280
subject_cn_embed    112295
dtype: int64

In [17]:
# Keep only the rows with no missing value in any column, i.e. the triples for
# which both entities could be embedded.
df_conceptnet = df.dropna(axis=0, how='any')

In [18]:
# Class distribution among the triples that have both embeddings.
df_conceptnet['predicate'].value_counts()

methodUsedBy              366720
otherEntityUsedBy         118836
OtherEntityIncludedBy     101854
hypernym                   80425
materialUsedBy             34870
OtherEntityPredictedBy     28809
methodIncludedBy           25433
taskUsedBy                 20477
methodEvaluatedBy          14053
materialIncludedBy         10104
Name: predicate, dtype: int64

In [19]:
# Hold out 10% of the embedded triples for validation; the fixed seed keeps the
# split reproducible.
df_conceptnet_train, df_conceptnet_valid = train_test_split(
    df_conceptnet, test_size=0.1, random_state=42,
)
(df_conceptnet_train.shape, df_conceptnet_valid.shape)

((721422, 8), (80159, 8))

In [None]:
# Stack the per-row vectors into 2-D matrices: word1 = object, word2 = subject.
word1_embeds, word2_embeds = (
    np.stack(df_conceptnet_train[column].values)
    for column in ('object_cn_embed', 'subject_cn_embed')
)

# Two feature variants: element-wise sum and side-by-side concatenation.
train_embeds = np.add(word1_embeds, word2_embeds)
train_embeds_concat = np.hstack((word1_embeds, word2_embeds))

y_train = df_conceptnet_train['predicate_num'].values

In [None]:
# Same feature construction as for the training split, on the validation rows.
# Note that word1_embeds / word2_embeds are overwritten here.
word1_embeds, word2_embeds = (
    np.stack(df_conceptnet_valid[column].values)
    for column in ('object_cn_embed', 'subject_cn_embed')
)

# Two feature variants: element-wise sum and side-by-side concatenation.
valid_embeds = np.add(word1_embeds, word2_embeds)
valid_embeds_concat = np.hstack((word1_embeds, word2_embeds))

y_valid = df_conceptnet_valid['predicate_num'].values